LLVM OpenMP* Runtime Library
kmp_stats.h
1#ifndef KMP_STATS_H
2#define KMP_STATS_H
3
8//===----------------------------------------------------------------------===//
9//
10// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
11// See https://llvm.org/LICENSE.txt for license information.
12// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
13//
14//===----------------------------------------------------------------------===//
15
16#include "kmp_config.h"
17#include "kmp_debug.h"
18
19#if KMP_STATS_ENABLED
20/* Statistics accumulator.
21 Accumulates number of samples and computes min, max, mean, standard deviation
22 on the fly.
23
24 Online variance calculation algorithm from
25 http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
26 */
27
28#include "kmp_stats_timing.h"
29#include <limits>
30#include <math.h>
31#include <new> // placement new
32#include <stdint.h>
33#include <string>
34#include <vector>
35
36/* Enable developer statistics here if you want them. They are more detailed
37 than is useful for application characterisation and are intended for the
38 runtime library developer. */
39#define KMP_DEVELOPER_STATS 0
40
41/* Enable/Disable histogram output */
42#define KMP_STATS_HIST 0
43
50 noTotal = 1 << 0,
51 onlyInMaster = 1 << 1,
52 noUnits = 1 << 2,
53 notInMaster = 1 << 3,
54 logEvent = 1 << 4
56};
57
64 IDLE,
65 SERIAL_REGION,
66 FORK_JOIN_BARRIER,
67 PLAIN_BARRIER,
68 TASKWAIT,
69 TASKYIELD,
70 TASKGROUP,
71 IMPLICIT_TASK,
72 EXPLICIT_TASK,
73 TEAMS_REGION
74};
75
94// clang-format off
95#define KMP_FOREACH_COUNTER(macro, arg) \
96 macro(OMP_PARALLEL,stats_flags_e::onlyInMaster|stats_flags_e::noTotal,arg) \
97 macro(OMP_NESTED_PARALLEL, 0, arg) \
98 macro(OMP_LOOP_STATIC, 0, arg) \
99 macro(OMP_LOOP_STATIC_STEAL, 0, arg) \
100 macro(OMP_LOOP_DYNAMIC, 0, arg) \
101 macro(OMP_DISTRIBUTE, 0, arg) \
102 macro(OMP_BARRIER, 0, arg) \
103 macro(OMP_CRITICAL, 0, arg) \
104 macro(OMP_SINGLE, 0, arg) \
105 macro(OMP_MASTER, 0, arg) \
106 macro(OMP_MASKED, 0, arg) \
107 macro(OMP_TEAMS, 0, arg) \
108 macro(OMP_set_lock, 0, arg) \
109 macro(OMP_test_lock, 0, arg) \
110 macro(REDUCE_wait, 0, arg) \
111 macro(REDUCE_nowait, 0, arg) \
112 macro(OMP_TASKYIELD, 0, arg) \
113 macro(OMP_TASKLOOP, 0, arg) \
114 macro(TASK_executed, 0, arg) \
115 macro(TASK_cancelled, 0, arg) \
116 macro(TASK_stolen, 0, arg)
117// clang-format on
118
137// clang-format off
138#define KMP_FOREACH_TIMER(macro, arg) \
139 macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg) \
140 macro (OMP_parallel, stats_flags_e::logEvent, arg) \
141 macro (OMP_parallel_overhead, stats_flags_e::logEvent, arg) \
142 macro (OMP_teams, stats_flags_e::logEvent, arg) \
143 macro (OMP_teams_overhead, stats_flags_e::logEvent, arg) \
144 macro (OMP_loop_static, 0, arg) \
145 macro (OMP_loop_static_scheduling, 0, arg) \
146 macro (OMP_loop_dynamic, 0, arg) \
147 macro (OMP_loop_dynamic_scheduling, 0, arg) \
148 macro (OMP_distribute, 0, arg) \
149 macro (OMP_distribute_scheduling, 0, arg) \
150 macro (OMP_critical, 0, arg) \
151 macro (OMP_critical_wait, 0, arg) \
152 macro (OMP_single, 0, arg) \
153 macro (OMP_master, 0, arg) \
154 macro (OMP_masked, 0, arg) \
155 macro (OMP_task_immediate, 0, arg) \
156 macro (OMP_task_taskwait, 0, arg) \
157 macro (OMP_task_taskyield, 0, arg) \
158 macro (OMP_task_taskgroup, 0, arg) \
159 macro (OMP_task_join_bar, 0, arg) \
160 macro (OMP_task_plain_bar, 0, arg) \
161 macro (OMP_taskloop_scheduling, 0, arg) \
162 macro (OMP_plain_barrier, stats_flags_e::logEvent, arg) \
163 macro (OMP_idle, stats_flags_e::logEvent, arg) \
164 macro (OMP_fork_barrier, stats_flags_e::logEvent, arg) \
165 macro (OMP_join_barrier, stats_flags_e::logEvent, arg) \
166 macro (OMP_serial, stats_flags_e::logEvent, arg) \
167 macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal, \
168 arg) \
169 macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal, \
170 arg) \
171 macro (OMP_loop_static_iterations, \
172 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
173 macro (OMP_loop_static_total_iterations, \
174 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
175 macro (OMP_loop_dynamic_iterations, \
176 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
177 macro (OMP_loop_dynamic_total_iterations, \
178 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
179 macro (OMP_distribute_iterations, \
180 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
181 KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
182// clang-format on
183
184// OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either
185// initializing OpenMP or being created by a primary
186// thread) until the thread is destroyed
187// OMP_parallel -- Time thread spends executing work directly
188// within a #pragma omp parallel
189// OMP_parallel_overhead -- Time thread spends setting up a parallel region
190// OMP_loop_static -- Time thread spends executing loop iterations from
191// a statically scheduled loop
192// OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations
193// from a statically scheduled loop
194// OMP_loop_dynamic -- Time thread spends executing loop iterations from
195// a dynamically scheduled loop
196// OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations
197// from a dynamically scheduled loop
198// OMP_critical -- Time thread spends executing critical section
199// OMP_critical_wait -- Time thread spends waiting to enter
200// a critical section
201// OMP_single -- Time spent executing a "single" region
202// OMP_master -- Time spent executing a "master" region
203// OMP_masked -- Time spent executing a "masked" region
204// OMP_task_immediate -- Time spent executing non-deferred tasks
205// OMP_task_taskwait -- Time spent executing tasks inside a taskwait
206// construct
207// OMP_task_taskyield -- Time spent executing tasks inside a taskyield
208// construct
209// OMP_task_taskgroup -- Time spent executing tasks inside a taskygroup
210// construct
211// OMP_task_join_bar -- Time spent executing tasks inside a join barrier
212// OMP_task_plain_bar -- Time spent executing tasks inside a barrier
213// construct
214// OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop
215// construct
216// OMP_plain_barrier -- Time spent in a #pragma omp barrier construct or
217// inside implicit barrier at end of worksharing
218// construct
219// OMP_idle -- Time worker threads spend waiting for next
220// parallel region
221// OMP_fork_barrier -- Time spent in a the fork barrier surrounding a
222// parallel region
223// OMP_join_barrier -- Time spent in a the join barrier surrounding a
224// parallel region
225// OMP_serial -- Time thread zero spends executing serial code
226// OMP_set_numthreads -- Values passed to omp_set_num_threads
227// OMP_PARALLEL_args -- Number of arguments passed to a parallel region
228// OMP_loop_static_iterations -- Number of iterations thread is assigned for
229// statically scheduled loops
230// OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for
231// dynamically scheduled loops
232
233#if (KMP_DEVELOPER_STATS)
234// Timers which are of interest to runtime library developers, not end users.
235// These have to be explicitly enabled in addition to the other stats.
236
237// KMP_fork_barrier -- time in __kmp_fork_barrier
238// KMP_join_barrier -- time in __kmp_join_barrier
239// KMP_barrier -- time in __kmp_barrier
240// KMP_end_split_barrier -- time in __kmp_end_split_barrier
241// KMP_setup_icv_copy -- time in __kmp_setup_icv_copy
242// KMP_icv_copy -- start/stop timer for any ICV copying
243// KMP_linear_gather -- time in __kmp_linear_barrier_gather
244// KMP_linear_release -- time in __kmp_linear_barrier_release
245// KMP_tree_gather -- time in __kmp_tree_barrier_gather
246// KMP_tree_release -- time in __kmp_tree_barrier_release
247// KMP_hyper_gather -- time in __kmp_hyper_barrier_gather
248// KMP_hyper_release -- time in __kmp_hyper_barrier_release
249// clang-format off
250#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) \
251 macro(KMP_fork_call, 0, arg) \
252 macro(KMP_join_call, 0, arg) \
253 macro(KMP_end_split_barrier, 0, arg) \
254 macro(KMP_hier_gather, 0, arg) \
255 macro(KMP_hier_release, 0, arg) \
256 macro(KMP_hyper_gather, 0, arg) \
257 macro(KMP_hyper_release, 0, arg) \
258 macro(KMP_linear_gather, 0, arg) \
259 macro(KMP_linear_release, 0, arg) \
260 macro(KMP_tree_gather, 0, arg) \
261 macro(KMP_tree_release, 0, arg) \
262 macro(USER_resume, 0, arg) \
263 macro(USER_suspend, 0, arg) \
264 macro(USER_mwait, 0, arg) \
265 macro(KMP_allocate_team, 0, arg) \
266 macro(KMP_setup_icv_copy, 0, arg) \
267 macro(USER_icv_copy, 0, arg) \
268 macro (FOR_static_steal_stolen, \
269 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
270 macro (FOR_static_steal_chunks, \
271 stats_flags_e::noUnits | stats_flags_e::noTotal, arg)
272#else
273#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
274#endif
275// clang-format on
276
296#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg)
297
298#define ENUMERATE(name, ignore, prefix) prefix##name,
299enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST };
300
301enum explicit_timer_e {
302 KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST
303};
304
305enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST };
306#undef ENUMERATE
307
308/*
309 * A logarithmic histogram. It accumulates the number of values in each power of
310 * ten bin. So 1<=x<10, 10<=x<100, ...
311 * Mostly useful where we have some big outliers and want to see information
312 * about them.
313 */
314class logHistogram {
315 enum {
316 numBins = 31, /* Number of powers of 10. If this changes you need to change
317 * the initializer for binMax */
318
319 /*
320 * If you want to use this to analyse values that may be less than 1, (for
321 * instance times in s), then the logOffset gives you negative powers.
322 * In our case here, we're just looking at times in ticks, or counts, so we
323 * can never see values with magnitude < 1 (other than zero), so we can set
324 * it to 0. As above change the initializer if you change this.
325 */
326 logOffset = 0
327 };
328 uint32_t KMP_ALIGN_CACHE zeroCount;
329 struct {
330 uint32_t count;
331 double total;
332 } bins[numBins];
333
334 static double binMax[numBins];
335
336#ifdef KMP_DEBUG
337 uint64_t _total;
338
339 void check() const {
340 uint64_t t = zeroCount;
341 for (int i = 0; i < numBins; i++)
342 t += bins[i].count;
343 KMP_DEBUG_ASSERT(t == _total);
344 }
345#else
346 void check() const {}
347#endif
348
349public:
350 logHistogram() { reset(); }
351
352 logHistogram(logHistogram const &o) {
353 for (int i = 0; i < numBins; i++)
354 bins[i] = o.bins[i];
355#ifdef KMP_DEBUG
356 _total = o._total;
357#endif
358 }
359
360 void reset() {
361 zeroCount = 0;
362 for (int i = 0; i < numBins; i++) {
363 bins[i].count = 0;
364 bins[i].total = 0;
365 }
366
367#ifdef KMP_DEBUG
368 _total = 0;
369#endif
370 }
371 uint32_t count(int b) const { return bins[b + logOffset].count; }
372 double total(int b) const { return bins[b + logOffset].total; }
373 static uint32_t findBin(double sample);
374
375 logHistogram &operator+=(logHistogram const &o) {
376 zeroCount += o.zeroCount;
377 for (int i = 0; i < numBins; i++) {
378 bins[i].count += o.bins[i].count;
379 bins[i].total += o.bins[i].total;
380 }
381#ifdef KMP_DEBUG
382 _total += o._total;
383 check();
384#endif
385
386 return *this;
387 }
388
389 void addSample(double sample);
390 int minBin() const;
391 int maxBin() const;
392
393 std::string format(char) const;
394};
395
396class statistic {
397 double KMP_ALIGN_CACHE minVal;
398 double maxVal;
399 double meanVal;
400 double m2;
401 uint64_t sampleCount;
402 double offset;
403 bool collectingHist;
404 logHistogram hist;
405
406public:
407 statistic(bool doHist = bool(KMP_STATS_HIST)) {
408 reset();
409 collectingHist = doHist;
410 }
411 statistic(statistic const &o)
412 : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2),
413 sampleCount(o.sampleCount), offset(o.offset),
414 collectingHist(o.collectingHist), hist(o.hist) {}
415 statistic(double minv, double maxv, double meanv, uint64_t sc, double sd)
416 : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc),
417 sampleCount(sc), offset(0.0), collectingHist(false) {}
418 bool haveHist() const { return collectingHist; }
419 double getMin() const { return minVal; }
420 double getMean() const { return meanVal; }
421 double getMax() const { return maxVal; }
422 uint64_t getCount() const { return sampleCount; }
423 double getSD() const { return sqrt(m2 / sampleCount); }
424 double getTotal() const { return sampleCount * meanVal; }
425 logHistogram const *getHist() const { return &hist; }
426 void setOffset(double d) { offset = d; }
427
428 void reset() {
429 minVal = (std::numeric_limits<double>::max)();
430 maxVal = -minVal;
431 meanVal = 0.0;
432 m2 = 0.0;
433 sampleCount = 0;
434 offset = 0.0;
435 hist.reset();
436 }
437 void addSample(double sample);
438 void scale(double factor);
439 void scaleDown(double f) { scale(1. / f); }
440 void forceCount(uint64_t count) { sampleCount = count; }
441 statistic &operator+=(statistic const &other);
442
443 std::string format(char unit, bool total = false) const;
444 std::string formatHist(char unit) const { return hist.format(unit); }
445};
446
447struct statInfo {
448 const char *name;
449 uint32_t flags;
450};
451
452class timeStat : public statistic {
453 static statInfo timerInfo[];
454
455public:
456 timeStat() : statistic() {}
457 static const char *name(timer_e e) { return timerInfo[e].name; }
458 static bool noTotal(timer_e e) {
459 return timerInfo[e].flags & stats_flags_e::noTotal;
460 }
461 static bool masterOnly(timer_e e) {
462 return timerInfo[e].flags & stats_flags_e::onlyInMaster;
463 }
464 static bool workerOnly(timer_e e) {
465 return timerInfo[e].flags & stats_flags_e::notInMaster;
466 }
467 static bool noUnits(timer_e e) {
468 return timerInfo[e].flags & stats_flags_e::noUnits;
469 }
470 static bool logEvent(timer_e e) {
471 return timerInfo[e].flags & stats_flags_e::logEvent;
472 }
473 static void clearEventFlags() {
474 for (int i = 0; i < TIMER_LAST; i++) {
475 timerInfo[i].flags &= (~(stats_flags_e::logEvent));
476 }
477 }
478};
479
480// Where we need explicitly to start and end the timer, this version can be used
481// Since these timers normally aren't nicely scoped, so don't have a good place
482// to live on the stack of the thread, they're more work to use.
483class explicitTimer {
484 timeStat *stat;
485 timer_e timerEnumValue;
486 tsc_tick_count startTime;
487 tsc_tick_count pauseStartTime;
488 tsc_tick_count::tsc_interval_t totalPauseTime;
489
490public:
491 explicitTimer(timeStat *s, timer_e te)
492 : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0),
493 totalPauseTime() {}
494
495 // void setStat(timeStat *s) { stat = s; }
496 void start(tsc_tick_count tick);
497 void pause(tsc_tick_count tick) { pauseStartTime = tick; }
498 void resume(tsc_tick_count tick) {
499 totalPauseTime += (tick - pauseStartTime);
500 }
501 void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr);
502 void reset() {
503 startTime = 0;
504 pauseStartTime = 0;
505 totalPauseTime = 0;
506 }
507 timer_e get_type() const { return timerEnumValue; }
508};
509
510// Where you need to partition a threads clock ticks into separate states
511// e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and
512// DOING_NOTHING would render these conditions:
513// time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive
514// No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice
515// versa
516class partitionedTimers {
517private:
518 std::vector<explicitTimer> timer_stack;
519
520public:
521 partitionedTimers();
522 void init(explicitTimer timer);
523 void exchange(explicitTimer timer);
524 void push(explicitTimer timer);
525 void pop();
526 void windup();
527};
528
529// Special wrapper around the partitioned timers to aid timing code blocks
530// It avoids the need to have an explicit end, leaving the scope suffices.
531class blockPartitionedTimer {
532 partitionedTimers *part_timers;
533
534public:
535 blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer)
536 : part_timers(pt) {
537 part_timers->push(timer);
538 }
539 ~blockPartitionedTimer() { part_timers->pop(); }
540};
541
542// Special wrapper around the thread state to aid in keeping state in code
543// blocks It avoids the need to have an explicit end, leaving the scope
544// suffices.
545class blockThreadState {
546 stats_state_e *state_pointer;
547 stats_state_e old_state;
548
549public:
550 blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state)
551 : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) {
552 *state_pointer = new_state;
553 }
554 ~blockThreadState() { *state_pointer = old_state; }
555};
556
557// If all you want is a count, then you can use this...
558// The individual per-thread counts will be aggregated into a statistic at
559// program exit.
560class counter {
561 uint64_t value;
562 static const statInfo counterInfo[];
563
564public:
565 counter() : value(0) {}
566 void increment() { value++; }
567 uint64_t getValue() const { return value; }
568 void reset() { value = 0; }
569 static const char *name(counter_e e) { return counterInfo[e].name; }
570 static bool masterOnly(counter_e e) {
571 return counterInfo[e].flags & stats_flags_e::onlyInMaster;
572 }
573};
574
575/* ****************************************************************
576 Class to implement an event
577
578 There are four components to an event: start time, stop time
579 nest_level, and timer_name.
580 The start and stop time should be obvious (recorded in clock ticks).
581 The nest_level relates to the bar width in the timeline graph.
582 The timer_name is used to determine which timer event triggered this event.
583
584 the interface to this class is through four read-only operations:
585 1) getStart() -- returns the start time as 64 bit integer
586 2) getStop() -- returns the stop time as 64 bit integer
587 3) getNestLevel() -- returns the nest level of the event
588 4) getTimerName() -- returns the timer name that triggered event
589
590 *MORE ON NEST_LEVEL*
591 The nest level is used in the bar graph that represents the timeline.
592 Its main purpose is for showing how events are nested inside eachother.
593 For example, say events, A, B, and C are recorded. If the timeline
594 looks like this:
595
596Begin -------------------------------------------------------------> Time
597 | | | | | |
598 A B C C B A
599 start start start end end end
600
601 Then A, B, C will have a nest level of 1, 2, 3 respectively.
602 These values are then used to calculate the barwidth so you can
603 see that inside A, B has occurred, and inside B, C has occurred.
604 Currently, this is shown with A's bar width being larger than B's
605 bar width, and B's bar width being larger than C's bar width.
606
607**************************************************************** */
608class kmp_stats_event {
609 uint64_t start;
610 uint64_t stop;
611 int nest_level;
612 timer_e timer_name;
613
614public:
615 kmp_stats_event()
616 : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {}
617 kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme)
618 : start(strt), stop(stp), nest_level(nst), timer_name(nme) {}
619 inline uint64_t getStart() const { return start; }
620 inline uint64_t getStop() const { return stop; }
621 inline int getNestLevel() const { return nest_level; }
622 inline timer_e getTimerName() const { return timer_name; }
623};
624
625/* ****************************************************************
626 Class to implement a dynamically expandable array of events
627
628 ---------------------------------------------------------
629 | event 1 | event 2 | event 3 | event 4 | ... | event N |
630 ---------------------------------------------------------
631
632 An event is pushed onto the back of this array at every
633 explicitTimer->stop() call. The event records the thread #,
634 start time, stop time, and nest level related to the bar width.
635
636 The event vector starts at size INIT_SIZE and grows (doubles in size)
637 if needed. An implication of this behavior is that log(N)
638 reallocations are needed (where N is number of events). If you want
639 to avoid reallocations, then set INIT_SIZE to a large value.
640
641 the interface to this class is through six operations:
642 1) reset() -- sets the internal_size back to 0 but does not deallocate any
643 memory
644 2) size() -- returns the number of valid elements in the vector
645 3) push_back(start, stop, nest, timer_name) -- pushes an event onto
646 the back of the array
647 4) deallocate() -- frees all memory associated with the vector
648 5) sort() -- sorts the vector by start time
649 6) operator[index] or at(index) -- returns event reference at that index
650**************************************************************** */
651class kmp_stats_event_vector {
652 kmp_stats_event *events;
653 int internal_size;
654 int allocated_size;
655 static const int INIT_SIZE = 1024;
656
657public:
658 kmp_stats_event_vector() {
659 events =
660 (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE);
661 internal_size = 0;
662 allocated_size = INIT_SIZE;
663 }
664 ~kmp_stats_event_vector() {}
665 inline void reset() { internal_size = 0; }
666 inline int size() const { return internal_size; }
667 void push_back(uint64_t start_time, uint64_t stop_time, int nest_level,
668 timer_e name) {
669 int i;
670 if (internal_size == allocated_size) {
671 kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate(
672 sizeof(kmp_stats_event) * allocated_size * 2);
673 for (i = 0; i < internal_size; i++)
674 tmp[i] = events[i];
675 __kmp_free(events);
676 events = tmp;
677 allocated_size *= 2;
678 }
679 events[internal_size] =
680 kmp_stats_event(start_time, stop_time, nest_level, name);
681 internal_size++;
682 return;
683 }
684 void deallocate();
685 void sort();
686 const kmp_stats_event &operator[](int index) const { return events[index]; }
687 kmp_stats_event &operator[](int index) { return events[index]; }
688 const kmp_stats_event &at(int index) const { return events[index]; }
689 kmp_stats_event &at(int index) { return events[index]; }
690};
691
692/* ****************************************************************
693 Class to implement a doubly-linked, circular, statistics list
694
695 |---| ---> |---| ---> |---| ---> |---| ---> ... next
696 | | | | | | | |
697 |---| <--- |---| <--- |---| <--- |---| <--- ... prev
698 Sentinel first second third
699 Node node node node
700
701 The Sentinel Node is the user handle on the list.
702 The first node corresponds to thread 0's statistics.
703 The second node corresponds to thread 1's statistics and so on...
704
705 Each node has a _timers, _counters, and _explicitTimers array to hold that
706 thread's statistics. The _explicitTimers point to the correct _timer and
707 update its statistics at every stop() call. The explicitTimers' pointers are
708 set up in the constructor. Each node also has an event vector to hold that
709 thread's timing events. The event vector expands as necessary and records
710 the start-stop times for each timer.
711
712 The nestLevel variable is for plotting events and is related
713 to the bar width in the timeline graph.
714
715 Every thread will have a thread local pointer to its node in
716 the list. The sentinel node is used by the primary thread to
717 store "dummy" statistics before __kmp_create_worker() is called.
718**************************************************************** */
719class kmp_stats_list {
720 int gtid;
721 timeStat _timers[TIMER_LAST + 1];
722 counter _counters[COUNTER_LAST + 1];
723 explicitTimer thread_life_timer;
724 partitionedTimers _partitionedTimers;
725 int _nestLevel; // one per thread
726 kmp_stats_event_vector _event_vector;
727 kmp_stats_list *next;
728 kmp_stats_list *prev;
729 stats_state_e state;
730 int thread_is_idle_flag;
731
732public:
733 kmp_stats_list()
734 : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life],
735 TIMER_OMP_worker_thread_life),
736 _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE),
737 thread_is_idle_flag(0) {}
738 ~kmp_stats_list() {}
739 inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; }
740 inline counter *getCounter(counter_e idx) { return &_counters[idx]; }
741 inline partitionedTimers *getPartitionedTimers() {
742 return &_partitionedTimers;
743 }
744 inline timeStat *getTimers() { return _timers; }
745 inline counter *getCounters() { return _counters; }
746 inline kmp_stats_event_vector &getEventVector() { return _event_vector; }
747 inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); }
748 inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); }
749 inline void resetEventVector() { _event_vector.reset(); }
750 inline void incrementNestValue() { _nestLevel++; }
751 inline int getNestValue() { return _nestLevel; }
752 inline void decrementNestValue() { _nestLevel--; }
753 inline int getGtid() const { return gtid; }
754 inline void setGtid(int newgtid) { gtid = newgtid; }
755 inline void setState(stats_state_e newstate) { state = newstate; }
756 inline stats_state_e getState() const { return state; }
757 inline stats_state_e *getStatePointer() { return &state; }
758 inline bool isIdle() { return thread_is_idle_flag == 1; }
759 inline void setIdleFlag() { thread_is_idle_flag = 1; }
760 inline void resetIdleFlag() { thread_is_idle_flag = 0; }
761 kmp_stats_list *push_back(int gtid); // returns newly created list node
762 inline void push_event(uint64_t start_time, uint64_t stop_time,
763 int nest_level, timer_e name) {
764 _event_vector.push_back(start_time, stop_time, nest_level, name);
765 }
766 void deallocate();
767 class iterator;
768 kmp_stats_list::iterator begin();
769 kmp_stats_list::iterator end();
770 int size();
771 class iterator {
772 kmp_stats_list *ptr;
773 friend kmp_stats_list::iterator kmp_stats_list::begin();
774 friend kmp_stats_list::iterator kmp_stats_list::end();
775
776 public:
777 iterator();
778 ~iterator();
779 iterator operator++();
780 iterator operator++(int dummy);
781 iterator operator--();
782 iterator operator--(int dummy);
783 bool operator!=(const iterator &rhs);
784 bool operator==(const iterator &rhs);
785 kmp_stats_list *operator*() const; // dereference operator
786 };
787};
788
789/* ****************************************************************
790 Class to encapsulate all output functions and the environment variables
791
792 This module holds filenames for various outputs (normal stats, events, plot
793 file), as well as coloring information for the plot file.
794
795 The filenames and flags variables are read from environment variables.
796 These are read once by the constructor of the global variable
797 __kmp_stats_output which calls init().
798
799 During this init() call, event flags for the timeStat::timerInfo[] global
800 array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes).
801
802 The only interface function that is public is outputStats(heading). This
803 function should print out everything it needs to, either to files or stderr,
804 depending on the environment variables described below
805
806 ENVIRONMENT VARIABLES:
807 KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this
808 file, otherwise, print to stderr
809 KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to
810 either KMP_STATS_FILE or stderr
811 KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename,
812 otherwise, the plot file is sent to "events.plt"
813 KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log
814 events
815 KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file,
816 otherwise, output is sent to "events.dat"
817**************************************************************** */
818class kmp_stats_output_module {
819
820public:
821 struct rgb_color {
822 float r;
823 float g;
824 float b;
825 };
826
827private:
828 std::string outputFileName;
829 static const char *eventsFileName;
830 static const char *plotFileName;
831 static int printPerThreadFlag;
832 static int printPerThreadEventsFlag;
833 static const rgb_color globalColorArray[];
834 static rgb_color timerColorInfo[];
835
836 void init();
837 static void setupEventColors();
838 static void printPloticusFile();
839 static void printHeaderInfo(FILE *statsOut);
840 static void printTimerStats(FILE *statsOut, statistic const *theStats,
841 statistic const *totalStats);
842 static void printCounterStats(FILE *statsOut, statistic const *theStats);
843 static void printCounters(FILE *statsOut, counter const *theCounters);
844 static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents,
845 int gtid);
846 static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; }
847 static void windupExplicitTimers();
848 bool eventPrintingEnabled() const { return printPerThreadEventsFlag; }
849
850public:
851 kmp_stats_output_module() { init(); }
852 void outputStats(const char *heading);
853};
854
855#ifdef __cplusplus
856extern "C" {
857#endif
858void __kmp_stats_init();
859void __kmp_stats_fini();
860void __kmp_reset_stats();
861void __kmp_output_stats(const char *);
862void __kmp_accumulate_stats_at_exit(void);
863// thread local pointer to stats node within list
864extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr;
865// head to stats list.
866extern kmp_stats_list *__kmp_stats_list;
867// lock for __kmp_stats_list
868extern kmp_tas_lock_t __kmp_stats_lock;
869// reference start time
870extern tsc_tick_count __kmp_stats_start_time;
871// interface to output
872extern kmp_stats_output_module __kmp_stats_output;
873
874#ifdef __cplusplus
875}
876#endif
877
878// Simple, standard interfaces that drop out completely if stats aren't enabled
879
891#define KMP_COUNT_VALUE(name, value) \
892 __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample((double)value)
893
904#define KMP_COUNT_BLOCK(name) \
905 __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment()
906
924#define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string)
925
933#define KMP_INIT_PARTITIONED_TIMERS(name) \
934 __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer( \
935 __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
936
937#define KMP_TIME_PARTITIONED_BLOCK(name) \
938 blockPartitionedTimer __PBLOCKTIME__( \
939 __kmp_stats_thread_ptr->getPartitionedTimers(), \
940 explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name), \
941 TIMER_##name))
942
943#define KMP_PUSH_PARTITIONED_TIMER(name) \
944 __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer( \
945 __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
946
947#define KMP_POP_PARTITIONED_TIMER() \
948 __kmp_stats_thread_ptr->getPartitionedTimers()->pop()
949
950#define KMP_EXCHANGE_PARTITIONED_TIMER(name) \
951 __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer( \
952 __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
953
954#define KMP_SET_THREAD_STATE(state_name) \
955 __kmp_stats_thread_ptr->setState(state_name)
956
957#define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState()
958
959#define KMP_SET_THREAD_STATE_BLOCK(state_name) \
960 blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \
961 state_name)
962
970#define KMP_RESET_STATS() __kmp_reset_stats()
971
972#if (KMP_DEVELOPER_STATS)
973#define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v)
974#define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n)
975#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n)
976#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) KMP_PUSH_PARTITIONED_TIMER(n)
977#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) KMP_POP_PARTITIONED_TIMER(n)
978#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) \
979 KMP_EXCHANGE_PARTITIONED_TIMER(n)
980#else
981// Null definitions
982#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
983#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
984#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
985#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
986#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
987#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
988#endif
989
990#else // KMP_STATS_ENABLED
991
992// Null definitions
993#define KMP_COUNT_VALUE(n, v) ((void)0)
994#define KMP_COUNT_BLOCK(n) ((void)0)
995
996#define KMP_OUTPUT_STATS(heading_string) ((void)0)
997#define KMP_RESET_STATS() ((void)0)
998
999#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
1000#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
1001#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
1002#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1003#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1004#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1005#define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0)
1006#define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0)
1007#define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0)
1008#define KMP_POP_PARTITIONED_TIMER() ((void)0)
1009#define KMP_SET_THREAD_STATE(state_name) ((void)0)
1010#define KMP_GET_THREAD_STATE() ((void)0)
1011#define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0)
1012#endif // KMP_STATS_ENABLED
1013
1014#endif // KMP_STATS_H
stats_flags_e
flags to describe the statistic (timer or counter)
Definition: kmp_stats.h:49
#define KMP_FOREACH_COUNTER(macro, arg)
Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h.
Definition: kmp_stats.h:95
#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg)
Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
Definition: kmp_stats.h:296
stats_state_e
the states which a thread can be in
Definition: kmp_stats.h:63
@ notInMaster
statistic is valid only for non-primary threads
Definition: kmp_stats.h:53
@ noUnits
statistic doesn't need units printed next to it
Definition: kmp_stats.h:52
@ logEvent
Definition: kmp_stats.h:54
@ noTotal
do not show a TOTAL_aggregation for this statistic
Definition: kmp_stats.h:50
@ onlyInMaster
statistic is valid only for primary thread
Definition: kmp_stats.h:51