|
13 | 13 |
|
14 | 14 | #if defined(HAVE_GCC_ABI_DEMANGLE) |
15 | 15 | #include <cxxabi.h> |
16 | | -#endif // HAVE_GCC_ABI_DEMANGLE |
| 16 | +#endif // HAVE_GCC_ABI_DEMANGLE |
17 | 17 |
|
18 | 18 | #include "kp_kernel_timer.h" |
19 | 19 |
|
20 | 20 | #include <ldms/ldms.h> |
21 | 21 | #include <ldms/ldmsd_stream.h> |
22 | 22 | #include <ovis_util/util.h> |
23 | 23 |
|
// Returns a human-readable (demangled) form of `kernelName` when demangling
// support is compiled in (HAVE_GCC_ABI_DEMANGLE) and the name is a valid
// mangled symbol; otherwise returns `kernelName` unchanged.
//
// Ownership: on successful demangling the input buffer is released with
// free() and the returned pointer refers to the buffer produced by
// abi::__cxa_demangle. The input must therefore come from malloc(), and the
// caller releases whatever pointer is returned with free().
//
// Declared `inline` because this definition lives in a header: without it,
// including the header from more than one translation unit produces
// multiple-definition link errors.
inline char* demangleName(char* kernelName) {
#if defined(HAVE_GCC_ABI_DEMANGLE)
  if (NULL == kernelName) {
    return kernelName;  // nothing to demangle
  }

  int status = -1;
  char* demangledKernelName =
      abi::__cxa_demangle(kernelName, NULL, NULL, &status);

  // Only give up the original buffer once a replacement is really in hand.
  if (status == 0 && NULL != demangledKernelName) {
    free(kernelName);
    kernelName = demangledKernelName;
  }
#endif  // HAVE_GCC_ABI_DEMANGLE
  return kernelName;
}
36 | 36 |
|
// Category of the construct a KernelPerformanceInfo record describes.
// The numeric value is written verbatim as the "type" field of the published
// JSON record, so the values are spelled out explicitly and must stay stable.
enum KernelExecutionType {
  PARALLEL_FOR    = 0,
  PARALLEL_REDUCE = 1,
  PARALLEL_SCAN   = 2,
  REGION          = 3
};
43 | 43 |
|
// Running count of samples recorded through
// KernelPerformanceInfo::incrementCount(), summed over every instance in this
// translation unit (internal linkage: each including TU gets its own copy).
static uint64_t kernel_ex = 0;

// Running sum of every duration handed to KernelPerformanceInfo::addTime(),
// across all instances in this translation unit.
static double total_time = 0.0;
46 | 46 |
|
47 | 47 | class KernelPerformanceInfo { |
48 | | - public: |
49 | | - |
50 | | - KernelPerformanceInfo(std::string kName, KernelExecutionType kernelType, ldms_t* the_ldms, |
51 | | - const char* node_name, |
52 | | - const int rank_no, |
53 | | - const int job_id, |
54 | | - const double job_start, |
55 | | - const uint64_t job_epoch_start, |
56 | | - const uint16_t kernel_nest_level, |
57 | | - const int tool_verbosity, |
58 | | - bool* ldms_global_publish): |
59 | | - kType(kernelType), ldms(the_ldms), |
60 | | - nodename(node_name), rank(rank_no), |
61 | | - jobid(job_id), jobStartTime(job_start), |
62 | | - jobStartEpochTimeMS(job_epoch_start), |
63 | | - nestingLevel(kernel_nest_level), |
64 | | - verbosity(tool_verbosity), |
65 | | - ldms_publish(ldms_global_publish) { |
66 | | - |
67 | | - kernelName = (char*) malloc(sizeof(char) * (kName.size() + 1)); |
68 | | - strcpy(kernelName, kName.c_str()); |
69 | | - |
70 | | - callCount = 0; |
71 | | - |
72 | | - const char* tool_sample_rate = getenv("KOKKOS_SAMPLER_RATE"); |
73 | | - kernelSampleRate = 0; |
74 | | - |
75 | | - if (NULL != tool_sample_rate) { |
76 | | - kernelSampleRate = atoi(tool_sample_rate); |
77 | | - } else { |
78 | | - kernelSampleRate = 1; |
79 | | - } |
80 | | - } |
81 | | - |
82 | | - ~KernelPerformanceInfo() { |
83 | | - free(kernelName); |
84 | | - } |
85 | | - |
86 | | - KernelExecutionType getKernelType() { |
87 | | - return kType; |
88 | | - } |
89 | | - |
90 | | - void incrementCount() { |
91 | | - callCount++; |
92 | | - kernel_ex++; |
93 | | - } |
94 | | - |
95 | | - void addTime(double t) { |
96 | | - timeSq += (t*t); |
97 | | - total_time += t; |
98 | | - } |
99 | | - |
100 | | - void addFromTimer() { |
101 | | - const double now = seconds(); |
102 | | - const double sample_time = now - startTime; |
103 | | - addTime(sample_time); |
104 | | - incrementCount(); |
105 | | - |
106 | | - if( (*ldms_publish) ) { |
107 | | - const int buffer_size = (NULL == kernelName) ? 4096 : |
108 | | - ( strlen(kernelName) > 3072 ? 2048 + strlen(kernelName) : 4096 ); |
109 | | - |
110 | | - char* big_buffer = (char*) malloc( sizeof(char) * buffer_size ); |
111 | | - |
112 | | - double epoch_stamp = (double) jobStartEpochTimeMS; |
113 | | - epoch_stamp += static_cast<double>( now - jobStartTime ) * 1000.0; |
114 | | - epoch_stamp = epoch_stamp / 1000.0; |
115 | | - |
116 | | - snprintf( big_buffer, buffer_size, "{ \"job-id\" : %d, \"node-name\" : \"%s\", \"rank\" : %d, \"timestamp\" : \"%.6f\", \"kokkos-perf-data\" : [ { \"name\" : \"%s\", \"type\" : %d, \"current-kernel-count\" : %llu, \"total-kernel-count\" : %llu, \"level\" : %u, \"current-kernel-time\" : %.9f, \"total-kernel-time\" : %.9f } ] }\n", |
117 | | - jobid, nodename, rank, epoch_stamp, |
118 | | - (NULL==kernelName) ? "" : kernelName, |
119 | | - (int) kType, callCount, kernel_ex * kernelSampleRate, nestingLevel, sample_time, total_time ); |
120 | | - |
121 | | - if( verbosity > 0 ) { |
122 | | - printf("%s", big_buffer); |
123 | | - } |
124 | | - |
125 | | - int rc = ldmsd_stream_publish( (*ldms), "kokkos-perf-data", LDMSD_STREAM_JSON, |
126 | | - big_buffer, strlen(big_buffer) + 1); |
127 | | - |
128 | | - //int rc = ldmsd_stream_publish( (*ldms), "kokkos-perf-data", LDMSD_STREAM_JSON, big_buffer, strlen(big_buffer) + 1); |
129 | | - // always check your return codes :p |
130 | | - free( big_buffer ); |
131 | | - } |
132 | | - } |
133 | | - |
134 | | - void startTimer() { |
135 | | - startTime = seconds(); |
136 | | - } |
137 | | - |
138 | | - uint64_t getCallCount() { |
139 | | - return callCount; |
140 | | - } |
141 | | - |
142 | | - double getTime() { |
143 | | - return time; |
144 | | - } |
145 | | - |
146 | | - double getTimeSq() { |
147 | | - return timeSq; |
148 | | - } |
149 | | - |
150 | | - char* getName() { |
151 | | - return kernelName; |
152 | | - } |
153 | | - |
154 | | - |
155 | | - private: |
156 | | - char* kernelName; |
157 | | - uint64_t callCount; |
158 | | - double time; |
159 | | - double timeSq; |
160 | | - double startTime; |
161 | | - double jobStartTime; |
162 | | - |
163 | | - uint64_t kernelSampleRate; |
164 | | - |
165 | | - KernelExecutionType kType; |
166 | | - ldms_t* ldms; |
167 | | - |
168 | | - bool* ldms_publish; |
169 | | - const uint16_t nestingLevel; |
170 | | - const char* nodename; |
171 | | - const int rank; |
172 | | - const int jobid; |
173 | | - const int verbosity; |
174 | | - const uint64_t jobStartEpochTimeMS; |
| 48 | + public: |
| 49 | + KernelPerformanceInfo(std::string kName, KernelExecutionType kernelType, |
| 50 | + ldms_t* the_ldms, const char* node_name, |
| 51 | + const int rank_no, const int job_id, |
| 52 | + const double job_start, const uint64_t job_epoch_start, |
| 53 | + const uint16_t kernel_nest_level, |
| 54 | + const int tool_verbosity, bool* ldms_global_publish) |
| 55 | + : kType(kernelType), |
| 56 | + ldms(the_ldms), |
| 57 | + nodename(node_name), |
| 58 | + rank(rank_no), |
| 59 | + jobid(job_id), |
| 60 | + jobStartTime(job_start), |
| 61 | + jobStartEpochTimeMS(job_epoch_start), |
| 62 | + nestingLevel(kernel_nest_level), |
| 63 | + verbosity(tool_verbosity), |
| 64 | + ldms_publish(ldms_global_publish) { |
| 65 | + kernelName = (char*)malloc(sizeof(char) * (kName.size() + 1)); |
| 66 | + strcpy(kernelName, kName.c_str()); |
| 67 | + |
| 68 | + callCount = 0; |
| 69 | + |
| 70 | + const char* tool_sample_rate = getenv("KOKKOS_SAMPLER_RATE"); |
| 71 | + kernelSampleRate = 0; |
| 72 | + |
| 73 | + if (NULL != tool_sample_rate) { |
| 74 | + kernelSampleRate = atoi(tool_sample_rate); |
| 75 | + } else { |
| 76 | + kernelSampleRate = 1; |
| 77 | + } |
| 78 | + } |
| 79 | + |
| 80 | + ~KernelPerformanceInfo() { free(kernelName); } |
| 81 | + |
| 82 | + KernelExecutionType getKernelType() { return kType; } |
| 83 | + |
| 84 | + void incrementCount() { |
| 85 | + callCount++; |
| 86 | + kernel_ex++; |
| 87 | + } |
| 88 | + |
| 89 | + void addTime(double t) { |
| 90 | + timeSq += (t * t); |
| 91 | + total_time += t; |
| 92 | + } |
| 93 | + |
| 94 | + void addFromTimer() { |
| 95 | + const double now = seconds(); |
| 96 | + const double sample_time = now - startTime; |
| 97 | + addTime(sample_time); |
| 98 | + incrementCount(); |
| 99 | + |
| 100 | + if ((*ldms_publish)) { |
| 101 | + const int buffer_size = |
| 102 | + (NULL == kernelName) |
| 103 | + ? 4096 |
| 104 | + : (strlen(kernelName) > 3072 ? 2048 + strlen(kernelName) : 4096); |
| 105 | + |
| 106 | + char* big_buffer = (char*)malloc(sizeof(char) * buffer_size); |
| 107 | + |
| 108 | + double epoch_stamp = (double)jobStartEpochTimeMS; |
| 109 | + epoch_stamp += static_cast<double>(now - jobStartTime) * 1000.0; |
| 110 | + epoch_stamp = epoch_stamp / 1000.0; |
| 111 | + |
| 112 | + snprintf(big_buffer, buffer_size, |
| 113 | + "{ \"job-id\" : %d, \"node-name\" : \"%s\", \"rank\" : %d, " |
| 114 | + "\"timestamp\" : \"%.6f\", \"kokkos-perf-data\" : [ { \"name\" " |
| 115 | + ": \"%s\", \"type\" : %d, \"current-kernel-count\" : %llu, " |
| 116 | + "\"total-kernel-count\" : %llu, \"level\" : %u, " |
| 117 | + "\"current-kernel-time\" : %.9f, \"total-kernel-time\" : %.9f } " |
| 118 | + "] }\n", |
| 119 | + jobid, nodename, rank, epoch_stamp, |
| 120 | + (NULL == kernelName) ? "" : kernelName, (int)kType, callCount, |
| 121 | + kernel_ex * kernelSampleRate, nestingLevel, sample_time, |
| 122 | + total_time); |
| 123 | + |
| 124 | + if (verbosity > 0) { |
| 125 | + printf("%s", big_buffer); |
| 126 | + } |
| 127 | + |
| 128 | + int rc = |
| 129 | + ldmsd_stream_publish((*ldms), "kokkos-perf-data", LDMSD_STREAM_JSON, |
| 130 | + big_buffer, strlen(big_buffer) + 1); |
| 131 | + |
| 132 | + // int rc = ldmsd_stream_publish( (*ldms), "kokkos-perf-data", |
| 133 | + // LDMSD_STREAM_JSON, big_buffer, strlen(big_buffer) + 1); |
| 134 | + // always check your return codes :p |
| 135 | + free(big_buffer); |
| 136 | + } |
| 137 | + } |
| 138 | + |
| 139 | + void startTimer() { startTime = seconds(); } |
| 140 | + |
| 141 | + uint64_t getCallCount() { return callCount; } |
| 142 | + |
| 143 | + double getTime() { return time; } |
| 144 | + |
| 145 | + double getTimeSq() { return timeSq; } |
| 146 | + |
| 147 | + char* getName() { return kernelName; } |
| 148 | + |
| 149 | + private: |
| 150 | + char* kernelName; |
| 151 | + uint64_t callCount; |
| 152 | + double time; |
| 153 | + double timeSq; |
| 154 | + double startTime; |
| 155 | + double jobStartTime; |
| 156 | + |
| 157 | + uint64_t kernelSampleRate; |
| 158 | + |
| 159 | + KernelExecutionType kType; |
| 160 | + ldms_t* ldms; |
| 161 | + |
| 162 | + bool* ldms_publish; |
| 163 | + const uint16_t nestingLevel; |
| 164 | + const char* nodename; |
| 165 | + const int rank; |
| 166 | + const int jobid; |
| 167 | + const int verbosity; |
| 168 | + const uint64_t jobStartEpochTimeMS; |
175 | 169 | }; |
176 | 170 |
|
177 | 171 | #endif |
0 commit comments