-
Notifications
You must be signed in to change notification settings - Fork 3k
/
Copy pathmfu.h
244 lines (218 loc) · 10.1 KB
/
mfu.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
#ifndef MFU_H
#define MFU_H
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if __has_include(<nvml.h>)
#define USE_NVML 1
#include <nvml.h>
#else
#define USE_NVML 0
#endif
// tied to enum PrecisionMode, in a future refactor make them the same
#define MFUH_PRECISION_FP32 0
#define MFUH_PRECISION_FP16 1
#define MFUH_PRECISION_BF16 2
#if USE_NVML
// Abort with a readable message when an NVML call fails.
// Not called directly: use the nvmlCheck() macro below, which fills in the
// call site's file and line for the report.
inline void nvml_check(nvmlReturn_t status, const char *file, int line) {
    if (status != NVML_SUCCESS) {
        // report to stderr (not stdout) so the error is not lost when output is piped
        fprintf(stderr, "[NVML ERROR] at file %s:%d:\n%s\n", file, line, nvmlErrorString(status));
        exit(EXIT_FAILURE);
    }
}  // note: no trailing ';' — a stray semicolon after a function definition is invalid at file scope
#define nvmlCheck(err) (nvml_check(err, __FILE__, __LINE__))
#endif
// Peak tensor-core throughput figures for one GPU architecture, as published
// in the NVIDIA whitepapers (values in TFLOPS, i.e. units of 1e12 — see
// get_flops_promised below), together with the reference clock and
// tensor-core count of the card those figures were quoted for.
// A value of -1.f means "no published figure / not supported at this precision".
// NOTE: field order matters — the PerfData tables below use positional initializers.
typedef struct {
float TF_32; // tensor-core performance 32 bit
float BF_16_32; // bf16 with 32 bit accumulate
float FP_16_32; // fp16 with 32 bit accumulate
float FP_16_16; // fp16 with 16 bit accumulate
float FP_8_32; // and so on
float FP_8_16;
float CLOCK; // clock frequency from the spec sheet (MHz, cf. GPUEntry.new_mhz)
float CORES; // #TCs from the spec sheet
} PerfData;
// basic default data from the nvidia whitepapers, one baseline per architecture
// initializer order: {TF_32, BF_16_32, FP_16_32, FP_16_16, FP_8_32, FP_8_16, CLOCK, CORES}
static const PerfData VOLTA = {125.0f, -1.f, 125.f, -1.f, -1.f, -1.f, 1530.f, 640.f}; // reference card: V100 (640 TCs @ 1530 MHz, cf. gpu_db)
static const PerfData AMPERE_DATACENTER = {156.f, 312.f, 312.f, 312.f, -1.f, -1.f, 1410.f, 432.f}; // reference card: A100
static const PerfData AMPERE_CONSUMER = {40.f, 80.f, 80.f, 160.f, -1.f, -1.f, 1860.f, 336.f}; // reference card: RTX 3090 Ti
static const PerfData HOPPER = {378.f, 756.f, 756.f, 756.f, 1513.f, 1513.f, 1620.f, 456.f}; // reference card: H100 PCIe
static const PerfData ADA = {82.6f, 165.2f, 165.2f, 330.3f, 330.3f, 660.6f, 2520.f, 512.f}; // reference card: RTX 4090
// One row of the GPU database below: a device-name string plus the numbers
// needed to linearly rescale an architecture baseline to this specific model.
typedef struct {
const char* name; // device name, matched exactly with strcmp (presumably as reported by the driver — confirm against callers)
const PerfData* perf_data; // architecture baseline this model scales from
float new_cores; // tensor-core count of this specific model
float new_mhz; // clock of this specific model (MHz)
} GPUEntry;
// the overrides for each specific GPU
static GPUEntry gpu_db[] = {
{"Tesla V100-SXM2-16GB", &VOLTA, 640, 1530},
{"Tesla V100-PCIE-32GB", &VOLTA, 640, 1530},
{"NVIDIA A100-PCIE-40GB", &ERE_DATACENTER, 432, 1410},
{"NVIDIA A100-PCIE-80GB", &ERE_DATACENTER, 432, 1410},
{"NVIDIA A100-SXM4-40GB", &ERE_DATACENTER, 432, 1410},
{"NVIDIA A100-SXM4-80GB", &ERE_DATACENTER, 432, 1410},
{"NVIDIA RTX A2000", &ERE_CONSUMER, 104, 1200},
{"NVIDIA RTX A4000", &ERE_CONSUMER, 192, 1560},
{"NVIDIA RTX A4500", &ERE_CONSUMER, 224, 1650},
{"NVIDIA RTX A5000", &ERE_CONSUMER, 256, 1695},
{"NVIDIA RTX A5500", &ERE_CONSUMER, 320, 1770},
{"NVIDIA RTX A6000", &ERE_CONSUMER, 336, 1800},
{"NVIDIA GeForce RTX 3090 Ti", &ERE_CONSUMER, 336, 1860},
{"NVIDIA GeForce RTX 3090", &ERE_CONSUMER, 328, 1695},
{"NVIDIA GeForce RTX 3080 Ti", &ERE_CONSUMER, 320, 1665},
{"NVIDIA GeForce RTX 3080", &ERE_CONSUMER, 272, 1710},
{"NVIDIA GeForce RTX 3070 Ti", &ERE_CONSUMER, 192, 1770},
{"NVIDIA GeForce RTX 3070", &ERE_CONSUMER, 184, 1725},
{"NVIDIA GeForce RTX 3060 Ti", &ERE_CONSUMER, 152, 1665},
{"NVIDIA GeForce RTX 3060", &ERE_CONSUMER, 112, 1777},
{"NVIDIA RTX A2000 ADA", &ADA, 88, 2130},
{"NVIDIA RTX A4000 ADA", &ADA, 192, 2175},
{"NVIDIA RTX A4500 ADA", &ADA, 224, 2580},
{"NVIDIA RTX A5000 ADA", &ADA, 400, 2550},
{"NVIDIA RTX A5880 ADA", &ADA, 440, 2460},
{"NVIDIA RTX A6000 ADA", &ADA, 568, 2505},
{"NVIDIA GeForce RTX 4090", &ADA, 512, 2520},
{"NVIDIA GeForce RTX 4080 SUPER", &ADA, 320, 2550},
{"NVIDIA GeForce RTX 4080", &ADA, 304, 2505},
{"NVIDIA GeForce RTX 4070 Ti SUPER", &ADA, 264, 2610},
{"NVIDIA GeForce RTX 4070 Ti", &ADA, 240, 2610},
{"NVIDIA GeForce RTX 4070 SUPER", &ADA, 224, 2475},
{"NVIDIA GeForce RTX 4070", &ADA, 184, 2475},
{"NVIDIA GeForce RTX 4070", &ADA, 184, 2475},
{"NVIDIA GeForce RTX 4060 Ti", &ADA, 136, 2535},
{"NVIDIA GeForce RTX 4060", &ADA, 96, 2460},
{"NVIDIA H100 PCIe", &HOPPER, 456, 1620},
{"NVIDIA H100 80GB HBM3", &HOPPER, 528, 1830}, // HBM3 = SXM5
};
float get_flops_promised(const char* device, int precision_mode) {
/*
This function is used to estimate the Model Flops Utilization (MFU)
basically we have to figure out how many flops the GPU can do per second.
Note that this is not a simple endeavor and may well go wrong! The details are tricky.
The returned value is in units of 1e12.
For the non-top models, actual performance numbers aren't that easy to find, e.g.,
here https://www.techpowerup.com/gpu-specs/rtx-a4000.c3756, does "Theoretical Performance"
seems to be without tensor cores.
So, instead we use that all these cards just use the same types of tensor cores in different
numbers and at different frequencies. Then we just need to look up these two easily accesible
numbers for all the other GPUs.
linear scaling seems to work: comparing spec sheet and calculation:
4080: 304TCs, 2505 GHz; 97.5TFlops = 165.2/512*304 /2520 * 2505
Original numbers for the top GPUS are from.
https://resources.nvidia.com/en-us-tensor-core
https://images.nvidia.com/aem-dam/Solutions/geforce/ada/nvidia-ada-gpu-architecture.pdf
*/
// validate the precision mode as one of the three possible values
if (!(precision_mode == MFUH_PRECISION_FP32 || precision_mode == MFUH_PRECISION_FP16 || precision_mode == MFUH_PRECISION_BF16)) {
fprintf(stderr, "Invalid precision mode: %d\n", precision_mode);
return -1.0f;
}
// do a linear search until you find our GPU, then calculate the flops promised
int num_gpu_entries = sizeof(gpu_db) / sizeof(gpu_db[0]);
for (int i = 0; i < num_gpu_entries; i++) {
if (strcmp(gpu_db[i].name, device) == 0) {
const PerfData* perf_data = gpu_db[i].perf_data;
// look up the default flops value for the given precision mode
float value = -1.0f;
if (precision_mode == MFUH_PRECISION_BF16) { value = perf_data->BF_16_32; }
if (precision_mode == MFUH_PRECISION_FP32) { value = perf_data->TF_32; }
if (precision_mode == MFUH_PRECISION_FP16) { value = perf_data->FP_16_32; }
// we'd get here if we're e.g. trying to use BF16 on Volta GPU or something...
if (value < 0.0f) {
fprintf(stderr, "No data for GPU %s and precision mode %d\n", device, precision_mode);
return -1.0f;
}
// adjust flops based on the specific core count and clock frequency of this GPU
float new_cores = gpu_db[i].new_cores;
float new_mhz = gpu_db[i].new_mhz;
float adjusted = value * (new_cores / perf_data->CORES) * (new_mhz / perf_data->CLOCK);
return adjusted;
}
}
return -1.0f; // ¯\_(ツ)_/¯
}
// Snapshot of the GPU's current operating state, filled in by
// get_gpu_utilization_info() below from NVML queries.
struct GPUUtilInfo {
unsigned int clock; // current SM clock (units as reported by NVML, presumably MHz — confirm against NVML docs)
unsigned int max_clock; // maximum SM clock
unsigned int power; // current power draw (NVML reports milliwatts — TODO confirm)
unsigned int power_limit; // enforced power management limit, same units as power
unsigned int fan; // fan speed as reported by nvmlDeviceGetFanSpeed
unsigned int temperature; // current GPU temperature
unsigned int temp_slowdown; // temperature threshold at which the GPU throttles
float gpu_utilization; // average of recent GPU utilization samples
float mem_utilization; // average of recent memory utilization samples
const char* throttle_reason; // static string from get_throttle_reason(); never freed by the caller
};
// lazily initialize nvml and generate a handle to the GPU
#if USE_NVML
// Returns a cached handle to GPU index 0; NVML is initialized on the first
// call only (static flag). NOTE(review): the static init is not guarded, so
// first use from multiple threads would race — appears to assume
// single-threaded first use; confirm against callers.
nvmlDevice_t nvml_get_device() {
static bool needs_init = true;
static nvmlDevice_t device;
if(needs_init) {
needs_init = false;
nvmlCheck(nvmlInit());
nvmlCheck(nvmlDeviceGetHandleByIndex_v2(0, &device));
}
return device;
}
// Convert the NVML throttle-reason bitfield into a short human-readable label.
// This is a lossy conversion — several reasons collapse into one label, and
// only the highest-priority match is reported (power > thermal > other).
const char* get_throttle_reason(unsigned long long bits) {
    const unsigned long long power_mask =
        nvmlClocksThrottleReasonSwPowerCap | nvmlClocksThrottleReasonHwPowerBrakeSlowdown;
    const unsigned long long thermal_mask =
        nvmlClocksThrottleReasonSwThermalSlowdown | nvmlClocksThrottleReasonHwThermalSlowdown;
    if (bits & power_mask)                      { return "power cap"; }
    if (bits & thermal_mask)                    { return "thermal cap"; }
    if (bits & nvmlClocksThrottleReasonAll)     { return "other cap"; }
    return "no cap";
}
// gather data for a GPUUtilInfo object
GPUUtilInfo get_gpu_utilization_info() {
GPUUtilInfo info;
nvmlDevice_t device = nvml_get_device();
// query different infos directly
nvmlCheck(nvmlDeviceGetClockInfo(device, NVML_CLOCK_SM, &info.clock));
nvmlCheck(nvmlDeviceGetMaxClockInfo(device, NVML_CLOCK_SM, &info.max_clock));
nvmlCheck(nvmlDeviceGetPowerManagementLimit(device, &info.power_limit));
nvmlCheck(nvmlDeviceGetPowerUsage(device, &info.power));
nvmlCheck(nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &info.temperature));
nvmlCheck(nvmlDeviceGetTemperatureThreshold(device, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN, &info.temp_slowdown));
unsigned long long throttle;
nvmlCheck(nvmlDeviceGetCurrentClocksThrottleReasons(device, &throttle));
info.throttle_reason = get_throttle_reason(throttle);
nvmlCheck(nvmlDeviceGetFanSpeed(device, &info.fan));
// for "utilization", we look at recorded samples. In principle, we could query the driver for how many samples
// to request, but then we'd need to dynamically allocate sufficient space. Let's just hard-code a limit of 128,
// and have no memory management required
constexpr const int BUFFER_LIMIT = 128;
nvmlSample_t buffer[BUFFER_LIMIT];
nvmlValueType_t v_type;
unsigned int sample_count = BUFFER_LIMIT;
nvmlCheck(nvmlDeviceGetSamples(device, NVML_GPU_UTILIZATION_SAMPLES, 0, &v_type, &sample_count, buffer));
float gpu_utilization = 0.f;
for(unsigned i = 0; i < sample_count; ++i) {
gpu_utilization += (float)buffer[i].sampleValue.uiVal;
}
gpu_utilization /= (float)sample_count;
// sample count may have been modified by the query above; reset back to buffer size
sample_count = BUFFER_LIMIT;
nvmlCheck(nvmlDeviceGetSamples(device, NVML_MEMORY_UTILIZATION_SAMPLES, 0, &v_type, &sample_count, buffer));
float mem_utilization = 0.f;
for(unsigned i = 0; i < sample_count; ++i) {
mem_utilization += (float)buffer[i].sampleValue.uiVal;
}
mem_utilization /= (float)sample_count;
info.gpu_utilization = gpu_utilization;
info.mem_utilization = mem_utilization;
return info;
}
#else
// Fallback when compiled without NVML: GPU state tracking is unavailable,
// so report the configuration error and abort (this function never returns).
GPUUtilInfo get_gpu_utilization_info() {
    // fixed: the message previously lacked a trailing newline
    fprintf(stderr, "Error: Compiled without nvml support. Cannot perform additional GPU state tracking.\n");
    exit(EXIT_FAILURE);
}
#endif
#endif // MFU_H