Skip to content

Commit ad384a0

Browse files
author
Michal Kapalka
committed
Add cgroups CPU quota and throttling metrics
Add metrics related to CPU quotas and CPU throttling (Linux CFS bandwidth control), as well as the total CPU usage from Linux cgroups CPU accounting. Those metrics can be useful in multi-tenant cloud environments, in particular on Elastic Cloud nodes that use CPU boosting (vCPU credits).
1 parent 46721e1 commit ad384a0

File tree

4 files changed

+142
-5
lines changed

4 files changed

+142
-5
lines changed

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,11 @@ Further Information
221221
| elasticsearch_jvm_memory_pool_max_bytes | counter | 3 | JVM memory max by pool |
222222
| elasticsearch_jvm_memory_pool_peak_used_bytes | counter | 3 | JVM memory peak used by pool |
223223
| elasticsearch_jvm_memory_pool_peak_max_bytes | counter | 3 | JVM memory peak max by pool |
224+
| elasticsearch_os_cgroup_cpu_cfs_period_micros | gauge | 1 | CPU period length in microseconds (Linux CFS bandwidth control) |
225+
| elasticsearch_os_cgroup_cpu_cfs_quota_micros | gauge | 1 | CPU quota per CPU period (cgroup_cfs_period_micros) in microseconds (Linux CFS bandwidth control) |
226+
| elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled | counter | 1 | Number of times the process has been throttled (Linux CFS bandwidth control) |
227+
| elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos | counter | 1 | Total time duration (in nanoseconds) for which the process has been throttled (Linux CFS bandwidth control) |
228+
| elasticsearch_os_cgroup_cpuacct_usage_nanos | counter | 1 | Total CPU usage in nanoseconds (Linux cgroups CPU accounting) |
224229
| elasticsearch_os_cpu_percent | gauge | 1 | Percent CPU used by the OS |
225230
| elasticsearch_os_load1 | gauge | 1 | Short-term load average |
226231
| elasticsearch_os_load5 | gauge | 1 | Mid-term load average |

collector/nodes.go

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,66 @@ func NewNodes(logger *slog.Logger, client *http.Client, url *url.URL, all bool,
286286
},
287287
Labels: defaultNodeLabelValues,
288288
},
289+
{
290+
Type: prometheus.GaugeValue,
291+
Desc: prometheus.NewDesc(
292+
prometheus.BuildFQName(namespace, "os", "cgroup_cpu_cfs_period_micros"),
293+
"CPU period length in microseconds (Linux CFS bandwidth control)",
294+
defaultNodeLabels, nil,
295+
),
296+
Value: func(node NodeStatsNodeResponse) float64 {
297+
return float64(node.OS.Cgroup.CPU.CfsPeriodMicros)
298+
},
299+
Labels: defaultNodeLabelValues,
300+
},
301+
{
302+
Type: prometheus.GaugeValue,
303+
Desc: prometheus.NewDesc(
304+
prometheus.BuildFQName(namespace, "os", "cgroup_cpu_cfs_quota_micros"),
305+
"CPU quota per CPU period (cgroup_cfs_period_micros) in microseconds (Linux CFS bandwidth control)",
306+
defaultNodeLabels, nil,
307+
),
308+
Value: func(node NodeStatsNodeResponse) float64 {
309+
return float64(node.OS.Cgroup.CPU.CfsQuotaMicros)
310+
},
311+
Labels: defaultNodeLabelValues,
312+
},
313+
{
314+
Type: prometheus.CounterValue,
315+
Desc: prometheus.NewDesc(
316+
prometheus.BuildFQName(namespace, "os", "cgroup_cpu_stat_number_of_times_throttled"),
317+
"Number of times the process has been throttled (Linux CFS bandwidth control)",
318+
defaultNodeLabels, nil,
319+
),
320+
Value: func(node NodeStatsNodeResponse) float64 {
321+
return float64(node.OS.Cgroup.CPU.Stat.NumberOfTimesThrottled)
322+
},
323+
Labels: defaultNodeLabelValues,
324+
},
325+
{
326+
Type: prometheus.CounterValue,
327+
Desc: prometheus.NewDesc(
328+
prometheus.BuildFQName(namespace, "os", "cgroup_cpu_stat_time_throttled_nanos"),
329+
"Total time duration (in nanoseconds) for which the process has been throttled (Linux CFS bandwidth control)",
330+
defaultNodeLabels, nil,
331+
),
332+
Value: func(node NodeStatsNodeResponse) float64 {
333+
return float64(node.OS.Cgroup.CPU.Stat.TimeThrottledNanos)
334+
},
335+
Labels: defaultNodeLabelValues,
336+
},
337+
{
338+
Type: prometheus.CounterValue,
339+
Desc: prometheus.NewDesc(
340+
prometheus.BuildFQName(namespace, "os", "cgroup_cpuacct_usage_nanos"),
341+
"Total CPU usage in nanoseconds (Linux cgroups CPU accounting)",
342+
defaultNodeLabels, nil,
343+
),
344+
Value: func(node NodeStatsNodeResponse) float64 {
345+
return float64(node.OS.Cgroup.CPUAcct.UsageNanos)
346+
},
347+
Labels: defaultNodeLabelValues,
348+
},
289349
{
290350
Type: prometheus.GaugeValue,
291351
Desc: prometheus.NewDesc(

collector/nodes_response.go

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,9 @@
1313

1414
package collector
1515

16-
import "encoding/json"
16+
import (
17+
"encoding/json"
18+
)
1719

1820
// nodeStatsResponse is a representation of an Elasticsearch Node Stats
1921
type nodeStatsResponse struct {
@@ -289,10 +291,11 @@ type NodeStatsOSResponse struct {
289291
Uptime int64 `json:"uptime_in_millis"`
290292
// LoadAvg was an array of per-cpu values pre-2.0, and is a string in 2.0
291293
// Leaving this here in case we want to implement parsing logic later
292-
LoadAvg json.RawMessage `json:"load_average"`
293-
CPU NodeStatsOSCPUResponse `json:"cpu"`
294-
Mem NodeStatsOSMemResponse `json:"mem"`
295-
Swap NodeStatsOSSwapResponse `json:"swap"`
294+
LoadAvg json.RawMessage `json:"load_average"`
295+
Cgroup NodeStatsOSCgroupResponse `json:"cgroup"`
296+
CPU NodeStatsOSCPUResponse `json:"cpu"`
297+
Mem NodeStatsOSMemResponse `json:"mem"`
298+
Swap NodeStatsOSSwapResponse `json:"swap"`
296299
}
297300

298301
// NodeStatsOSMemResponse defines node stats operating system memory usage structure
@@ -322,6 +325,30 @@ type NodeStatsOSCPULoadResponse struct {
322325
Load15 float64 `json:"15m"`
323326
}
324327

328+
// NodeStatsOSCgroupResponse defines statistics related to Linux control groups (currently only CPU-related)
329+
type NodeStatsOSCgroupResponse struct {
330+
CPU NodeStatsOSCgroupCPUResponse `json:"cpu"`
331+
CPUAcct NodeStatsOCCgroupCPUAcctResponse `json:"cpuacct"`
332+
}
333+
334+
// NodeStatsOSCgroupCPUResponse represents the current CPU quota (quota value and the corresponding period), as well as the related CPU throttling stats (Linux CFS bandwidth control)
335+
type NodeStatsOSCgroupCPUResponse struct {
336+
CfsPeriodMicros int64 `json:"cfs_period_micros"`
337+
CfsQuotaMicros int64 `json:"cfs_quota_micros"`
338+
Stat NodeStatsOSCgroupCPUStatsResponse `json:"stat"`
339+
}
340+
341+
// NodeStatsOSCgroupCPUStatsResponse represents the CPU throttling stats (Linux CFS bandwidth control)
342+
type NodeStatsOSCgroupCPUStatsResponse struct {
343+
NumberOfTimesThrottled int64 `json:"number_of_times_throttled"`
344+
TimeThrottledNanos int64 `json:"time_throttled_nanos"`
345+
}
346+
347+
// NodeStatsOCCgroupCPUAcctResponse represents the Linux control groups CPU accounting stats
348+
type NodeStatsOCCgroupCPUAcctResponse struct {
349+
UsageNanos int64 `json:"usage_nanos"`
350+
}
351+
325352
// NodeStatsProcessResponse is a representation of a process statistics, memory consumption, cpu usage, open file descriptors
326353
type NodeStatsProcessResponse struct {
327354
Timestamp int64 `json:"timestamp"`

collector/nodes_test.go

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,21 @@ func TestNodesStats(t *testing.T) {
352352
elasticsearch_nodes_roles{cluster="elasticsearch",host="127.0.0.1",name="bVrN1Hx",role="ml"} 0
353353
elasticsearch_nodes_roles{cluster="elasticsearch",host="127.0.0.1",name="bVrN1Hx",role="remote_cluster_client"} 0
354354
elasticsearch_nodes_roles{cluster="elasticsearch",host="127.0.0.1",name="bVrN1Hx",role="transform"} 0
355+
# HELP elasticsearch_os_cgroup_cpu_cfs_period_micros CPU period length in microseconds (Linux CFS bandwidth control)
356+
# TYPE elasticsearch_os_cgroup_cpu_cfs_period_micros gauge
357+
elasticsearch_os_cgroup_cpu_cfs_period_micros{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="127.0.0.1",name="bVrN1Hx"} 0
358+
# HELP elasticsearch_os_cgroup_cpu_cfs_quota_micros CPU quota per CPU period (cgroup_cfs_period_micros) in microseconds (Linux CFS bandwidth control)
359+
# TYPE elasticsearch_os_cgroup_cpu_cfs_quota_micros gauge
360+
elasticsearch_os_cgroup_cpu_cfs_quota_micros{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="127.0.0.1",name="bVrN1Hx"} 0
361+
# HELP elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled Number of times the process has been throttled (Linux CFS bandwidth control)
362+
# TYPE elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled counter
363+
elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="127.0.0.1",name="bVrN1Hx"} 0
364+
# HELP elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos Total time duration (in nanoseconds) for which the process has been throttled (Linux CFS bandwidth control)
365+
# TYPE elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos counter
366+
elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="127.0.0.1",name="bVrN1Hx"} 0
367+
# HELP elasticsearch_os_cgroup_cpuacct_usage_nanos Total CPU usage in nanoseconds (Linux cgroups CPU accounting)
368+
# TYPE elasticsearch_os_cgroup_cpuacct_usage_nanos counter
369+
elasticsearch_os_cgroup_cpuacct_usage_nanos{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="127.0.0.1",name="bVrN1Hx"} 0
355370
# HELP elasticsearch_os_cpu_percent Percent CPU used by OS
356371
# TYPE elasticsearch_os_cpu_percent gauge
357372
elasticsearch_os_cpu_percent{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="127.0.0.1",name="bVrN1Hx"} 23
@@ -811,6 +826,21 @@ func TestNodesStats(t *testing.T) {
811826
elasticsearch_nodes_roles{cluster="elasticsearch",host="172.17.0.2",name="9_P7yui",role="ml"} 0
812827
elasticsearch_nodes_roles{cluster="elasticsearch",host="172.17.0.2",name="9_P7yui",role="remote_cluster_client"} 0
813828
elasticsearch_nodes_roles{cluster="elasticsearch",host="172.17.0.2",name="9_P7yui",role="transform"} 0
829+
# HELP elasticsearch_os_cgroup_cpu_cfs_period_micros CPU period length in microseconds (Linux CFS bandwidth control)
830+
# TYPE elasticsearch_os_cgroup_cpu_cfs_period_micros gauge
831+
elasticsearch_os_cgroup_cpu_cfs_period_micros{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="9_P7yui"} 100000
832+
# HELP elasticsearch_os_cgroup_cpu_cfs_quota_micros CPU quota per CPU period (cgroup_cfs_period_micros) in microseconds (Linux CFS bandwidth control)
833+
# TYPE elasticsearch_os_cgroup_cpu_cfs_quota_micros gauge
834+
elasticsearch_os_cgroup_cpu_cfs_quota_micros{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="9_P7yui"} -1
835+
# HELP elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled Number of times the process has been throttled (Linux CFS bandwidth control)
836+
# TYPE elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled counter
837+
elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="9_P7yui"} 0
838+
# HELP elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos Total time duration (in nanoseconds) for which the process has been throttled (Linux CFS bandwidth control)
839+
# TYPE elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos counter
840+
elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="9_P7yui"} 0
841+
# HELP elasticsearch_os_cgroup_cpuacct_usage_nanos Total CPU usage in nanoseconds (Linux cgroups CPU accounting)
842+
# TYPE elasticsearch_os_cgroup_cpuacct_usage_nanos counter
843+
elasticsearch_os_cgroup_cpuacct_usage_nanos{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="9_P7yui"} 3.3206615382e+10
814844
# HELP elasticsearch_os_cpu_percent Percent CPU used by OS
815845
# TYPE elasticsearch_os_cpu_percent gauge
816846
elasticsearch_os_cpu_percent{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="9_P7yui"} 30
@@ -1334,6 +1364,21 @@ func TestNodesStats(t *testing.T) {
13341364
elasticsearch_nodes_roles{cluster="elasticsearch",host="172.17.0.2",name="aaf5a8a0bceb",role="ml"} 1
13351365
elasticsearch_nodes_roles{cluster="elasticsearch",host="172.17.0.2",name="aaf5a8a0bceb",role="remote_cluster_client"} 1
13361366
elasticsearch_nodes_roles{cluster="elasticsearch",host="172.17.0.2",name="aaf5a8a0bceb",role="transform"} 1
1367+
# HELP elasticsearch_os_cgroup_cpu_cfs_period_micros CPU period length in microseconds (Linux CFS bandwidth control)
1368+
# TYPE elasticsearch_os_cgroup_cpu_cfs_period_micros gauge
1369+
elasticsearch_os_cgroup_cpu_cfs_period_micros{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="aaf5a8a0bceb"} 100000
1370+
# HELP elasticsearch_os_cgroup_cpu_cfs_quota_micros CPU quota per CPU period (cgroup_cfs_period_micros) in microseconds (Linux CFS bandwidth control)
1371+
# TYPE elasticsearch_os_cgroup_cpu_cfs_quota_micros gauge
1372+
elasticsearch_os_cgroup_cpu_cfs_quota_micros{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="aaf5a8a0bceb"} -1
1373+
# HELP elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled Number of times the process has been throttled (Linux CFS bandwidth control)
1374+
# TYPE elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled counter
1375+
elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="aaf5a8a0bceb"} 0
1376+
# HELP elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos Total time duration (in nanoseconds) for which the process has been throttled (Linux CFS bandwidth control)
1377+
# TYPE elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos counter
1378+
elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="aaf5a8a0bceb"} 0
1379+
# HELP elasticsearch_os_cgroup_cpuacct_usage_nanos Total CPU usage in nanoseconds (Linux cgroups CPU accounting)
1380+
# TYPE elasticsearch_os_cgroup_cpuacct_usage_nanos counter
1381+
elasticsearch_os_cgroup_cpuacct_usage_nanos{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="aaf5a8a0bceb"} 5.2445263941e+10
13371382
# HELP elasticsearch_os_cpu_percent Percent CPU used by OS
13381383
# TYPE elasticsearch_os_cpu_percent gauge
13391384
elasticsearch_os_cpu_percent{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="aaf5a8a0bceb"} 37

0 commit comments

Comments
 (0)