Skip to content

Add cgroups CPU quota and throttling metrics #1039

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,11 @@ Further Information
| elasticsearch_jvm_memory_pool_max_bytes | counter | 3 | JVM memory max by pool |
| elasticsearch_jvm_memory_pool_peak_used_bytes | counter | 3 | JVM memory peak used by pool |
| elasticsearch_jvm_memory_pool_peak_max_bytes | counter | 3 | JVM memory peak max by pool |
| elasticsearch_os_cgroup_cpu_cfs_period_micros | gauge | 1 | CPU period length in microseconds (Linux CFS bandwidth control) |
| elasticsearch_os_cgroup_cpu_cfs_quota_micros | gauge | 1 | CPU quota per CPU period (cgroup_cfs_period_micros) in microseconds (Linux CFS bandwidth control) |
| elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled | counter | 1 | Number of times the process has been throttled (Linux CFS bandwidth control) |
| elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos | counter | 1 | Total time duration (in nanoseconds) for which the process has been throttled (Linux CFS bandwidth control) |
| elasticsearch_os_cgroup_cpuacct_usage_nanos | counter | 1 | Total CPU usage in nanoseconds (Linux cgroups CPU accounting) |
| elasticsearch_os_cpu_percent | gauge | 1 | Percent CPU used by the OS |
| elasticsearch_os_load1 | gauge | 1 | Short-term load average |
| elasticsearch_os_load5 | gauge | 1 | Mid-term load average |
Expand Down
60 changes: 60 additions & 0 deletions collector/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,66 @@ func NewNodes(logger *slog.Logger, client *http.Client, url *url.URL, all bool,
},
Labels: defaultNodeLabelValues,
},
{
Type: prometheus.GaugeValue,
Desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, "os", "cgroup_cpu_cfs_period_micros"),
"CPU period length in microseconds (Linux CFS bandwidth control)",
defaultNodeLabels, nil,
),
Value: func(node NodeStatsNodeResponse) float64 {
return float64(node.OS.Cgroup.CPU.CfsPeriodMicros)
},
Labels: defaultNodeLabelValues,
},
{
Type: prometheus.GaugeValue,
Desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, "os", "cgroup_cpu_cfs_quota_micros"),
"CPU quota per CPU period (cgroup_cfs_period_micros) in microseconds (Linux CFS bandwidth control)",
defaultNodeLabels, nil,
),
Value: func(node NodeStatsNodeResponse) float64 {
return float64(node.OS.Cgroup.CPU.CfsQuotaMicros)
},
Labels: defaultNodeLabelValues,
},
{
Type: prometheus.CounterValue,
Desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, "os", "cgroup_cpu_stat_number_of_times_throttled"),
"Number of times the process has been throttled (Linux CFS bandwidth control)",
defaultNodeLabels, nil,
),
Value: func(node NodeStatsNodeResponse) float64 {
return float64(node.OS.Cgroup.CPU.Stat.NumberOfTimesThrottled)
},
Labels: defaultNodeLabelValues,
},
{
Type: prometheus.CounterValue,
Desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, "os", "cgroup_cpu_stat_time_throttled_nanos"),
"Total time duration (in nanoseconds) for which the process has been throttled (Linux CFS bandwidth control)",
defaultNodeLabels, nil,
),
Value: func(node NodeStatsNodeResponse) float64 {
return float64(node.OS.Cgroup.CPU.Stat.TimeThrottledNanos)
},
Labels: defaultNodeLabelValues,
},
{
Type: prometheus.CounterValue,
Desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, "os", "cgroup_cpuacct_usage_nanos"),
"Total CPU usage in nanoseconds (Linux cgroups CPU accounting)",
defaultNodeLabels, nil,
),
Value: func(node NodeStatsNodeResponse) float64 {
return float64(node.OS.Cgroup.CPUAcct.UsageNanos)
},
Labels: defaultNodeLabelValues,
},
{
Type: prometheus.GaugeValue,
Desc: prometheus.NewDesc(
Expand Down
37 changes: 32 additions & 5 deletions collector/nodes_response.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@

package collector

import "encoding/json"
import (
"encoding/json"
)

// nodeStatsResponse is a representation of an Elasticsearch Node Stats
type nodeStatsResponse struct {
Expand Down Expand Up @@ -289,10 +291,11 @@ type NodeStatsOSResponse struct {
Uptime int64 `json:"uptime_in_millis"`
// LoadAvg was an array of per-cpu values pre-2.0, and is a string in 2.0
// Leaving this here in case we want to implement parsing logic later
LoadAvg json.RawMessage `json:"load_average"`
CPU NodeStatsOSCPUResponse `json:"cpu"`
Mem NodeStatsOSMemResponse `json:"mem"`
Swap NodeStatsOSSwapResponse `json:"swap"`
LoadAvg json.RawMessage `json:"load_average"`
Cgroup NodeStatsOSCgroupResponse `json:"cgroup"`
CPU NodeStatsOSCPUResponse `json:"cpu"`
Mem NodeStatsOSMemResponse `json:"mem"`
Swap NodeStatsOSSwapResponse `json:"swap"`
}

// NodeStatsOSMemResponse defines node stats operating system memory usage structure
Expand Down Expand Up @@ -322,6 +325,30 @@ type NodeStatsOSCPULoadResponse struct {
Load15 float64 `json:"15m"`
}

// NodeStatsOSCgroupResponse defines statistics related to Linux control groups (currently only CPU-related)
type NodeStatsOSCgroupResponse struct {
CPU NodeStatsOSCgroupCPUResponse `json:"cpu"`
CPUAcct NodeStatsOCCgroupCPUAcctResponse `json:"cpuacct"`
}

// NodeStatsOSCgroupCPUResponse represents the current CPU quota (quota value and the corresponding period), as well as the related CPU throttling stats (Linux CFS bandwidth control)
type NodeStatsOSCgroupCPUResponse struct {
CfsPeriodMicros int64 `json:"cfs_period_micros"`
CfsQuotaMicros int64 `json:"cfs_quota_micros"`
Stat NodeStatsOSCgroupCPUStatsResponse `json:"stat"`
}

// NodeStatsOSCgroupCPUStatsResponse represents the CPU throttling stats (Linux CFS bandwidth control)
type NodeStatsOSCgroupCPUStatsResponse struct {
NumberOfTimesThrottled int64 `json:"number_of_times_throttled"`
TimeThrottledNanos int64 `json:"time_throttled_nanos"`
}

// NodeStatsOCCgroupCPUAcctResponse represents the Linux control groups CPU accounting stats
type NodeStatsOCCgroupCPUAcctResponse struct {
UsageNanos int64 `json:"usage_nanos"`
}

// NodeStatsProcessResponse is a representation of a process statistics, memory consumption, cpu usage, open file descriptors
type NodeStatsProcessResponse struct {
Timestamp int64 `json:"timestamp"`
Expand Down
45 changes: 45 additions & 0 deletions collector/nodes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,21 @@ func TestNodesStats(t *testing.T) {
elasticsearch_nodes_roles{cluster="elasticsearch",host="127.0.0.1",name="bVrN1Hx",role="ml"} 0
elasticsearch_nodes_roles{cluster="elasticsearch",host="127.0.0.1",name="bVrN1Hx",role="remote_cluster_client"} 0
elasticsearch_nodes_roles{cluster="elasticsearch",host="127.0.0.1",name="bVrN1Hx",role="transform"} 0
# HELP elasticsearch_os_cgroup_cpu_cfs_period_micros CPU period length in microseconds (Linux CFS bandwidth control)
# TYPE elasticsearch_os_cgroup_cpu_cfs_period_micros gauge
elasticsearch_os_cgroup_cpu_cfs_period_micros{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="127.0.0.1",name="bVrN1Hx"} 0
# HELP elasticsearch_os_cgroup_cpu_cfs_quota_micros CPU quota per CPU period (cgroup_cfs_period_micros) in microseconds (Linux CFS bandwidth control)
# TYPE elasticsearch_os_cgroup_cpu_cfs_quota_micros gauge
elasticsearch_os_cgroup_cpu_cfs_quota_micros{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="127.0.0.1",name="bVrN1Hx"} 0
# HELP elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled Number of times the process has been throttled (Linux CFS bandwidth control)
# TYPE elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled counter
elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="127.0.0.1",name="bVrN1Hx"} 0
# HELP elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos Total time duration (in nanoseconds) for which the process has been throttled (Linux CFS bandwidth control)
# TYPE elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos counter
elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="127.0.0.1",name="bVrN1Hx"} 0
# HELP elasticsearch_os_cgroup_cpuacct_usage_nanos Total CPU usage in nanoseconds (Linux cgroups CPU accounting)
# TYPE elasticsearch_os_cgroup_cpuacct_usage_nanos counter
elasticsearch_os_cgroup_cpuacct_usage_nanos{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="127.0.0.1",name="bVrN1Hx"} 0
# HELP elasticsearch_os_cpu_percent Percent CPU used by OS
# TYPE elasticsearch_os_cpu_percent gauge
elasticsearch_os_cpu_percent{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="127.0.0.1",name="bVrN1Hx"} 23
Expand Down Expand Up @@ -811,6 +826,21 @@ func TestNodesStats(t *testing.T) {
elasticsearch_nodes_roles{cluster="elasticsearch",host="172.17.0.2",name="9_P7yui",role="ml"} 0
elasticsearch_nodes_roles{cluster="elasticsearch",host="172.17.0.2",name="9_P7yui",role="remote_cluster_client"} 0
elasticsearch_nodes_roles{cluster="elasticsearch",host="172.17.0.2",name="9_P7yui",role="transform"} 0
# HELP elasticsearch_os_cgroup_cpu_cfs_period_micros CPU period length in microseconds (Linux CFS bandwidth control)
# TYPE elasticsearch_os_cgroup_cpu_cfs_period_micros gauge
elasticsearch_os_cgroup_cpu_cfs_period_micros{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="9_P7yui"} 100000
# HELP elasticsearch_os_cgroup_cpu_cfs_quota_micros CPU quota per CPU period (cgroup_cfs_period_micros) in microseconds (Linux CFS bandwidth control)
# TYPE elasticsearch_os_cgroup_cpu_cfs_quota_micros gauge
elasticsearch_os_cgroup_cpu_cfs_quota_micros{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="9_P7yui"} -1
# HELP elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled Number of times the process has been throttled (Linux CFS bandwidth control)
# TYPE elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled counter
elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="9_P7yui"} 0
# HELP elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos Total time duration (in nanoseconds) for which the process has been throttled (Linux CFS bandwidth control)
# TYPE elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos counter
elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="9_P7yui"} 0
# HELP elasticsearch_os_cgroup_cpuacct_usage_nanos Total CPU usage in nanoseconds (Linux cgroups CPU accounting)
# TYPE elasticsearch_os_cgroup_cpuacct_usage_nanos counter
elasticsearch_os_cgroup_cpuacct_usage_nanos{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="9_P7yui"} 3.3206615382e+10
# HELP elasticsearch_os_cpu_percent Percent CPU used by OS
# TYPE elasticsearch_os_cpu_percent gauge
elasticsearch_os_cpu_percent{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="9_P7yui"} 30
Expand Down Expand Up @@ -1334,6 +1364,21 @@ func TestNodesStats(t *testing.T) {
elasticsearch_nodes_roles{cluster="elasticsearch",host="172.17.0.2",name="aaf5a8a0bceb",role="ml"} 1
elasticsearch_nodes_roles{cluster="elasticsearch",host="172.17.0.2",name="aaf5a8a0bceb",role="remote_cluster_client"} 1
elasticsearch_nodes_roles{cluster="elasticsearch",host="172.17.0.2",name="aaf5a8a0bceb",role="transform"} 1
# HELP elasticsearch_os_cgroup_cpu_cfs_period_micros CPU period length in microseconds (Linux CFS bandwidth control)
# TYPE elasticsearch_os_cgroup_cpu_cfs_period_micros gauge
elasticsearch_os_cgroup_cpu_cfs_period_micros{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="aaf5a8a0bceb"} 100000
# HELP elasticsearch_os_cgroup_cpu_cfs_quota_micros CPU quota per CPU period (cgroup_cfs_period_micros) in microseconds (Linux CFS bandwidth control)
# TYPE elasticsearch_os_cgroup_cpu_cfs_quota_micros gauge
elasticsearch_os_cgroup_cpu_cfs_quota_micros{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="aaf5a8a0bceb"} -1
# HELP elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled Number of times the process has been throttled (Linux CFS bandwidth control)
# TYPE elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled counter
elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="aaf5a8a0bceb"} 0
# HELP elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos Total time duration (in nanoseconds) for which the process has been throttled (Linux CFS bandwidth control)
# TYPE elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos counter
elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="aaf5a8a0bceb"} 0
# HELP elasticsearch_os_cgroup_cpuacct_usage_nanos Total CPU usage in nanoseconds (Linux cgroups CPU accounting)
# TYPE elasticsearch_os_cgroup_cpuacct_usage_nanos counter
elasticsearch_os_cgroup_cpuacct_usage_nanos{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="aaf5a8a0bceb"} 5.2445263941e+10
# HELP elasticsearch_os_cpu_percent Percent CPU used by OS
# TYPE elasticsearch_os_cpu_percent gauge
elasticsearch_os_cpu_percent{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="aaf5a8a0bceb"} 37
Expand Down