diff --git a/README.md b/README.md
index f813534d..9fa64a85 100644
--- a/README.md
+++ b/README.md
@@ -221,6 +221,11 @@ Further Information
 | elasticsearch_jvm_memory_pool_max_bytes | counter | 3 | JVM memory max by pool |
 | elasticsearch_jvm_memory_pool_peak_used_bytes | counter | 3 | JVM memory peak used by pool |
 | elasticsearch_jvm_memory_pool_peak_max_bytes | counter | 3 | JVM memory peak max by pool |
+| elasticsearch_os_cgroup_cpu_cfs_period_micros | gauge | 1 | CPU period length in microseconds (Linux CFS bandwidth control) |
+| elasticsearch_os_cgroup_cpu_cfs_quota_micros | gauge | 1 | CPU quota per CPU period (cgroup_cpu_cfs_period_micros) in microseconds (Linux CFS bandwidth control) |
+| elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled | counter | 1 | Number of times the process has been throttled (Linux CFS bandwidth control) |
+| elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos | counter | 1 | Total time (in nanoseconds) for which the process has been throttled (Linux CFS bandwidth control) |
+| elasticsearch_os_cgroup_cpuacct_usage_nanos | counter | 1 | Total CPU usage in nanoseconds (Linux cgroups CPU accounting) |
 | elasticsearch_os_cpu_percent | gauge | 1 | Percent CPU used by the OS |
 | elasticsearch_os_load1 | gauge | 1 | Shortterm load average |
 | elasticsearch_os_load5 | gauge | 1 | Midterm load average |
diff --git a/collector/nodes.go b/collector/nodes.go
index 527870f9..9f33b2de 100644
--- a/collector/nodes.go
+++ b/collector/nodes.go
@@ -286,6 +286,66 @@ func NewNodes(logger *slog.Logger, client *http.Client, url *url.URL, all bool,
 			},
 			Labels: defaultNodeLabelValues,
 		},
+		{
+			Type: prometheus.GaugeValue,
+			Desc: prometheus.NewDesc(
+				prometheus.BuildFQName(namespace, "os", "cgroup_cpu_cfs_period_micros"),
+				"CPU period length in microseconds (Linux CFS bandwidth control)",
+				defaultNodeLabels, nil,
+			),
+			Value: func(node NodeStatsNodeResponse) float64 {
+				return float64(node.OS.Cgroup.CPU.CfsPeriodMicros)
+			},
+			Labels: defaultNodeLabelValues,
+		},
+		{
+			Type: prometheus.GaugeValue,
+			Desc: prometheus.NewDesc(
+				prometheus.BuildFQName(namespace, "os", "cgroup_cpu_cfs_quota_micros"),
+				"CPU quota per CPU period (cgroup_cpu_cfs_period_micros) in microseconds (Linux CFS bandwidth control)",
+				defaultNodeLabels, nil,
+			),
+			Value: func(node NodeStatsNodeResponse) float64 {
+				return float64(node.OS.Cgroup.CPU.CfsQuotaMicros)
+			},
+			Labels: defaultNodeLabelValues,
+		},
+		{
+			Type: prometheus.CounterValue,
+			Desc: prometheus.NewDesc(
+				prometheus.BuildFQName(namespace, "os", "cgroup_cpu_stat_number_of_times_throttled"),
+				"Number of times the process has been throttled (Linux CFS bandwidth control)",
+				defaultNodeLabels, nil,
+			),
+			Value: func(node NodeStatsNodeResponse) float64 {
+				return float64(node.OS.Cgroup.CPU.Stat.NumberOfTimesThrottled)
+			},
+			Labels: defaultNodeLabelValues,
+		},
+		{
+			Type: prometheus.CounterValue,
+			Desc: prometheus.NewDesc(
+				prometheus.BuildFQName(namespace, "os", "cgroup_cpu_stat_time_throttled_nanos"),
+				"Total time (in nanoseconds) for which the process has been throttled (Linux CFS bandwidth control)",
+				defaultNodeLabels, nil,
+			),
+			Value: func(node NodeStatsNodeResponse) float64 {
+				return float64(node.OS.Cgroup.CPU.Stat.TimeThrottledNanos)
+			},
+			Labels: defaultNodeLabelValues,
+		},
+		{
+			Type: prometheus.CounterValue,
+			Desc: prometheus.NewDesc(
+				prometheus.BuildFQName(namespace, "os", "cgroup_cpuacct_usage_nanos"),
+				"Total CPU usage in nanoseconds (Linux cgroups CPU accounting)",
+				defaultNodeLabels, nil,
+			),
+			Value: func(node NodeStatsNodeResponse) float64 {
+				return float64(node.OS.Cgroup.CPUAcct.UsageNanos)
+			},
+			Labels: defaultNodeLabelValues,
+		},
 		{
 			Type: prometheus.GaugeValue,
 			Desc: prometheus.NewDesc(
diff --git a/collector/nodes_response.go b/collector/nodes_response.go
index 6ba2ad7b..f73e7af9 100644
--- a/collector/nodes_response.go
+++ b/collector/nodes_response.go
@@ -13,7 +13,9 @@
 
 package collector
 
-import "encoding/json"
+import (
+	"encoding/json"
+)
 
 // nodeStatsResponse is a representation of an Elasticsearch Node Stats
 type nodeStatsResponse struct {
@@ -289,10 +291,11 @@ type NodeStatsOSResponse struct {
 	Uptime int64 `json:"uptime_in_millis"`
 	// LoadAvg was an array of per-cpu values pre-2.0, and is a string in 2.0
 	// Leaving this here in case we want to implement parsing logic later
-	LoadAvg json.RawMessage         `json:"load_average"`
-	CPU     NodeStatsOSCPUResponse  `json:"cpu"`
-	Mem     NodeStatsOSMemResponse  `json:"mem"`
-	Swap    NodeStatsOSSwapResponse `json:"swap"`
+	LoadAvg json.RawMessage           `json:"load_average"`
+	Cgroup  NodeStatsOSCgroupResponse `json:"cgroup"`
+	CPU     NodeStatsOSCPUResponse    `json:"cpu"`
+	Mem     NodeStatsOSMemResponse    `json:"mem"`
+	Swap    NodeStatsOSSwapResponse   `json:"swap"`
 }
 
 // NodeStatsOSMemResponse defines node stats operating system memory usage structure
@@ -322,6 +325,30 @@ type NodeStatsOSCPULoadResponse struct {
 	Load15 float64 `json:"15m"`
 }
 
+// NodeStatsOSCgroupResponse defines statistics related to Linux control groups (currently only CPU-related)
+type NodeStatsOSCgroupResponse struct {
+	CPU     NodeStatsOSCgroupCPUResponse     `json:"cpu"`
+	CPUAcct NodeStatsOSCgroupCPUAcctResponse `json:"cpuacct"`
+}
+
+// NodeStatsOSCgroupCPUResponse represents the current CPU quota (quota value and the corresponding period), as well as the related CPU throttling stats (Linux CFS bandwidth control)
+type NodeStatsOSCgroupCPUResponse struct {
+	CfsPeriodMicros int64                             `json:"cfs_period_micros"`
+	CfsQuotaMicros  int64                             `json:"cfs_quota_micros"`
+	Stat            NodeStatsOSCgroupCPUStatsResponse `json:"stat"`
+}
+
+// NodeStatsOSCgroupCPUStatsResponse represents the CPU throttling stats (Linux CFS bandwidth control)
+type NodeStatsOSCgroupCPUStatsResponse struct {
+	NumberOfTimesThrottled int64 `json:"number_of_times_throttled"`
+	TimeThrottledNanos     int64 `json:"time_throttled_nanos"`
+}
+
+// NodeStatsOSCgroupCPUAcctResponse represents the Linux control groups CPU accounting stats
+type NodeStatsOSCgroupCPUAcctResponse struct {
+	UsageNanos int64 `json:"usage_nanos"`
+}
+
 // NodeStatsProcessResponse is a representation of a process statistics, memory consumption, cpu usage, open file descriptors
 type NodeStatsProcessResponse struct {
 	Timestamp int64 `json:"timestamp"`
diff --git a/collector/nodes_test.go b/collector/nodes_test.go
index 9e731837..d6fe1e68 100644
--- a/collector/nodes_test.go
+++ b/collector/nodes_test.go
@@ -352,6 +352,21 @@ func TestNodesStats(t *testing.T) {
 elasticsearch_nodes_roles{cluster="elasticsearch",host="127.0.0.1",name="bVrN1Hx",role="ml"} 0
 elasticsearch_nodes_roles{cluster="elasticsearch",host="127.0.0.1",name="bVrN1Hx",role="remote_cluster_client"} 0
 elasticsearch_nodes_roles{cluster="elasticsearch",host="127.0.0.1",name="bVrN1Hx",role="transform"} 0
+# HELP elasticsearch_os_cgroup_cpu_cfs_period_micros CPU period length in microseconds (Linux CFS bandwidth control)
+# TYPE elasticsearch_os_cgroup_cpu_cfs_period_micros gauge
+elasticsearch_os_cgroup_cpu_cfs_period_micros{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="127.0.0.1",name="bVrN1Hx"} 0
+# HELP elasticsearch_os_cgroup_cpu_cfs_quota_micros CPU quota per CPU period (cgroup_cpu_cfs_period_micros) in microseconds (Linux CFS bandwidth control)
+# TYPE elasticsearch_os_cgroup_cpu_cfs_quota_micros gauge
+elasticsearch_os_cgroup_cpu_cfs_quota_micros{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="127.0.0.1",name="bVrN1Hx"} 0
+# HELP elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled Number of times the process has been throttled (Linux CFS bandwidth control)
+# TYPE elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled counter
+elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="127.0.0.1",name="bVrN1Hx"} 0
+# HELP elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos Total time (in nanoseconds) for which the process has been throttled (Linux CFS bandwidth control)
+# TYPE elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos counter
+elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="127.0.0.1",name="bVrN1Hx"} 0
+# HELP elasticsearch_os_cgroup_cpuacct_usage_nanos Total CPU usage in nanoseconds (Linux cgroups CPU accounting)
+# TYPE elasticsearch_os_cgroup_cpuacct_usage_nanos counter
+elasticsearch_os_cgroup_cpuacct_usage_nanos{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="127.0.0.1",name="bVrN1Hx"} 0
 # HELP elasticsearch_os_cpu_percent Percent CPU used by OS
 # TYPE elasticsearch_os_cpu_percent gauge
 elasticsearch_os_cpu_percent{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="127.0.0.1",name="bVrN1Hx"} 23
@@ -811,6 +826,21 @@ func TestNodesStats(t *testing.T) {
 elasticsearch_nodes_roles{cluster="elasticsearch",host="172.17.0.2",name="9_P7yui",role="ml"} 0
 elasticsearch_nodes_roles{cluster="elasticsearch",host="172.17.0.2",name="9_P7yui",role="remote_cluster_client"} 0
 elasticsearch_nodes_roles{cluster="elasticsearch",host="172.17.0.2",name="9_P7yui",role="transform"} 0
+# HELP elasticsearch_os_cgroup_cpu_cfs_period_micros CPU period length in microseconds (Linux CFS bandwidth control)
+# TYPE elasticsearch_os_cgroup_cpu_cfs_period_micros gauge
+elasticsearch_os_cgroup_cpu_cfs_period_micros{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="9_P7yui"} 100000
+# HELP elasticsearch_os_cgroup_cpu_cfs_quota_micros CPU quota per CPU period (cgroup_cpu_cfs_period_micros) in microseconds (Linux CFS bandwidth control)
+# TYPE elasticsearch_os_cgroup_cpu_cfs_quota_micros gauge
+elasticsearch_os_cgroup_cpu_cfs_quota_micros{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="9_P7yui"} -1
+# HELP elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled Number of times the process has been throttled (Linux CFS bandwidth control)
+# TYPE elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled counter
+elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="9_P7yui"} 0
+# HELP elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos Total time (in nanoseconds) for which the process has been throttled (Linux CFS bandwidth control)
+# TYPE elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos counter
+elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="9_P7yui"} 0
+# HELP elasticsearch_os_cgroup_cpuacct_usage_nanos Total CPU usage in nanoseconds (Linux cgroups CPU accounting)
+# TYPE elasticsearch_os_cgroup_cpuacct_usage_nanos counter
+elasticsearch_os_cgroup_cpuacct_usage_nanos{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="9_P7yui"} 3.3206615382e+10
 # HELP elasticsearch_os_cpu_percent Percent CPU used by OS
 # TYPE elasticsearch_os_cpu_percent gauge
 elasticsearch_os_cpu_percent{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="9_P7yui"} 30
@@ -1334,6 +1364,21 @@ func TestNodesStats(t *testing.T) {
 elasticsearch_nodes_roles{cluster="elasticsearch",host="172.17.0.2",name="aaf5a8a0bceb",role="ml"} 1
 elasticsearch_nodes_roles{cluster="elasticsearch",host="172.17.0.2",name="aaf5a8a0bceb",role="remote_cluster_client"} 1
 elasticsearch_nodes_roles{cluster="elasticsearch",host="172.17.0.2",name="aaf5a8a0bceb",role="transform"} 1
+# HELP elasticsearch_os_cgroup_cpu_cfs_period_micros CPU period length in microseconds (Linux CFS bandwidth control)
+# TYPE elasticsearch_os_cgroup_cpu_cfs_period_micros gauge
+elasticsearch_os_cgroup_cpu_cfs_period_micros{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="aaf5a8a0bceb"} 100000
+# HELP elasticsearch_os_cgroup_cpu_cfs_quota_micros CPU quota per CPU period (cgroup_cpu_cfs_period_micros) in microseconds (Linux CFS bandwidth control)
+# TYPE elasticsearch_os_cgroup_cpu_cfs_quota_micros gauge
+elasticsearch_os_cgroup_cpu_cfs_quota_micros{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="aaf5a8a0bceb"} -1
+# HELP elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled Number of times the process has been throttled (Linux CFS bandwidth control)
+# TYPE elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled counter
+elasticsearch_os_cgroup_cpu_stat_number_of_times_throttled{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="aaf5a8a0bceb"} 0
+# HELP elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos Total time (in nanoseconds) for which the process has been throttled (Linux CFS bandwidth control)
+# TYPE elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos counter
+elasticsearch_os_cgroup_cpu_stat_time_throttled_nanos{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="aaf5a8a0bceb"} 0
+# HELP elasticsearch_os_cgroup_cpuacct_usage_nanos Total CPU usage in nanoseconds (Linux cgroups CPU accounting)
+# TYPE elasticsearch_os_cgroup_cpuacct_usage_nanos counter
+elasticsearch_os_cgroup_cpuacct_usage_nanos{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="aaf5a8a0bceb"} 5.2445263941e+10
 # HELP elasticsearch_os_cpu_percent Percent CPU used by OS
 # TYPE elasticsearch_os_cpu_percent gauge
 elasticsearch_os_cpu_percent{cluster="elasticsearch",es_client_node="true",es_data_node="true",es_ingest_node="true",es_master_node="true",host="172.17.0.2",name="aaf5a8a0bceb"} 37
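
For reference, a minimal, self-contained sketch (not part of the patch) of how the structs added in collector/nodes_response.go map onto the "os.cgroup" object of a GET _nodes/stats payload. The struct definitions are trimmed copies of the ones above; the sample values mirror the second test fixture (period 100000, quota -1, usage 33206615382).

// cgroup_decode_sketch.go — illustrative only; run with `go run`.
package main

import (
	"encoding/json"
	"fmt"
)

// Trimmed copies of the structs this patch adds to collector/nodes_response.go.
type NodeStatsOSCgroupResponse struct {
	CPU     NodeStatsOSCgroupCPUResponse     `json:"cpu"`
	CPUAcct NodeStatsOSCgroupCPUAcctResponse `json:"cpuacct"`
}

type NodeStatsOSCgroupCPUResponse struct {
	CfsPeriodMicros int64                             `json:"cfs_period_micros"`
	CfsQuotaMicros  int64                             `json:"cfs_quota_micros"`
	Stat            NodeStatsOSCgroupCPUStatsResponse `json:"stat"`
}

type NodeStatsOSCgroupCPUStatsResponse struct {
	NumberOfTimesThrottled int64 `json:"number_of_times_throttled"`
	TimeThrottledNanos     int64 `json:"time_throttled_nanos"`
}

type NodeStatsOSCgroupCPUAcctResponse struct {
	UsageNanos int64 `json:"usage_nanos"`
}

func main() {
	// Shape follows the "os.cgroup" section of the node-stats API; the values
	// are taken from the second test fixture above (unthrottled, no quota).
	payload := []byte(`{
		"cpu": {
			"cfs_period_micros": 100000,
			"cfs_quota_micros": -1,
			"stat": {"number_of_times_throttled": 0, "time_throttled_nanos": 0}
		},
		"cpuacct": {"usage_nanos": 33206615382}
	}`)

	var cg NodeStatsOSCgroupResponse
	if err := json.Unmarshal(payload, &cg); err != nil {
		panic(err)
	}

	// These are exactly the values the new collector entries export as float64 samples.
	fmt.Println("period_micros:", cg.CPU.CfsPeriodMicros) // 100000
	fmt.Println("quota_micros:", cg.CPU.CfsQuotaMicros)   // -1
	fmt.Println("usage_nanos:", cg.CPUAcct.UsageNanos)    // 33206615382
}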
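One interpretive note on the fixture values: under Linux CFS bandwidth control, a cfs_quota_micros of -1 means no CPU limit is set; when a quota is set, quota divided by period gives the effective number of CPUs (e.g. 50000/100000 = 0.5 CPUs). A hedged illustration of combining the two new gauges (effectiveCPULimit is hypothetical and not part of the exporter):

package main

import "fmt"

// effectiveCPULimit returns the number of CPUs the cgroup may use, and
// whether a limit is set at all. A non-positive quota (the -1 sentinel in
// the fixtures above) means CFS bandwidth control imposes no limit.
func effectiveCPULimit(quotaMicros, periodMicros int64) (float64, bool) {
	if quotaMicros <= 0 || periodMicros <= 0 {
		return 0, false
	}
	return float64(quotaMicros) / float64(periodMicros), true
}

func main() {
	if limit, ok := effectiveCPULimit(-1, 100000); !ok {
		fmt.Println("no CPU limit") // matches the -1 quota in the test fixtures
	} else {
		fmt.Println("limit:", limit)
	}
}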