Skip to content

Commit 18023e7

Browse files
committed
CP-8403 add telegraf-based metric collection
Initial addition of configuration and control files to enable performance metric collection using the Telegraf agent. See also IDEA-2835: Improving Support Bundle Performance Metrics.

Includes:
- Service definition and startup script for "delphix-telegraf"
- Modified version of "estat" adding JSON output via a "-j" option
- A "perf_playbook" wrapper script to enable/disable enhanced collection
- Configuration file sections (combined on startup)
- Simple wrappers to facilitate parsing of "nfs_threads", "zpool iostat -o", and "zcache stats -a" outputs

The service starts with a "base" set of metrics, but will include Object Storage metrics when object storage is detected, and will include Performance Playbook commands if the playbook has been enabled (manually). The config is reassembled on each startup.

File paths intended:
- /opt/delphix/server/bin/delphix-telegraf-service
- /lib/systemd/system/delphix-telegraf.service
- /etc/telegraf/nfs-threads.sh
- /opt/delphix/server/bin/perf_playbook
- /etc/telegraf/telegraf.base
- /etc/telegraf/telegraf.inputs.dose
- /etc/telegraf/telegraf.inputs.playbook
- /etc/telegraf/zcache-stats.sh
- /etc/telegraf/zpool-iostat-o.sh

This configuration records 4 output files (rotated on size), one each for main metrics, ZFS metrics, aggregate statistics (min, max, mean, stddev) and Playbook outputs, to enable independent retention periods.
1 parent e20d662 commit 18023e7

11 files changed

+459
-8
lines changed

cmd/estat.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ def die(*args, **kwargs):
8989
-q/-Q enable/disable latency histograms by size (default: off)
9090
-y/-Y enable/disable the summary output (default: on)
9191
-t/-T enable/disable emitting the summary total (default: on)
92+
-j set output mode to JSON
9293
-d LEVEL set BCC debug level
9394
-e emit the resulting eBPF script without executing it
9495
@@ -111,7 +112,6 @@ def die(*args, **kwargs):
111112
particular the time spent allocating a block and time spent waiting for
112113
the write I/O to complete. If POOL is not specified, defaults to tracing
113114
the pool 'domain0'.
114-
115115
"""
116116

117117

@@ -149,6 +149,7 @@ def usage(msg):
149149
script_arg = None
150150
debug_level = 0
151151
dump_bpf = False
152+
output_mode = BCCHelper.ESTAT_PRINT_MODE
152153

153154

154155
class Args:
@@ -161,6 +162,7 @@ class Args:
161162
setattr(args, "latsize_hist", False)
162163
setattr(args, "summary", True)
163164
setattr(args, "total", True)
165+
setattr(args, "json", False)
164166

165167
#
166168
# We use getopt rather than argparse because it is very difficult to get
@@ -170,7 +172,7 @@ class Args:
170172
# arguments.
171173
#
172174
try:
173-
opts, rem_args = getopt.getopt(sys.argv[2:], "hmMa:lLzZqQyYnNtTd:e")
175+
opts, rem_args = getopt.getopt(sys.argv[2:], "hmMa:lLjzZqQyYnNtTd:e")
174176
except getopt.GetoptError as err:
175177
die(err)
176178

@@ -194,6 +196,7 @@ class Args:
194196
dump_bpf = True
195197
else:
196198
switches = {'-l': "lat_hist",
199+
'-j': "json",
197200
'-z': "size_hist",
198201
'-q': "latsize_hist",
199202
'-y': "summary",
@@ -219,6 +222,9 @@ class Args:
219222
if not (args.lat_hist or args.size_hist or args.latsize_hist):
220223
args.lat_hist = True
221224

225+
if args.json:
226+
output_mode = BCCHelper.ANALYTICS_PRINT_MODE
227+
222228
# Now that we are done parsing arguments, construct the text of the BPF program
223229
try:
224230
with open(base_dir + 'bpf/estat/' + program + '.c', 'r') as prog_file:
@@ -443,7 +449,7 @@ class Args:
443449
probe_type + "'")
444450

445451
if args.lat_hist or args.size_hist or args.summary:
446-
helper1 = BCCHelper(b, BCCHelper.ESTAT_PRINT_MODE)
452+
helper1 = BCCHelper(b, output_mode)
447453
helper1.add_key_type("name")
448454
helper1.add_key_type("axis")
449455

@@ -465,23 +471,24 @@ class Args:
465471
"bytes")
466472

467473
if args.latsize_hist:
468-
helper2 = BCCHelper(b, BCCHelper.ESTAT_PRINT_MODE)
474+
helper2 = BCCHelper(b, output_mode)
469475
helper2.add_aggregation("latsq", BCCHelper.LL_HISTOGRAM_AGGREGATION,
470476
"microseconds")
471477
helper2.add_key_type("size")
472478
helper2.add_key_type("name")
473479
helper2.add_key_type("axis")
474480

475481
if args.summary and args.total:
476-
helper3 = BCCHelper(b, BCCHelper.ESTAT_PRINT_MODE)
482+
helper3 = BCCHelper(b, output_mode)
477483
helper3.add_aggregation("opst", BCCHelper.COUNT_AGGREGATION, "iops(/s)")
478484
helper3.add_aggregation("datat", BCCHelper.SUM_AGGREGATION,
479485
"throughput(k/s)")
480486
helper3.add_key_type("name")
481487

482488
# Need real time;
483-
print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) # TODO deduplicate this line
484-
print(" Tracing enabled... Hit Ctrl-C to end.")
489+
if not args.json:
490+
print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) # TODO deduplicate line
491+
print(" Tracing enabled... Hit Ctrl-C to end.")
485492

486493
# output
487494
if monitor:
@@ -508,7 +515,8 @@ class Args:
508515
helper1.printall(clear_data)
509516
if args.summary and args.total:
510517
helper3.printall(clear_data)
511-
print("%-16s\n" % strftime("%D - %H:%M:%S %Z"))
518+
if not args.json:
519+
print("%-16s\n" % strftime("%D - %H:%M:%S %Z"))
512520
except Exception as e:
513521
die(e)
514522
else:

debian/rules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,6 @@ override_dh_auto_install:
2222
dh_install build/cmd/* /usr/bin
2323
dh_install lib/* /usr/share/performance-diagnostics/lib
2424
dh_install bpf/* /usr/share/performance-diagnostics/bpf
25+
dh_install telegraf/delphix-telegraf-service telegraf/perf_playbook /usr/bin
26+
dh_install telegraf/delphix-telegraf.service /lib/systemd/system
27+
dh_install telegraf/telegraf* telegraf/*.sh /etc/telegraf

telegraf/delphix-telegraf-service

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/bin/bash
#
# Startup script for the "delphix-telegraf" service.
#
# Reassembles the Telegraf configuration file from its component sections
# and then runs the Telegraf agent in the foreground:
#
#   - the Performance Playbook inputs, only if the PLAYBOOK_ENABLED flag
#     file exists;
#   - the object storage (DOSE) inputs, only if "zdb -C" reports a vdev of
#     type 'object_store';
#   - the base configuration, always.
#
# The sections are concatenated in that order.
#

BASE_CONFIG=/etc/telegraf/telegraf.base
DOSE_INPUTS=/etc/telegraf/telegraf.inputs.dose
PLAYBOOK_INPUTS=/etc/telegraf/telegraf.inputs.playbook
PLAYBOOK_FLAG=/etc/telegraf/PLAYBOOK_ENABLED
TELEGRAF_CONFIG=/etc/telegraf/telegraf.conf

#
# Succeeds (returns 0) when the cached pool configuration printed by
# "zdb -C" contains an object_store vdev. The function's status is that of
# the grep at the end of the pipeline.
#
function engine_is_object_based() {
    zdb -C | grep -q "type: 'object_store'"
}

#
# Succeeds (returns 0) when the Performance Playbook flag file exists.
#
function playbook_is_enabled() {
    [[ -f "$PLAYBOOK_FLAG" ]]
}

#
# Collect the configuration sections to combine. Building a list keeps the
# section order in one place instead of repeating it for every combination
# of the two conditions.
#
config_sections=()
if playbook_is_enabled; then
    config_sections+=("$PLAYBOOK_INPUTS")
fi
if engine_is_object_based; then
    config_sections+=("$DOSE_INPUTS")
fi
config_sections+=("$BASE_CONFIG")

rm -f -- "$TELEGRAF_CONFIG"

#
# Keep going if a section could not be read (whatever was readable has
# still been written), but say so, so the problem shows up in the service
# log.
#
if ! cat -- "${config_sections[@]}" >"$TELEGRAF_CONFIG"; then
    echo "$(basename "$0"): warning: could not assemble all of" \
        "${config_sections[*]} into $TELEGRAF_CONFIG" >&2
fi

#
# Replace this shell with telegraf rather than running it as a child, so
# that the service's main PID is telegraf itself. The unit file's
# "ExecReload=/bin/kill -HUP $MAINPID" would otherwise deliver the signal
# to this wrapper shell instead of to the agent.
#
exec /usr/bin/telegraf -config "$TELEGRAF_CONFIG"

telegraf/delphix-telegraf.service

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
[Unit]
2+
Description=Delphix Telegraf Metric Collection Agent
3+
Documentation=https://github.com/influxdata/telegraf
4+
PartOf=delphix.target
5+
After=delphix-platform.service
6+
PartOf=delphix-platform.service
7+
8+
[Service]
9+
EnvironmentFile=-/etc/default/telegraf
10+
User=root
11+
ExecStart=/usr/bin/delphix-telegraf-service
12+
ExecReload=/bin/kill -HUP $MAINPID
13+
Restart=on-failure
14+
RestartForceExitStatus=SIGPIPE
15+
KillMode=control-group
16+
17+
[Install]
18+
WantedBy=delphix.target

telegraf/nfs-threads.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/sh
#
# Wrapper around "nfs_threads" that drops every output line containing
# "thr" (presumably the column-header lines -- confirm against the
# nfs_threads output format) so that only data rows are passed on for
# parsing.
#
# Plain "grep" is used instead of "egrep": the pattern is a fixed string
# needing no extended-regex syntax, and "egrep" is obsolescent (recent GNU
# grep prints a warning on stderr each time it is invoked).
# --line-buffered makes each row available as soon as it is produced.
#
nfs_threads | grep --line-buffered -v "thr"
3+

telegraf/perf_playbook

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#!/bin/bash
#
# Copyright (c) 2021 by Delphix. All rights reserved.
#
# Script that enables and disables the Performance Playbook configuration for
# metric collection by Telegraf.
#
# Usage: perf_playbook enable|disable
#
# Enabling creates the PLAYBOOK_ENABLED flag file and disabling removes it;
# in both cases the delphix-telegraf service is then restarted.
#
# Exit status: 1 on a fatal error (not root, flag file could not be
# changed), 2 on a usage error, otherwise the status of
# "systemctl restart delphix-telegraf".
#

PLAYBOOK_FLAG=/etc/telegraf/PLAYBOOK_ENABLED

#
# Print a timestamped error message to stderr and exit with status 1.
#
function die() {
    echo -e "$(date +%T:%N:%z): $(basename "$0"): $*" >&2
    exit 1
}

#
# Print an optional error message followed by the usage summary, both to
# stderr, and exit with status 2.
#
function usage() {
    if [[ $# -gt 0 ]]; then
        echo "$(basename "$0"): $*" >&2
    fi
    # The argument is mandatory, so it is not shown in [brackets].
    echo "Usage: $(basename "$0") enable|disable" >&2
    exit 2
}

#
# Create the flag file and restart the service. The restart is skipped if
# the flag file could not be created, since nothing would have changed.
#
function enable_playbook() {
    date
    echo "Enabling Performance Playbook Metrics"
    touch -- "$PLAYBOOK_FLAG" || die "unable to create $PLAYBOOK_FLAG"
    systemctl restart delphix-telegraf
}

#
# Remove the flag file and restart the service. "rm -f" (not "-rf") is
# used because the flag is a single regular file; a missing flag file is
# not an error.
#
function disable_playbook() {
    date
    echo "Disabling Performance Playbook Metrics"
    rm -f -- "$PLAYBOOK_FLAG" || die "unable to remove $PLAYBOOK_FLAG"
    systemctl restart delphix-telegraf
}

#
# Make sure this can only be run as root.
#
[[ $EUID -ne 0 ]] && die "must be run as root"

#
# Process command.
#
if [[ $# -ne 1 ]]; then
    usage "expected exactly one argument"
fi

case "$1" in
enable) enable_playbook ;;
disable) disable_playbook ;;
*) usage "unknown command: $1" ;;
esac

telegraf/telegraf.base

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
# Telegraf Configuration
2+
#
3+
# Configuration for telegraf agent
4+
[agent]
5+
interval = "10s"
6+
round_interval = true
7+
flush_interval = "10s"
8+
metric_batch_size = 1000
9+
metric_buffer_limit = 10000
10+
11+
###############################################################################
12+
# OUTPUT PLUGINS #
13+
###############################################################################
14+
# Define the main metric output file, excluding aggregated stats and
15+
# Performance Playbook (estat) data.
16+
[[outputs.file]]
17+
files = ["/var/log/telegraf/metrics.json"]
18+
rotation_max_size = "50MB"
19+
rotation_max_archives = 9
20+
data_format = "json"
21+
namedrop = ["*estat_*", "agg_*", "zfs", "zpool*", "zcache*"]
22+
23+
# Define output file for ZFS related metrics
24+
[[outputs.file]]
25+
files = ["/var/log/telegraf/metrics_zfs.json"]
26+
rotation_max_size = "30MB"
27+
rotation_max_archives = 5
28+
data_format = "json"
29+
namepass = ["zpool*", "zcache*", "zfs"]
30+
31+
# Define output file for Performance Playbook (estat) metrics
32+
[[outputs.file]]
33+
files = ["/var/log/telegraf/metrics_estat.json"]
34+
rotation_max_size = "30MB"
35+
rotation_max_archives = 5
36+
data_format = "json"
37+
namepass = ["*estat_*"]
38+
39+
# Define output file for aggregate statistics
40+
[[outputs.file]]
41+
files = ["/var/log/telegraf/metric_aggregates.json"]
42+
rotation_max_size = "30MB"
43+
rotation_max_archives = 5
44+
data_format = "json"
45+
namepass = ["agg_*"]
46+
47+
# Enable Live Monitoring, intended for internal use:
48+
#[[outputs.influxdb]]
49+
# urls = ["http://dbsvr.company.com:8086"]
50+
# database = "live_metrics"
51+
# skip_database_creation = true
52+
# data_format = "influx"
53+
54+
###############################################################################
55+
# INPUT PLUGINS #
56+
###############################################################################
57+
58+
# Get CPU usage
59+
[[inputs.cpu]]
60+
percpu = true
61+
totalcpu = true
62+
collect_cpu_time = false
63+
report_active = false
64+
fieldpass = ["usage*"]
65+
66+
# Get mount point stats
67+
[[inputs.disk]]
68+
mount_points = ["/","/domain0"]
69+
70+
# Get disk I/O stats
71+
[[inputs.diskio]]
72+
73+
# Track stats for the current metric files
74+
[[inputs.filestat]]
75+
files = ["/var/log/telegraf/metrics.json",
76+
"/var/log/telegraf/metrics_estat.json",
77+
"/var/log/telegraf/metrics_zfs.json",
78+
"/var/log/telegraf/metric_aggregates.json"]
79+
80+
# Get Memory stats
81+
[[inputs.mem]]
82+
83+
# Get some network interface stats
84+
[[inputs.net]]
85+
fieldpass = ["tcp*","bytes*","packets*","err*","drop*"]
86+
87+
# Track CPU and Memory for the "delphix-mgmt" service (and children).
88+
[[inputs.procstat]]
89+
systemd_unit = "delphix-mgmt.service"
90+
include_systemd_children = true
91+
namedrop = ["procstat_lookup"]
92+
fieldpass = ["memory_usage", "cpu_usage", "memory_rss"]
93+
94+
# Track CPU and Memory for the "zfs-object-agent" service (and children).
95+
[[inputs.procstat]]
96+
systemd_unit = "zfs-object-agent.service"
97+
include_systemd_children = true
98+
namedrop = ["procstat_lookup"]
99+
fieldpass = ["memory_usage", "cpu_usage", "memory_rss"]
100+
101+
# Get process counts
102+
[[inputs.processes]]
103+
104+
# Get swap memory usage
105+
[[inputs.swap]]
106+
107+
# Get misc 'other' stats (load and uptime)
108+
[[inputs.system]]
109+
110+
# ZFS kstats (arcstat, abdstat, zfetch, etc)
111+
[[inputs.zfs]]
112+
interval = "1m"
113+
114+
# Detailed ZFS pool metrics from "zpool_influxdb" (noisy)
115+
#[[inputs.exec]]
116+
# commands = ["/usr/lib/x86_64-linux-gnu/zfs/zpool_influxdb"]
117+
# data_format = "influx"
118+
119+
###############################################################################
120+
# AGGREGATION PLUGINS #
121+
###############################################################################
122+
# Filtered aggregate statistics
123+
# Calculate Min, Max, Mean, Std Deviation every hour for selected metrics:
124+
# CPU usage (e.g. %idle), disk, diskio, mem, net, processes, system and swap
125+
[[aggregators.basicstats]]
126+
period = "1h"
127+
drop_original = false
128+
stats = ["min", "max", "mean", "stdev"]
129+
name_prefix = "agg_"
130+
namepass = ["cpu","disk","diskio","mem","net","processes","system","swap"]
131+

0 commit comments

Comments
 (0)