Skip to content

Commit 92b0686

Browse files
authored
Merge pull request #81 from scottmdlpx/master
CP-8403 Adding Telegraf-based metric collection.
2 parents e20d662 + 18023e7 commit 92b0686

11 files changed

+459
-8
lines changed

cmd/estat.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ def die(*args, **kwargs):
8989
-q/-Q enable/disable latency histograms by size (default: off)
9090
-y/-Y enable/disable the summary output (default: on)
9191
-t/-T enable/disable emitting the summary total (default: on)
92+
-j set output mode to JSON
9293
-d LEVEL set BCC debug level
9394
-e emit the resulting eBPF script without executing it
9495
@@ -111,7 +112,6 @@ def die(*args, **kwargs):
111112
particular the time spent allocating a block and time spent waiting for
112113
the write I/O to complete. If POOL is not specified, defaults to tracing
113114
the pool 'domain0'.
114-
115115
"""
116116

117117

@@ -149,6 +149,7 @@ def usage(msg):
149149
script_arg = None
150150
debug_level = 0
151151
dump_bpf = False
152+
output_mode = BCCHelper.ESTAT_PRINT_MODE
152153

153154

154155
class Args:
@@ -161,6 +162,7 @@ class Args:
161162
setattr(args, "latsize_hist", False)
162163
setattr(args, "summary", True)
163164
setattr(args, "total", True)
165+
setattr(args, "json", False)
164166

165167
#
166168
# We use getopt rather than argparse because it is very difficult to get
@@ -170,7 +172,7 @@ class Args:
170172
# arguments.
171173
#
172174
try:
173-
opts, rem_args = getopt.getopt(sys.argv[2:], "hmMa:lLzZqQyYnNtTd:e")
175+
opts, rem_args = getopt.getopt(sys.argv[2:], "hmMa:lLjzZqQyYnNtTd:e")
174176
except getopt.GetoptError as err:
175177
die(err)
176178

@@ -194,6 +196,7 @@ class Args:
194196
dump_bpf = True
195197
else:
196198
switches = {'-l': "lat_hist",
199+
'-j': "json",
197200
'-z': "size_hist",
198201
'-q': "latsize_hist",
199202
'-y': "summary",
@@ -219,6 +222,9 @@ class Args:
219222
if not (args.lat_hist or args.size_hist or args.latsize_hist):
220223
args.lat_hist = True
221224

225+
if args.json:
226+
output_mode = BCCHelper.ANALYTICS_PRINT_MODE
227+
222228
# Now that we are done parsing arguments, construct the text of the BPF program
223229
try:
224230
with open(base_dir + 'bpf/estat/' + program + '.c', 'r') as prog_file:
@@ -443,7 +449,7 @@ class Args:
443449
probe_type + "'")
444450

445451
if args.lat_hist or args.size_hist or args.summary:
446-
helper1 = BCCHelper(b, BCCHelper.ESTAT_PRINT_MODE)
452+
helper1 = BCCHelper(b, output_mode)
447453
helper1.add_key_type("name")
448454
helper1.add_key_type("axis")
449455

@@ -465,23 +471,24 @@ class Args:
465471
"bytes")
466472

467473
if args.latsize_hist:
468-
helper2 = BCCHelper(b, BCCHelper.ESTAT_PRINT_MODE)
474+
helper2 = BCCHelper(b, output_mode)
469475
helper2.add_aggregation("latsq", BCCHelper.LL_HISTOGRAM_AGGREGATION,
470476
"microseconds")
471477
helper2.add_key_type("size")
472478
helper2.add_key_type("name")
473479
helper2.add_key_type("axis")
474480

475481
if args.summary and args.total:
476-
helper3 = BCCHelper(b, BCCHelper.ESTAT_PRINT_MODE)
482+
helper3 = BCCHelper(b, output_mode)
477483
helper3.add_aggregation("opst", BCCHelper.COUNT_AGGREGATION, "iops(/s)")
478484
helper3.add_aggregation("datat", BCCHelper.SUM_AGGREGATION,
479485
"throughput(k/s)")
480486
helper3.add_key_type("name")
481487

482488
# Need real time;
483-
print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) # TODO deduplicate this line
484-
print(" Tracing enabled... Hit Ctrl-C to end.")
489+
if not args.json:
490+
print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) # TODO deduplicate line
491+
print(" Tracing enabled... Hit Ctrl-C to end.")
485492

486493
# output
487494
if monitor:
@@ -508,7 +515,8 @@ class Args:
508515
helper1.printall(clear_data)
509516
if args.summary and args.total:
510517
helper3.printall(clear_data)
511-
print("%-16s\n" % strftime("%D - %H:%M:%S %Z"))
518+
if not args.json:
519+
print("%-16s\n" % strftime("%D - %H:%M:%S %Z"))
512520
except Exception as e:
513521
die(e)
514522
else:

debian/rules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,6 @@ override_dh_auto_install:
2222
dh_install build/cmd/* /usr/bin
2323
dh_install lib/* /usr/share/performance-diagnostics/lib
2424
dh_install bpf/* /usr/share/performance-diagnostics/bpf
25+
dh_install telegraf/delphix-telegraf-service telegraf/perf_playbook /usr/bin
26+
dh_install telegraf/delphix-telegraf.service /lib/systemd/system
27+
dh_install telegraf/telegraf* telegraf/*.sh /etc/telegraf

telegraf/delphix-telegraf-service

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/bin/bash
#
# Assemble the Telegraf configuration from the installed fragments and then
# run Telegraf with it.
#
# Fragments are concatenated into $TELEGRAF_CONFIG in this order:
#   $PLAYBOOK_INPUTS  - only when the $PLAYBOOK_FLAG file exists
#   $DOSE_INPUTS      - only when "zdb -C" reports an object_store entry
#   $BASE_CONFIG      - always
#

BASE_CONFIG=/etc/telegraf/telegraf.base
DOSE_INPUTS=/etc/telegraf/telegraf.inputs.dose
PLAYBOOK_INPUTS=/etc/telegraf/telegraf.inputs.playbook
PLAYBOOK_FLAG=/etc/telegraf/PLAYBOOK_ENABLED
TELEGRAF_CONFIG=/etc/telegraf/telegraf.conf

#
# Succeeds when the output of "zdb -C" contains a line matching
# type: 'object_store'. The function's status is grep's exit status.
#
function engine_is_object_based() {
	zdb -C | grep -q "type: 'object_store'"
}

#
# Succeeds when the Performance Playbook flag file exists.
#
function playbook_is_enabled() {
	[[ -f "$PLAYBOOK_FLAG" ]]
}

#
# Collect the optional fragments first; the base configuration always comes
# last, matching the original concatenation order.
#
config_parts=()
if playbook_is_enabled; then
	config_parts+=("$PLAYBOOK_INPUTS")
fi
if engine_is_object_based; then
	config_parts+=("$DOSE_INPUTS")
fi
config_parts+=("$BASE_CONFIG")

# Regenerate the configuration from scratch on every start.
rm -f "$TELEGRAF_CONFIG"
cat "${config_parts[@]}" > "$TELEGRAF_CONFIG"

#
# Replace this shell with Telegraf instead of running it as a child, so the
# service's main PID is Telegraf itself. delphix-telegraf.service reloads with
# "kill -HUP $MAINPID"; without exec that signal would be delivered to this
# wrapper shell rather than to Telegraf.
#
exec /usr/bin/telegraf -config "$TELEGRAF_CONFIG"

telegraf/delphix-telegraf.service

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
[Unit]
2+
Description=Delphix Telegraf Metric Collection Agent
3+
Documentation=https://github.com/influxdata/telegraf
4+
PartOf=delphix.target
5+
After=delphix-platform.service
6+
PartOf=delphix-platform.service
7+
8+
[Service]
9+
EnvironmentFile=-/etc/default/telegraf
10+
User=root
11+
ExecStart=/usr/bin/delphix-telegraf-service
12+
ExecReload=/bin/kill -HUP $MAINPID
13+
Restart=on-failure
14+
RestartForceExitStatus=SIGPIPE
15+
KillMode=control-group
16+
17+
[Install]
18+
WantedBy=delphix.target

telegraf/nfs-threads.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/sh
2+
# Stream nfs_threads output, dropping every line that contains "thr"
# (presumably the column-header line -- confirm against nfs_threads output).
# Plain grep is used because egrep is an obsolescent alias that prints a
# warning on GNU grep 3.8 and later, and "thr" needs no extended-regex syntax.
nfs_threads | grep --line-buffered -v "thr"
3+

telegraf/perf_playbook

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#!/bin/bash
#
# Copyright (c) 2021 by Delphix. All rights reserved.
#
# Script that enables and disables the Performance Playbook configuration for
# metric collection by Telegraf
#

PLAYBOOK_FLAG=/etc/telegraf/PLAYBOOK_ENABLED

#
# Print a timestamped error message to stderr and exit with status 1.
#
function die() {
	echo -e "$(date +%T:%N:%z): $(basename "$0"): $*" >&2
	exit 1
}

#
# Print an optional error message followed by the usage line, both to stderr,
# and exit with status 2.
#
function usage() {
	if [[ $# -gt 0 ]]; then
		echo "$(basename "$0"): $*" >&2
	fi
	echo "Usage: $(basename "$0") <enable|disable>" >&2
	exit 2
}

#
# Create the flag file and restart the Telegraf service so that it is
# started again with the flag present.
#
function enable_playbook() {
	date
	echo "Enabling Performance Playbook Metrics"
	# Do not restart the service if the flag could not be created.
	touch "$PLAYBOOK_FLAG" || die "failed to create $PLAYBOOK_FLAG"
	systemctl restart delphix-telegraf
}

#
# Remove the flag file and restart the Telegraf service so that it is
# started again with the flag absent.
#
function disable_playbook() {
	date
	echo "Disabling Performance Playbook Metrics"
	# The flag is a regular file, so no recursive removal is needed;
	# "rm -f" still succeeds when the flag is already absent.
	rm -f "$PLAYBOOK_FLAG" || die "failed to remove $PLAYBOOK_FLAG"
	systemctl restart delphix-telegraf
}

#
# Make sure this can only be run as root.
#
[[ $EUID -ne 0 ]] && die "must be run as root"

#
# Process command.
#
if [[ $# -ne 1 ]]; then
	usage "expected exactly one argument"
fi

case "$1" in
enable) enable_playbook ;;
disable) disable_playbook ;;
*) usage "unknown command '$1'" ;;
esac

telegraf/telegraf.base

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
# Telegraf Configuration
2+
#
3+
# Configuration for telegraf agent
4+
[agent]
5+
interval = "10s"
6+
round_interval = true
7+
flush_interval = "10s"
8+
metric_batch_size = 1000
9+
metric_buffer_limit = 10000
10+
11+
###############################################################################
12+
# OUTPUT PLUGINS #
13+
###############################################################################
14+
# Define the main metric output file, excluding aggregated stats and
15+
# Performance Playbook (estat) data, as well as ZFS (zfs, zpool*, zcache*) metrics.
16+
[[outputs.file]]
17+
files = ["/var/log/telegraf/metrics.json"]
18+
rotation_max_size = "50MB"
19+
rotation_max_archives = 9
20+
data_format = "json"
21+
namedrop = ["*estat_*", "agg_*", "zfs", "zpool*", "zcache*"]
22+
23+
# Define output file for ZFS related metrics
24+
[[outputs.file]]
25+
files = ["/var/log/telegraf/metrics_zfs.json"]
26+
rotation_max_size = "30MB"
27+
rotation_max_archives = 5
28+
data_format = "json"
29+
namepass = ["zpool*", "zcache*", "zfs"]
30+
31+
# Define output file for Performance Playbook (estat) metrics
32+
[[outputs.file]]
33+
files = ["/var/log/telegraf/metrics_estat.json"]
34+
rotation_max_size = "30MB"
35+
rotation_max_archives = 5
36+
data_format = "json"
37+
namepass = ["*estat_*"]
38+
39+
# Define output file for aggregate statistics
40+
[[outputs.file]]
41+
files = ["/var/log/telegraf/metric_aggregates.json"]
42+
rotation_max_size = "30MB"
43+
rotation_max_archives = 5
44+
data_format = "json"
45+
namepass = ["agg_*"]
46+
47+
# Enable Live Monitoring, intended for internal use:
48+
#[[outputs.influxdb]]
49+
# urls = ["http://dbsvr.company.com:8086"]
50+
# database = "live_metrics"
51+
# skip_database_creation = true
52+
# data_format = "influx"
53+
54+
###############################################################################
55+
# INPUT PLUGINS #
56+
###############################################################################
57+
58+
# Get CPU usage
59+
[[inputs.cpu]]
60+
percpu = true
61+
totalcpu = true
62+
collect_cpu_time = false
63+
report_active = false
64+
fieldpass = ["usage*"]
65+
66+
# Get mount point stats
67+
[[inputs.disk]]
68+
mount_points = ["/","/domain0"]
69+
70+
# Get disk I/O stats
71+
[[inputs.diskio]]
72+
73+
# Track stats for the current metric files
74+
[[inputs.filestat]]
75+
files = ["/var/log/telegraf/metrics.json",
76+
"/var/log/telegraf/metrics_estat.json",
77+
"/var/log/telegraf/metrics_zfs.json",
78+
"/var/log/telegraf/metric_aggregates.json"]
79+
80+
# Get Memory stats
81+
[[inputs.mem]]
82+
83+
# Get some network interface stats
84+
[[inputs.net]]
85+
fieldpass = ["tcp*","bytes*","packets*","err*","drop*"]
86+
87+
# Track CPU and Memory for the "delphix-mgmt" service (and children).
88+
[[inputs.procstat]]
89+
systemd_unit = "delphix-mgmt.service"
90+
include_systemd_children = true
91+
namedrop = ["procstat_lookup"]
92+
fieldpass = ["memory_usage", "cpu_usage", "memory_rss"]
93+
94+
# Track CPU and Memory for the "zfs-object-agent" service (and children).
95+
[[inputs.procstat]]
96+
systemd_unit = "zfs-object-agent.service"
97+
include_systemd_children = true
98+
namedrop = ["procstat_lookup"]
99+
fieldpass = ["memory_usage", "cpu_usage", "memory_rss"]
100+
101+
# Get process counts
102+
[[inputs.processes]]
103+
104+
# Get swap memory usage
105+
[[inputs.swap]]
106+
107+
# Get misc 'other' stats (load and uptime)
108+
[[inputs.system]]
109+
110+
# ZFS kstats (arcstat, abdstat, zfetch, etc)
111+
[[inputs.zfs]]
112+
interval = "1m"
113+
114+
# Detailed ZFS pool metrics from "zpool_influxdb" (noisy)
115+
#[[inputs.exec]]
116+
# commands = ["/usr/lib/x86_64-linux-gnu/zfs/zpool_influxdb"]
117+
# data_format = "influx"
118+
119+
###############################################################################
120+
# AGGREGATION PLUGINS #
121+
###############################################################################
122+
# Filtered aggregate statistics
123+
# Calculate Min, Max, Mean, Std Deviation every hour for selected metrics:
124+
# cpu, disk, diskio, mem, net, processes, system and swap (see namepass below)
125+
[[aggregators.basicstats]]
126+
period = "1h"
127+
drop_original = false
128+
stats = ["min", "max", "mean", "stdev"]
129+
name_prefix = "agg_"
130+
namepass = ["cpu","disk","diskio","mem","net","processes","system","swap"]
131+

0 commit comments

Comments
 (0)