diff --git a/bpf/estat/backend-io.c b/bpf/estat/backend-io.c index 3ddf679..12f6797 100644 --- a/bpf/estat/backend-io.c +++ b/bpf/estat/backend-io.c @@ -29,7 +29,6 @@ typedef struct { BPF_HASH(io_base_data, u64, io_data_t); -// @@ kprobe|blk_start_request|disk_io_start // @@ kprobe|blk_mq_start_request|disk_io_start int disk_io_start(struct pt_regs *ctx, struct request *reqp) @@ -44,7 +43,7 @@ disk_io_start(struct pt_regs *ctx, struct request *reqp) return (0); } -// @@ kprobe|blk_account_io_completion|disk_io_done +// @@ kprobe|blk_account_io_done|disk_io_done int disk_io_done(struct pt_regs *ctx, struct request *reqp) { diff --git a/bpf/estat/zvol.c b/bpf/estat/zvol.c index 23276e4..3dc170e 100644 --- a/bpf/estat/zvol.c +++ b/bpf/estat/zvol.c @@ -31,7 +31,6 @@ #define POOL (OPTARG) #endif - // Structure to hold thread local data typedef struct { u64 ts; diff --git a/bpf/stbtrace/io.st b/bpf/stbtrace/io.st index 266957a..27549d0 100755 --- a/bpf/stbtrace/io.st +++ b/bpf/stbtrace/io.st @@ -128,7 +128,7 @@ b = BPF(text=bpf_text) if BPF.get_kprobe_functions(b'blk_start_request'): b.attach_kprobe(event="blk_start_request", fn_name="disk_io_start") b.attach_kprobe(event="blk_mq_start_request", fn_name="disk_io_start") -b.attach_kprobe(event="blk_account_io_completion", fn_name="disk_io_done") +b.attach_kprobe(event="blk_account_io_done", fn_name="disk_io_done") helper = BCCHelper(b, BCCHelper.ANALYTICS_PRINT_MODE) diff --git a/bpf/stbtrace/iscsi.st b/bpf/stbtrace/iscsi.st index bb17237..d4a758f 100755 --- a/bpf/stbtrace/iscsi.st +++ b/bpf/stbtrace/iscsi.st @@ -38,8 +38,8 @@ bpf_text += """ #define OP_NAME_LEN 6 typedef struct { u64 ts; - u64 flags; u64 size; + u32 direction; } iscsi_data_t; // Key structure for scalar aggegations maps @@ -52,7 +52,8 @@ typedef struct { HIST_KEY(iscsi_hist_key_t, iscsi_key_t); -BPF_HASH(iscsi_base_data, u64, iscsi_data_t); +BPF_HASH(iscsi_start_ts, u64, u64); +BPF_HASH(iscsi_base_data, u32, iscsi_data_t); $maps:{map| 
BPF_HASH($map.name$, iscsi_key_t, $map.type$); }$ @@ -64,13 +65,31 @@ BPF_HASH($hist.name$, iscsi_hist_key_t, u64); int iscsi_target_start(struct pt_regs *ctx, struct iscsi_conn *conn, struct iscsi_cmd *cmd, struct iscsi_scsi_req *hdr) { - iscsi_data_t data = {}; - data.ts = bpf_ktime_get_ns(); - data.flags = hdr->flags; - data.size = hdr->data_length; - iscsi_base_data.update((u64 *) &cmd, &data); + u64 ts = bpf_ktime_get_ns(); + iscsi_start_ts.update((u64 *) &cmd, &ts); - return 0; + return (0); +} + +int iscsi_target_response(struct pt_regs *ctx, struct iscsi_conn *conn, + struct iscsi_cmd *cmd, int state) +{ + u32 tid = bpf_get_current_pid_tgid(); + iscsi_data_t data = {}; + + u64 *tsp = iscsi_start_ts.lookup((u64 *) &cmd); + if (tsp == 0) { + return (0); // missed issue + } + + data.ts = *tsp; + data.size = cmd->se_cmd.data_length; + data.direction = cmd->data_direction; + + iscsi_base_data.update(&tid, &data); + iscsi_start_ts.delete((u64 *) &cmd); + + return (0); } static int aggregate_data(iscsi_data_t *data, u64 ts, char *opstr) @@ -99,33 +118,31 @@ static int aggregate_data(iscsi_data_t *data, u64 ts, char *opstr) return 0; } -int iscsi_target_end(struct pt_regs *ctx, struct iscsi_cmd *cmd) +int iscsi_target_end(struct pt_regs *ctx) { - u64 ts = bpf_ktime_get_ns(); - iscsi_data_t *data = iscsi_base_data.lookup((u64 *) &cmd); - u64 delta; - iscsi_key_t key = {}; - char *opstr; - - if (data == 0) { - return 0; // missed issue - } - - if (data->flags & ISCSI_FLAG_CMD_READ) { - aggregate_data(data, ts, READ_STR); - } else if (data->flags & ISCSI_FLAG_CMD_WRITE) { - aggregate_data(data, ts, WRITE_STR); - } - iscsi_base_data.delete((u64 *) &cmd); - - return 0; + u64 ts = bpf_ktime_get_ns(); + u32 tid = bpf_get_current_pid_tgid(); + iscsi_data_t *data = iscsi_base_data.lookup(&tid); + + if (data == 0) { + return (0); // missed issue + } + + if (data->direction == DMA_FROM_DEVICE) { + aggregate_data(data, ts, READ_STR); + } else if (data->direction == 
DMA_TO_DEVICE) { + aggregate_data(data, ts, WRITE_STR); + } + iscsi_base_data.delete(&tid); + + return (0); } - """ # noqa: W293 b = BPF(text=bpf_text) b.attach_kprobe(event="iscsit_process_scsi_cmd", fn_name="iscsi_target_start") -b.attach_kprobe(event="iscsit_build_rsp_pdu", fn_name="iscsi_target_end") +b.attach_kprobe(event="iscsit_response_queue", fn_name="iscsi_target_response") +b.attach_kretprobe(event="iscsit_response_queue", fn_name="iscsi_target_end") helper = BCCHelper(b, BCCHelper.ANALYTICS_PRINT_MODE) $maps:{map| diff --git a/bpf/stbtrace/zpl.st b/bpf/stbtrace/zpl.st index c10e918..1b219d2 100644 --- a/bpf/stbtrace/zpl.st +++ b/bpf/stbtrace/zpl.st @@ -28,7 +28,8 @@ bpf_text += """ #include #include -#include +#include +#include // Definitions for this script #define READ_STR "read" @@ -67,9 +68,9 @@ BPF_HASH($hist.name$, zpl_hist_key_t, u64); }$ // Probe functions to initialize thread local data -int zfs_read_start(struct pt_regs *ctx, void *inode, uio_t *uio) +int zfs_read_start(struct pt_regs *ctx, struct znode *zn, zfs_uio_t *uio, + int flags) { - u32 pid = bpf_get_current_pid_tgid(); zpl_data_t data = {}; data.ts = bpf_ktime_get_ns(); @@ -81,9 +82,9 @@ int zfs_read_start(struct pt_regs *ctx, void *inode, uio_t *uio) } // Probe functions to initialize thread local data -int zfs_write_start(struct pt_regs *ctx, void *inode, uio_t *uio) +int zfs_write_start(struct pt_regs *ctx, struct znode *zn, zfs_uio_t *uio, + int flags) { - u32 pid = bpf_get_current_pid_tgid(); zpl_data_t data = {}; data.ts = bpf_ktime_get_ns(); diff --git a/cmd/estat.py b/cmd/estat.py index ebbc6ab..47d8f00 100755 --- a/cmd/estat.py +++ b/cmd/estat.py @@ -89,6 +89,7 @@ def die(*args, **kwargs): -q/-Q enable/disable latency histograms by size (default: off) -y/-Y enable/disable the summary output (default: on) -t/-T enable/disable emitting the summary total (default: on) + -j set output mode to JSON -d LEVEL set BCC debug level -e emit the resulting eBPF script without executing 
it @@ -111,7 +112,6 @@ def die(*args, **kwargs): particular the time spent allocating a block and time spent waiting for the write I/O to complete. If POOL is not specified, defaults to tracing the pool 'domain0'. - """ @@ -149,6 +149,7 @@ def usage(msg): script_arg = None debug_level = 0 dump_bpf = False +output_mode = BCCHelper.ESTAT_PRINT_MODE class Args: @@ -161,6 +162,7 @@ class Args: setattr(args, "latsize_hist", False) setattr(args, "summary", True) setattr(args, "total", True) +setattr(args, "json", False) # # We use getopt rather than argparse because it is very difficult to get @@ -170,7 +172,7 @@ class Args: # arguments. # try: - opts, rem_args = getopt.getopt(sys.argv[2:], "hmMa:lLzZqQyYnNtTd:e") + opts, rem_args = getopt.getopt(sys.argv[2:], "hmMa:lLjzZqQyYnNtTd:e") except getopt.GetoptError as err: die(err) @@ -194,6 +196,7 @@ class Args: dump_bpf = True else: switches = {'-l': "lat_hist", + '-j': "json", '-z': "size_hist", '-q': "latsize_hist", '-y': "summary", @@ -219,6 +222,9 @@ class Args: if not (args.lat_hist or args.size_hist or args.latsize_hist): args.lat_hist = True +if args.json: + output_mode = BCCHelper.ANALYTICS_PRINT_MODE + # Now that we are done parsing arguments, construct the text of the BPF program try: with open(base_dir + 'bpf/estat/' + program + '.c', 'r') as prog_file: @@ -443,7 +449,7 @@ class Args: probe_type + "'") if args.lat_hist or args.size_hist or args.summary: - helper1 = BCCHelper(b, BCCHelper.ESTAT_PRINT_MODE) + helper1 = BCCHelper(b, output_mode) helper1.add_key_type("name") helper1.add_key_type("axis") @@ -465,7 +471,7 @@ class Args: "bytes") if args.latsize_hist: - helper2 = BCCHelper(b, BCCHelper.ESTAT_PRINT_MODE) + helper2 = BCCHelper(b, output_mode) helper2.add_aggregation("latsq", BCCHelper.LL_HISTOGRAM_AGGREGATION, "microseconds") helper2.add_key_type("size") @@ -473,15 +479,16 @@ class Args: helper2.add_key_type("axis") if args.summary and args.total: - helper3 = BCCHelper(b, BCCHelper.ESTAT_PRINT_MODE) + 
helper3 = BCCHelper(b, output_mode) helper3.add_aggregation("opst", BCCHelper.COUNT_AGGREGATION, "iops(/s)") helper3.add_aggregation("datat", BCCHelper.SUM_AGGREGATION, "throughput(k/s)") helper3.add_key_type("name") # Need real time; -print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) # TODO deduplicate this line -print(" Tracing enabled... Hit Ctrl-C to end.") +if not args.json: + print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) # TODO deduplicate line + print(" Tracing enabled... Hit Ctrl-C to end.") # output if monitor: @@ -508,7 +515,8 @@ class Args: helper1.printall(clear_data) if args.summary and args.total: helper3.printall(clear_data) - print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) + if not args.json: + print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) except Exception as e: die(e) else: diff --git a/debian/control b/debian/control index b9d7a06..fb11bac 100644 --- a/debian/control +++ b/debian/control @@ -13,6 +13,6 @@ Standards-Version: 4.1.2 Package: performance-diagnostics Architecture: any -Depends: python3-bcc, python3-minimal, python3-psutil +Depends: python3-bcc, python3-minimal, python3-psutil, telegraf Description: eBPF-based Performance Diagnostic Tools A collection of eBPF-based tools for diagnosing performance issues. 
diff --git a/debian/rules b/debian/rules index d1dbb9a..3f8b76c 100755 --- a/debian/rules +++ b/debian/rules @@ -22,3 +22,6 @@ override_dh_auto_install: dh_install build/cmd/* /usr/bin dh_install lib/* /usr/share/performance-diagnostics/lib dh_install bpf/* /usr/share/performance-diagnostics/bpf + dh_install telegraf/delphix-telegraf-service telegraf/perf_playbook /usr/bin + dh_install telegraf/delphix-telegraf.service /lib/systemd/system + dh_install telegraf/telegraf* telegraf/*.sh /etc/telegraf diff --git a/telegraf/delphix-telegraf-service b/telegraf/delphix-telegraf-service new file mode 100755 index 0000000..9b087e2 --- /dev/null +++ b/telegraf/delphix-telegraf-service @@ -0,0 +1,34 @@ +#!/bin/bash +BASE_CONFIG=/etc/telegraf/telegraf.base +DOSE_INPUTS=/etc/telegraf/telegraf.inputs.dose +PLAYBOOK_INPUTS=/etc/telegraf/telegraf.inputs.playbook +PLAYBOOK_FLAG=/etc/telegraf/PLAYBOOK_ENABLED +TELEGRAF_CONFIG=/etc/telegraf/telegraf.conf + + +function engine_is_object_based() { + zdb -C | grep "type: 'object_store'" >/dev/null + [[ "$?" 
== "0" ]] +} + +function playbook_is_enabled() { + [[ -f $PLAYBOOK_FLAG ]] +} + +rm -f $TELEGRAF_CONFIG + +if engine_is_object_based; then + if playbook_is_enabled; then + cat $PLAYBOOK_INPUTS $DOSE_INPUTS $BASE_CONFIG > $TELEGRAF_CONFIG + else + cat $DOSE_INPUTS $BASE_CONFIG > $TELEGRAF_CONFIG + fi +else + if playbook_is_enabled; then + cat $PLAYBOOK_INPUTS $BASE_CONFIG > $TELEGRAF_CONFIG + else + cat $BASE_CONFIG > $TELEGRAF_CONFIG + fi +fi + +/usr/bin/telegraf -config $TELEGRAF_CONFIG diff --git a/telegraf/delphix-telegraf.service b/telegraf/delphix-telegraf.service new file mode 100644 index 0000000..6c767e8 --- /dev/null +++ b/telegraf/delphix-telegraf.service @@ -0,0 +1,18 @@ +[Unit] +Description=Delphix Telegraf Metric Collection Agent +Documentation=https://github.com/influxdata/telegraf +PartOf=delphix.target +After=delphix-platform.service +PartOf=delphix-platform.service + +[Service] +EnvironmentFile=-/etc/default/telegraf +User=root +ExecStart=/usr/bin/delphix-telegraf-service +ExecReload=/bin/kill -HUP $MAINPID +Restart=on-failure +RestartForceExitStatus=SIGPIPE +KillMode=control-group + +[Install] +WantedBy=delphix.target diff --git a/telegraf/nfs-threads.sh b/telegraf/nfs-threads.sh new file mode 100755 index 0000000..cd530cf --- /dev/null +++ b/telegraf/nfs-threads.sh @@ -0,0 +1,3 @@ +#!/bin/sh +nfs_threads | egrep --line-buffered -v "thr" + diff --git a/telegraf/perf_playbook b/telegraf/perf_playbook new file mode 100755 index 0000000..8920963 --- /dev/null +++ b/telegraf/perf_playbook @@ -0,0 +1,53 @@ +#!/bin/bash +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# +# Script that enables and disables the Performance Playbook configuration for +# metric collection by Telegraf +# + +PLAYBOOK_FLAG=/etc/telegraf/PLAYBOOK_ENABLED + +# +# Make sure this can only be run as root. +# +function die() { + echo -e "$(date +%T:%N:%z): $(basename $0): $*" >&2 + exit 1 +} + +[[ $EUID -ne 0 ]] && die "must be run as root" + +# +# Process command. 
+# + +function usage() { + echo "$(basename $0): $*" >&2 + echo "Usage: $(basename $0) [enable|disable]" + exit 2 +} + +function enable_playbook() { + date + echo "Enabling Performance Playbook Metric Collection" + touch $PLAYBOOK_FLAG + systemctl restart delphix-telegraf +} + +function disable_playbook() { + date + echo "Disabling Performance Playbook Metric Collection" + rm -rf $PLAYBOOK_FLAG + systemctl restart delphix-telegraf +} + +if [[ $# -ne 1 ]]; then + usage +fi + +case "$1" in +enable) enable_playbook ;; +disable) disable_playbook ;; +*) usage ;; +esac diff --git a/telegraf/telegraf.base b/telegraf/telegraf.base new file mode 100644 index 0000000..a0a1920 --- /dev/null +++ b/telegraf/telegraf.base @@ -0,0 +1,131 @@ +# Telegraf Configuration +# +# Configuration for telegraf agent +[agent] + interval = "10s" + round_interval = true + flush_interval = "10s" + metric_batch_size = 1000 + metric_buffer_limit = 10000 + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### +# Define the main metric output file, excluding aggregated stats and +# Performance Playbook (estat) data. 
+[[outputs.file]] + files = ["/var/log/telegraf/metrics.json"] + rotation_max_size = "50MB" + rotation_max_archives = 9 + data_format = "json" + namedrop = ["*estat_*", "agg_*", "zfs", "zpool*", "zcache*"] + +# Define output file for ZFS related metrics +[[outputs.file]] + files = ["/var/log/telegraf/metrics_zfs.json"] + rotation_max_size = "30MB" + rotation_max_archives = 5 + data_format = "json" + namepass = ["zpool*", "zcache*", "zfs"] + +# Define output file for Performance Playbook (estat) metrics +[[outputs.file]] + files = ["/var/log/telegraf/metrics_estat.json"] + rotation_max_size = "30MB" + rotation_max_archives = 5 + data_format = "json" + namepass = ["*estat_*"] + +# Define output file for aggregate statistics +[[outputs.file]] + files = ["/var/log/telegraf/metric_aggregates.json"] + rotation_max_size = "30MB" + rotation_max_archives = 5 + data_format = "json" + namepass = ["agg_*"] + +# Enable Live Monitoring, intended for internal Delphix use only: +[[outputs.influxdb]] + urls = ["http://perfdb.dcol1.delphix.com:8086"] + database = "live_metrics" + skip_database_creation = true + data_format = "influx" + +############################################################################### +# INPUT PLUGINS # +############################################################################### + +# Get CPU usage +[[inputs.cpu]] + percpu = true + totalcpu = true + collect_cpu_time = false + report_active = false + fieldpass = ["usage*"] + +# Get mount point stats +[[inputs.disk]] + mount_points = ["/","/domain0"] + +# Get disk I/O stats +[[inputs.diskio]] + +# Track stats for the current metric files +[[inputs.filestat]] + files = ["/var/log/telegraf/metrics.json", + "/var/log/telegraf/metrics_estat.json", + "/var/log/telegraf/metrics_zfs.json", + "/var/log/telegraf/metric_aggregates.json"] + +# Get Memory stats +[[inputs.mem]] + +# Get some network interface stats +[[inputs.net]] + fieldpass = ["tcp*","bytes*","packets*","err*","drop*"] + +# Track CPU and Memory 
for the "delphix-mgmt" service (and children). +[[inputs.procstat]] + systemd_unit = "delphix-mgmt.service" + include_systemd_children = true + namedrop = ["procstat_lookup"] + fieldpass = ["memory_usage", "cpu_usage", "memory_rss"] + +# Track CPU and Memory for the "zfs-object-agent" service (and children). +[[inputs.procstat]] + systemd_unit = "zfs-object-agent.service" + include_systemd_children = true + namedrop = ["procstat_lookup"] + fieldpass = ["memory_usage", "cpu_usage", "memory_rss"] + +# Get process counts +[[inputs.processes]] + +# Get swap memory usage +[[inputs.swap]] + +# Get misc 'other' stats (load and uptime) +[[inputs.system]] + +# ZFS kstats (arcstat, abdstat, zfetch, etc) +[[inputs.zfs]] + interval = "1m" + +# Detailed ZFS pool metrics from "zpool_influxdb" (noisy) +#[[inputs.exec]] +# commands = ["/usr/lib/x86_64-linux-gnu/zfs/zpool_influxdb"] +# data_format = "influx" + +############################################################################### +# AGGREGATION PLUGINS # +############################################################################### +# Filtered aggregate statistics +# Calculate Min, Max, Mean, Std Deviation every hour for selected metrics: +# CPU Usage (%idle) +[[aggregators.basicstats]] + period = "1h" + drop_original = false + stats = ["min", "max", "mean", "stdev"] + name_prefix = "agg_" + namepass = ["cpu","disk","diskio","mem","net","processes","system","swap"] + diff --git a/telegraf/telegraf.inputs.dose b/telegraf/telegraf.inputs.dose new file mode 100644 index 0000000..408018e --- /dev/null +++ b/telegraf/telegraf.inputs.dose @@ -0,0 +1,48 @@ +####################### DOSE/zcache Metrics ################################ +[[inputs.execd]] + command = ["/etc/telegraf/zcache-stats.sh"] + name_override = "zcache_stats" + signal = "none" + restart_delay = "30s" + data_format = "csv" + csv_skip_columns = 1 + csv_column_names = ["cache_lookup_count","idx_access_pendch","idx_access_entry","idx_access_chunk", + 
"idx_access_disk","cache_hits_count","cache_hits_bytes","cache_hits_ratio", + "cache_insert_count","cache_insert_bytes","insert_source_read","insert_source_write", + "insert_source_specr","insert_drops_buffer","insert_drops_alloc","bufbytes_used_demand", + "bufbytes_used_spec","cache_other_evicts","cache_other_pending","alloc_alloc", + "alloc_avail","alloc_free_space","alloc_free_slabs"] + csv_column_types = ["int","int","int","int","int","int","int","int","int","int","int","int","int", + "int","int","int","int","int","int","int","int","int","int"] + csv_delimiter = "\t" + csv_trim_space = true + + +[[inputs.execd]] + command = ["/etc/telegraf/zpool-iostat-o.sh"] + name_override = "zpool_iostat-o" + signal = "none" + restart_delay = "30s" + data_format = "csv" + csv_column_names = ["pool","agent_io_op_read","agent_io_op_write","agent_io_tput_read", + "agent_io_tput_write","store_data_op_get","store_data_op_put","store_data_tput_get", + "store_data_tput_put","store_metadata_op_get","store_metadata_op_put", + "store_metadata_tput_get","store_metadata_tput_put","store_reclaim_op_get", + "store_reclaim_op_put","store_reclaim_tput_get","store_reclaim_tput_put","object_del"] + csv_column_types = ["string","int","int","int","int","int","int","int","int","int","int","int","int", + "int","int","int","int","int"] + csv_tag_columns = ["pool"] + csv_delimiter = " " + csv_trim_space = true + + +[[inputs.exec]] + interval = "1h" + commands = ["/usr/sbin/zcache hits --json"] + name_override = "zcache_hits" + data_format = "json" + json_string_fields = ["start_time"] + + +# End of DOSE/zcache section + diff --git a/telegraf/telegraf.inputs.playbook b/telegraf/telegraf.inputs.playbook new file mode 100644 index 0000000..5ed7e21 --- /dev/null +++ b/telegraf/telegraf.inputs.playbook @@ -0,0 +1,169 @@ +############################################################################## +# Performance Playbook (estat, nfs_threads) collection + +# Collect output from "estat nfs -jm 10" 
+[[inputs.execd]] + command = ["estat", "nfs", "-jm", "10"] + name_override = "estat_nfs" + signal = "none" + restart_delay = "30s" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "estat iscsi -jm 10" +[[inputs.execd]] + command = ["estat", "iscsi", "-jm", "10"] + name_override = "estat_iscsi" + signal = "none" + restart_delay = "30s" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "estat zpl -jm 10" +[[inputs.execd]] + command = ["estat", "zpl", "-jm", "10"] + name_override = "estat_zpl" + signal = "none" + restart_delay = "30s" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "estat backend-io -jm 10" +[[inputs.execd]] + command = ["estat", "backend-io", "-jm", "10"] + name_override = "estat_backend-io" + signal = "none" + restart_delay = "30s" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "estat zvol -jm 10" +[[inputs.execd]] + command = ["estat", "zvol", "-jm", "10"] + name_override = "estat_zvol" + signal = "none" + restart_delay = "30s" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "estat zio-queue -jm 10" +[[inputs.execd]] + command = ["estat", "zio-queue", "-jm", "10"] + name_override = "estat_zio-queue" + signal = "none" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", 
"throughput(k/s)", "microseconds"] + +# Collect output from "estat zio -jm 10" +[[inputs.execd]] + command = ["estat", "zio", "-jm", "10"] + name_override = "estat_zio" + signal = "none" + restart_delay = "30s" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "estat metaslab-alloc -jm 10" +[[inputs.execd]] + command = ["estat", "metaslab-alloc", "-jm", "10"] + name_override = "estat_metaslab-alloc" + signal = "none" + restart_delay = "30s" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "nfs_threads" +[[inputs.execd]] + command = ["/etc/telegraf/nfs-threads.sh"] + name_override = "nfs_threads" + signal = "none" + restart_delay = "30s" + data_format = "csv" + csv_skip_columns = 2 + csv_column_names = ["packets","sockets","woken","used","metadata","riops","rtput","wiops","wtput"] + csv_column_types = ["int", "int", "int", "int", "int", "float","string","float","string"] + csv_delimiter = " " + csv_trim_space = true + +# End of Playbook section +############################################################################## + +############################################################################### +# PROCESSOR PLUGINS # +############################################################################### +# Convert strings from estat into integer values so they don't get dropped +[[processors.converter]] + [processors.converter.fields] + integer = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)"] + +# The estat output contains a nested latency histogram, so we need to +# parse that out as a new array metric rather than a non-JSON string. 
+# +# From this: +# "microseconds":"{20000,5},{30000,15},{40000,3},{50000,24}" +# to this: +# "microseconds":"{\"20000\":5,\"30000\":15,\"40000\":3,\"50000\":24}" +# +# Clone the original so we have a "new" metric with a "hist_" name prefix +[[processors.clone]] + order = 1 + name_prefix = "hist_" + namepass = ["estat_*"] + +# Rewrite the histograms for the "hist_estat_*" metrics as JSON objects +[[processors.regex]] + order = 2 + namepass = ["hist_estat_*"] + [[processors.regex.fields]] + key = "microseconds" + pattern = "{(\\d+),(\\d+)}" + replacement = "\"${1}\":${2}" + [[processors.regex.fields]] + key = "microseconds" + pattern = ".*" + replacement = "{$0}" + +# Now parse out the arrays for "hist_estat_*" metrics +[[processors.parser]] + order = 3 + merge = "override" + parse_fields = ["microseconds"] + drop_original = false + data_format = "json" + namepass = ["hist_estat_*"] + fieldpass = ["microseconds"] + +# End of Processor section +############################################################################## diff --git a/telegraf/zcache-stats.sh b/telegraf/zcache-stats.sh new file mode 100755 index 0000000..dbe8101 --- /dev/null +++ b/telegraf/zcache-stats.sh @@ -0,0 +1,3 @@ +#!/bin/bash +zcache stats -ap 10 | egrep --line-buffered -v "\-" + diff --git a/telegraf/zpool-iostat-o.sh b/telegraf/zpool-iostat-o.sh new file mode 100755 index 0000000..6f309d4 --- /dev/null +++ b/telegraf/zpool-iostat-o.sh @@ -0,0 +1,3 @@ +#!/bin/bash +zpool iostat -opy domain0 10 | egrep --line-buffered -v "object|put|\-" +