Commit 5228410

Enabled overall cpu stats collection and summary for local and remote runs (#314)
* Enabled long-running benchmarks (with watchdog enforcing deletion) via timeout_secods property on benchmark definition
* Fixes per flake8 review
* Enabled overall cpu stats collection and summary for local and remote runs
* Tracking cpu usage as close as possible to start/stop time of benchmark
1 parent 03f688d commit 5228410

9 files changed: +197, -22 lines

pyproject.toml (+1, -1)

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "redisbench-admin"
-version = "0.7.12"
+version = "0.7.13"
 description = "Redis benchmark run helper. A wrapper around Redis and Redis Modules benchmark tools ( ftsb_redisearch, memtier_benchmark, redis-benchmark, aibench, etc... )."
 authors = ["filipecosta90 <[email protected]>","Redis Performance Group <[email protected]>"]
 readme = "README.md"

redisbench_admin/environments/oss_cluster.py (+1)

@@ -48,6 +48,7 @@ def spin_up_local_redis_cluster(
         result = wait_for_conn(r, dataset_load_timeout_secs)
         if result is True:
             logging.info("Redis available. pid={}".format(redis_process.pid))
+            r.client_setname("redisbench-admin-cluster-#{}".format(master_shard_id))
         redis_conns.append(r)
         redis_processes.append(redis_process)
     return redis_processes, redis_conns

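For context, tagging each shard connection with CLIENT SETNAME makes the helper's connections easy to spot on the server side. A minimal sketch using redis-py against a local server (the port and connection name below are illustrative):

```python
import redis

# Name the connection the same way spin_up_local_redis_cluster now does
r = redis.Redis(port=6379)
r.ping()
r.client_setname("redisbench-admin-cluster-#1")

# The name then shows up in CLIENT LIST, so benchmark connections are easy to identify
for client in r.client_list():
    if client.get("name", "").startswith("redisbench-admin"):
        print(client["addr"], client["name"])
```
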
redisbench_admin/run/common.py (+20, -5)

@@ -372,11 +372,19 @@ def check_dbconfig_keyspacelen_requirement(
     required = False
     keyspacelen = None
     if dbconfig_keyname in benchmark_config:
-        for k in benchmark_config[dbconfig_keyname]:
-            if "check" in k:
-                if "keyspacelen" in k["check"]:
+        if type(benchmark_config[dbconfig_keyname]) == list:
+            for k in benchmark_config[dbconfig_keyname]:
+                if "check" in k:
+                    if "keyspacelen" in k["check"]:
+                        required = True
+                        keyspacelen = int(k["check"]["keyspacelen"])
+        if type(benchmark_config[dbconfig_keyname]) == dict:
+            if "check" in benchmark_config[dbconfig_keyname]:
+                if "keyspacelen" in benchmark_config[dbconfig_keyname]["check"]:
                     required = True
-                    keyspacelen = int(k["check"]["keyspacelen"])
+                    keyspacelen = int(
+                        benchmark_config[dbconfig_keyname]["check"]["keyspacelen"]
+                    )
     return required, keyspacelen


@@ -626,7 +634,12 @@ def common_properties_log(


 def print_results_table_stdout(
-    benchmark_config, default_metrics, results_dict, setup_name, test_name
+    benchmark_config,
+    default_metrics,
+    results_dict,
+    setup_name,
+    test_name,
+    cpu_usage=None,
 ):
     # check which metrics to extract
     (_, metrics,) = merge_default_and_config_metrics(
@@ -640,6 +653,8 @@ def print_results_table_stdout(
         "Metric Value",
     ]
     results_matrix = extract_results_table(metrics, results_dict)
+    if cpu_usage is not None:
+        results_matrix.append(["Total shards CPU usage %", "", "", cpu_usage])
     results_matrix = [[x[0], "{:.3f}".format(x[3])] for x in results_matrix]
     writer = MarkdownTableWriter(
         table_name=table_name,

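The first hunk above makes the keyspacelen requirement check accept a dbconfig section written either as a list of single-purpose entries or as a plain mapping. A minimal sketch of the two shapes, with the lookup logic mirrored inline (the config fragments are illustrative, not taken from a real test definition):

```python
# dbconfig given as a list of single-key entries (the previously supported shape)
config_list_style = {"dbconfig": [{"check": {"keyspacelen": 1000}}]}

# dbconfig given as a plain mapping (the shape this commit starts accepting)
config_dict_style = {"dbconfig": {"check": {"keyspacelen": 1000}}}


def keyspacelen_requirement(benchmark_config, dbconfig_keyname="dbconfig"):
    """Mirror of check_dbconfig_keyspacelen_requirement's lookup logic."""
    required, keyspacelen = False, None
    section = benchmark_config.get(dbconfig_keyname)
    if isinstance(section, list):
        for entry in section:
            if "keyspacelen" in entry.get("check", {}):
                required, keyspacelen = True, int(entry["check"]["keyspacelen"])
    elif isinstance(section, dict):
        if "keyspacelen" in section.get("check", {}):
            required, keyspacelen = True, int(section["check"]["keyspacelen"])
    return required, keyspacelen


print(keyspacelen_requirement(config_list_style))  # (True, 1000)
print(keyspacelen_requirement(config_dict_style))  # (True, 1000)
```
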
redisbench_admin/run/metrics.py (+48)

@@ -129,3 +129,51 @@ def collect_redis_metrics(
             kv_overall[metric_name] = metric_value

     return start_time_ms, res, kv_overall
+
+
+def from_info_to_overall_shard_cpu(benchmark_cpu_stats):
+    total_avg_cpu_pct = 0.0
+    res = {}
+    for shard_n, cpu_stats_arr in benchmark_cpu_stats.items():
+        # we need at least 2 elements to compute the cpu usage
+        if len(cpu_stats_arr) < 2:
+            avg_cpu_pct = None
+        else:
+            start_ts_micros = cpu_stats_arr[0]["server_time_usec"]
+            start_total_cpu = get_total_cpu(cpu_stats_arr[0])
+            end_ts_micros = cpu_stats_arr[len(cpu_stats_arr) - 1]["server_time_usec"]
+            end_total_cpu = get_total_cpu(cpu_stats_arr[len(cpu_stats_arr) - 1])
+            total_secs = (end_ts_micros - start_ts_micros) / 1000000
+            total_cpu_usage = end_total_cpu - start_total_cpu
+            avg_cpu_pct = 100.0 * (total_cpu_usage / total_secs)
+        res[shard_n] = avg_cpu_pct
+        total_avg_cpu_pct += avg_cpu_pct
+    return total_avg_cpu_pct, res
+
+
+def get_total_cpu(info_data):
+    total_cpu = 0.0
+    total_cpu = total_cpu + info_data["used_cpu_sys"]
+    total_cpu = total_cpu + info_data["used_cpu_user"]
+    return total_cpu
+
+
+BENCHMARK_RUNNING_GLOBAL = False
+BENCHMARK_CPU_STATS_GLOBAL = {}
+
+
+def collect_cpu_data(redis_conns=[], delta_secs: float = 5.0, delay_start: float = 1.0):
+    global BENCHMARK_CPU_STATS_GLOBAL
+    global BENCHMARK_RUNNING_GLOBAL
+    import time
+
+    counter = 0
+    time.sleep(delay_start)
+    while BENCHMARK_RUNNING_GLOBAL:
+        for shard_n, redis_conn in enumerate(redis_conns, 1):
+            keyname = "{}".format(shard_n)
+            if keyname not in BENCHMARK_CPU_STATS_GLOBAL:
+                BENCHMARK_CPU_STATS_GLOBAL[keyname] = []
+            BENCHMARK_CPU_STATS_GLOBAL[keyname].append(redis_conn.info())
+        time.sleep(delta_secs)
+        counter += 1

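The summary produced by from_info_to_overall_shard_cpu is a plain rate computation over the first and last INFO snapshots of each shard: the growth of used_cpu_sys + used_cpu_user divided by the elapsed server time. A worked example with hand-written snapshots (the numbers are made up):

```python
# Two INFO snapshots for one shard, 30 s apart (values are illustrative)
first = {"server_time_usec": 1_700_000_000_000_000, "used_cpu_sys": 10.0, "used_cpu_user": 20.0}
last = {"server_time_usec": 1_700_000_030_000_000, "used_cpu_sys": 14.5, "used_cpu_user": 27.5}


def total_cpu(info):
    # same fields that get_total_cpu() sums
    return info["used_cpu_sys"] + info["used_cpu_user"]


elapsed_secs = (last["server_time_usec"] - first["server_time_usec"]) / 1_000_000  # 30.0
cpu_secs = total_cpu(last) - total_cpu(first)  # 12.0 CPU-seconds consumed
avg_cpu_pct = 100.0 * cpu_secs / elapsed_secs  # 40.0 -> the shard averaged 40% of one core
print("{:.3f} %".format(avg_cpu_pct))
```

Because the per-shard percentages are summed, the reported "Total CPU usage" can exceed 100% on multi-shard setups: each fully busy shard contributes up to 100 points (or more, if the shard itself uses several cores).
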
redisbench_admin/run_local/local_db.py (+2)

@@ -117,6 +117,8 @@ def local_db_spin(
     )

     r = redis.Redis(port=args.port)
+    r.ping()
+    r.client_setname("redisbench-admin-stadalone")
     redis_conns.append(r)

     for shardn, redis_process in enumerate(redis_processes):

redisbench_admin/run_local/run_local.py (+32)

@@ -12,6 +12,7 @@
 import traceback
 import redis

+import redisbench_admin.run.metrics
 from redisbench_admin.profilers.profilers_schema import (
     local_profilers_print_artifacts_table,
 )
@@ -23,6 +24,10 @@
     dso_check,
     print_results_table_stdout,
 )
+from redisbench_admin.run.metrics import (
+    from_info_to_overall_shard_cpu,
+    collect_cpu_data,
+)
 from redisbench_admin.run.redistimeseries import datasink_profile_tabular_data
 from redisbench_admin.run.run import (
     calculate_client_tool_duration_and_check,
@@ -51,6 +56,8 @@
 )
 from redisbench_admin.utils.results import post_process_benchmark_results

+import threading
+

 def run_local_command_logic(args, project_name, project_version):
     logging.info(
@@ -159,6 +166,7 @@ def run_local_command_logic(args, project_name, project_version):
             continue
         if setup_type in args.allowed_envs:
             redis_processes = []
+            redis_conns = []
             # after we've spinned Redis, even on error we should always teardown
             # in case of some unexpected error we fail the test
             # noinspection PyBroadException
@@ -270,11 +278,34 @@ def run_local_command_logic(args, project_name, project_version):
                        )

                        # run the benchmark
+                        cpu_stats_thread = threading.Thread(
+                            target=collect_cpu_data,
+                            args=(redis_conns, 5.0, 1.0),
+                        )
+                        redisbench_admin.run.metrics.BENCHMARK_RUNNING_GLOBAL = (
+                            True
+                        )
+                        cpu_stats_thread.start()
                        benchmark_start_time = datetime.datetime.now()
                        stdout, stderr = run_local_benchmark(
                            benchmark_tool, command
                        )
                        benchmark_end_time = datetime.datetime.now()
+                        redisbench_admin.run.metrics.BENCHMARK_RUNNING_GLOBAL = (
+                            False
+                        )
+                        cpu_stats_thread.join()
+                        (
+                            total_shards_cpu_usage,
+                            cpu_usage_map,
+                        ) = from_info_to_overall_shard_cpu(
+                            redisbench_admin.run.metrics.BENCHMARK_CPU_STATS_GLOBAL
+                        )
+                        logging.info(
+                            "Total CPU usage ({:.3f} %)".format(
+                                total_shards_cpu_usage
+                            )
+                        )
                        benchmark_duration_seconds = (
                            calculate_client_tool_duration_and_check(
                                benchmark_end_time, benchmark_start_time
@@ -339,6 +370,7 @@ def run_local_command_logic(args, project_name, project_version):
                            results_dict,
                            setup_name,
                            test_name,
+                            total_shards_cpu_usage,
                        )

                        # check KPIs

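The local runner now brackets the benchmark invocation with the sampling thread as tightly as it can: set the module-level flag, start the thread, run the tool, clear the flag, join, then summarize. A self-contained sketch of that lifecycle, with a hypothetical FakeConn standing in for a real redis.Redis connection so it runs without a server:

```python
import threading
import time

samples = {}     # shard id -> list of INFO-like snapshots
running = False  # plays the role of BENCHMARK_RUNNING_GLOBAL


class FakeConn:
    """Hypothetical stand-in for redis.Redis; info() returns the fields the collector reads."""

    def __init__(self):
        self.cpu = 0.0

    def info(self):
        self.cpu += 0.05  # pretend the shard burns CPU between samples
        return {"server_time_usec": int(time.time() * 1_000_000),
                "used_cpu_sys": self.cpu, "used_cpu_user": self.cpu}


def collect(conns, delta_secs=0.2, delay_start=0.1):
    # same sampling loop shape as collect_cpu_data()
    time.sleep(delay_start)
    while running:
        for shard_n, conn in enumerate(conns, 1):
            samples.setdefault(str(shard_n), []).append(conn.info())
        time.sleep(delta_secs)


conns = [FakeConn()]
t = threading.Thread(target=collect, args=(conns,))
running = True   # flag on before the benchmark starts
t.start()
time.sleep(1.0)  # <- the actual benchmark tool would run here
running = False  # flag off right after the benchmark ends
t.join()
print({k: len(v) for k, v in samples.items()})  # a handful of snapshots per shard
```
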
redisbench_admin/run_remote/remote_client.py (+55, -13)

@@ -5,10 +5,13 @@
 #
 import datetime
 import logging
+import threading

+import redisbench_admin
 from redisbench_admin.run.common import (
     prepare_benchmark_parameters,
 )
+from redisbench_admin.run.metrics import collect_cpu_data
 from redisbench_admin.run.run import calculate_client_tool_duration_and_check
 from redisbench_admin.run_remote.remote_helpers import (
     benchmark_tools_sanity_check,
@@ -46,6 +49,8 @@ def run_remote_client_tool(
     warn_min_duration,
     client_ssh_port,
     private_key,
+    collect_cpu_stats_thread=False,
+    redis_conns=[],
 ):
     (
         benchmark_min_tool_version,
@@ -105,6 +110,7 @@ def run_remote_client_tool(
         tmp = local_bench_fname
         local_bench_fname = "result.csv"
     commands = [command_str]
+    post_commands = []
     if "ann" in benchmark_tool:
         pkg_path = get_ann_remote_pkg_path(
             client_public_ip, client_ssh_port, private_key, username
@@ -132,15 +138,25 @@
         zip_results_command = "cd {} && zip -r {} results/*".format(
             results_outputdir, results_outputdir_zip
         )
-        commands.append(mkdir_command)
-        commands.append(create_website_command)
-        commands.append(zip_website_command)
-        commands.append(zip_results_command)
+        post_commands.append(mkdir_command)
+        post_commands.append(create_website_command)
+        post_commands.append(zip_website_command)
+        post_commands.append(zip_results_command)

         local_output_artifacts.append(website_outputdir_zip_local)
         local_output_artifacts.append(results_outputdir_zip_local)
         remote_output_artifacts.append(website_outputdir_zip)
         remote_output_artifacts.append(results_outputdir_zip)
+    cpu_stats_thread = None
+    if collect_cpu_stats_thread is True:
+        # run the benchmark
+        cpu_stats_thread = threading.Thread(
+            target=collect_cpu_data,
+            args=(redis_conns, 5.0, 1.0),
+        )
+        redisbench_admin.run.metrics.BENCHMARK_RUNNING_GLOBAL = True
+        logging.info("Starting CPU collecing thread")
+        cpu_stats_thread.start()

     benchmark_start_time = datetime.datetime.now()
     # run the benchmark
@@ -154,6 +170,32 @@
         client_ssh_port,
     )
     benchmark_end_time = datetime.datetime.now()
+    if cpu_stats_thread is not None:
+        logging.info("Stopping CPU collecting thread")
+        redisbench_admin.run.metrics.BENCHMARK_RUNNING_GLOBAL = False
+        cpu_stats_thread.join()
+        logging.info("CPU collecting thread stopped")
+    if len(post_commands) > 0:
+        res = execute_remote_commands(
+            client_public_ip, username, private_key, post_commands, client_ssh_port
+        )
+        recv_exit_status, _, _ = res[0]
+
+        if recv_exit_status != 0:
+            logging.error(
+                "Exit status of remote command execution {}. Printing stdout and stderr".format(
+                    recv_exit_status
+                )
+            )
+            stderr, stdout = print_commands_outputs(post_commands, True, res)
+        else:
+            logging.info(
+                "Remote process exited normally. Exit code {}. Printing stdout.".format(
+                    recv_exit_status
+                )
+            )
+            stderr, stdout = print_commands_outputs(post_commands, False, res)
+
     benchmark_duration_seconds = calculate_client_tool_duration_and_check(
         benchmark_end_time, benchmark_start_time, step_name, warn_min_duration
     )
@@ -224,15 +266,15 @@
 def setup_remote_benchmark_ann(
     client_public_ip, username, private_key, client_ssh_port
 ):
-    commands = [
-        "sudo apt install python3-pip -y",
-        "sudo pip3 install redisbench-admin>=0.7.0",
-    ]
-    # last argument (get_pty) needs to be set to true
-    # check: https://stackoverflow.com/questions/5785353/paramiko-and-sudo
-    execute_remote_commands(
-        client_public_ip, username, private_key, commands, client_ssh_port, True
-    )
+    # commands = [
+    #     "sudo apt install python3-pip -y",
+    #     "sudo pip3 install redisbench-admin>=0.7.0",
+    # ]
+    # # last argument (get_pty) needs to be set to true
+    # # check: https://stackoverflow.com/questions/5785353/paramiko-and-sudo
+    # execute_remote_commands(
+    #     client_public_ip, username, private_key, commands, client_ssh_port, True
+    # )
     pkg_path = get_ann_remote_pkg_path(
         client_public_ip, client_ssh_port, private_key, username
     )

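Splitting the artifact-packaging commands into post_commands means they now run only after the benchmark has finished and the CPU thread has been joined. The project routes them through its own execute_remote_commands helper; a rough paramiko-based sketch of the same idea — executing deferred commands over SSH and checking their exit status — is shown below (host, user, key path and commands are placeholders):

```python
import logging
import paramiko

# Placeholder deferred commands, mirroring the mkdir/zip post_commands in the diff
post_commands = ["mkdir -p /tmp/results", "cd /tmp && zip -r results.zip results/*"]

ssh = paramiko.SSHClient()
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
ssh.connect("client.example.com", port=22, username="ubuntu",
            key_filename="/path/to/private_key.pem")  # placeholder credentials

for cmd in post_commands:
    _, stdout, stderr = ssh.exec_command(cmd)
    exit_status = stdout.channel.recv_exit_status()  # blocks until the command finishes
    if exit_status != 0:
        logging.error("Command %r failed with exit status %d: %s",
                      cmd, exit_status, stderr.read().decode())
    else:
        logging.info("Command %r finished OK", cmd)

ssh.close()
```
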
redisbench_admin/run_remote/run_remote.py (+23, -3)

@@ -11,8 +11,11 @@
 import redis
 import pytablewriter
 from pytablewriter import MarkdownTableWriter
-
-
+import redisbench_admin.run.metrics
+from redisbench_admin.run.metrics import (
+    from_info_to_overall_shard_cpu,
+    collect_redis_metrics,
+)
 from redisbench_admin.profilers.perf_daemon_caller import (
     PerfDaemonRemoteCaller,
     PERF_DAEMON_LOGNAME,
@@ -27,7 +30,6 @@
 )
 from redisbench_admin.run.git import git_vars_crosscheck
 from redisbench_admin.run.grafana import generate_artifacts_table_grafana_redis
-from redisbench_admin.run.metrics import collect_redis_metrics
 from redisbench_admin.run.modules import redis_modules_check
 from redisbench_admin.run.redistimeseries import (
     timeseries_test_sucess_flow,
@@ -493,6 +495,8 @@ def run_remote_command_logic(args, project_name, project_version):
                        min_recommended_benchmark_duration,
                        client_ssh_port,
                        private_key,
+                        True,
+                        redis_conns,
                    )

                    if profilers_enabled:
@@ -556,6 +560,18 @@
                        )
                    )

+                    (
+                        total_shards_cpu_usage,
+                        cpu_usage_map,
+                    ) = from_info_to_overall_shard_cpu(
+                        redisbench_admin.run.metrics.BENCHMARK_CPU_STATS_GLOBAL
+                    )
+                    logging.info(
+                        "Total CPU usage ({:.3f} %)".format(
+                            total_shards_cpu_usage
+                        )
+                    )
+
                    if remote_run_result is False:
                        db_error_artifacts(
                            db_ssh_port,
@@ -594,6 +610,9 @@
                                ]
                            },
                        )
+                        overall_end_time_metrics[
+                            "total_shards_used_cpu_pct"
+                        ] = total_shards_cpu_usage
                        expire_ms = 7 * 24 * 60 * 60 * 1000
                        export_redis_metrics(
                            artifact_version,
@@ -739,6 +758,7 @@ def run_remote_command_logic(args, project_name, project_version):
                            results_dict,
                            setup_name,
                            test_name,
+                            total_shards_cpu_usage,
                        )
                        client_artifacts.append(local_bench_fname)
                        client_artifacts.extend(client_output_artifacts)

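Both runners finally pass total_shards_cpu_usage into print_results_table_stdout, which appends a "Total shards CPU usage %" row before rendering the Markdown table. A minimal sketch of how such a row renders with pytablewriter (the metric names and values are illustrative):

```python
from pytablewriter import MarkdownTableWriter

results_matrix = [
    ["Overall query rate (q/s)", "12345.678"],
    ["p50 latency (ms)", "1.234"],
    ["Total shards CPU usage %", "187.500"],  # row appended when cpu_usage is not None
]

writer = MarkdownTableWriter(
    table_name="Results for test X on oss-standalone",
    headers=["Metric Name", "Metric Value"],
    value_matrix=results_matrix,
)
writer.write_table()
```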