
Commit b2ec03a

standardize compare tool to allow for oss data related comparisons (#322)
* Remote result post process fix
* Remote result post process fix
* compare OSS WIP
* Bumping version from 0.7.16 to 0.7.17
* Fixed flake8 error
1 parent 62d3adb commit b2ec03a

File tree

3 files changed (+71 -37 lines)

pyproject.toml

+1 -1
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "redisbench-admin"
-version = "0.7.16"
+version = "0.7.17"
 description = "Redis benchmark run helper. A wrapper around Redis and Redis Modules benchmark tools ( ftsb_redisearch, memtier_benchmark, redis-benchmark, aibench, etc... )."
 authors = ["filipecosta90 <[email protected]>","Redis Performance Group <[email protected]>"]
 readme = "README.md"

redisbench_admin/compare/args.py

+3
@@ -52,6 +52,9 @@ def create_compare_arguments(parser):
     parser.add_argument("--comparison-branch", type=str, default=None, required=False)
     parser.add_argument("--comparison-tag", type=str, default=None, required=False)
     parser.add_argument("--print-regressions-only", type=bool, default=False)
+    parser.add_argument("--verbose", type=bool, default=False)
+    parser.add_argument("--use_metric_context_path", type=bool, default=False)
+    parser.add_argument("--testname_regex", type=str, default=".*", required=False)
     parser.add_argument(
         "--regressions-percent-lower-limit",
         type=float,
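
A caveat worth knowing when passing the new boolean flags: argparse's type=bool simply calls bool() on the raw string, so any non-empty value, including the literal "False", parses as True. A minimal sketch demonstrating this (the add_argument lines mirror the diff above):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--verbose", type=bool, default=False)
parser.add_argument("--use_metric_context_path", type=bool, default=False)
parser.add_argument("--testname_regex", type=str, default=".*", required=False)

args = parser.parse_args(["--verbose", "False"])
print(args.verbose)  # True -- bool("False") is truthy

args = parser.parse_args([])
print(args.verbose)  # False -- only omitting the flag keeps the default

In practice these flags are therefore enabled by passing any value at all, e.g. --verbose 1.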

redisbench_admin/compare/compare.py

+67 -36
@@ -4,18 +4,17 @@
 # All rights reserved.
 #
 import logging
+import re
+
 import pandas as pd
 import redis
 from pytablewriter import MarkdownTableWriter
 import humanize
 import datetime as dt
-
+from tqdm import tqdm
 from redisbench_admin.utils.remote import get_overall_dashboard_keynames
 
 
-from redisbench_admin.utils.utils import get_ts_metric_name
-
-
 def compare_command_logic(args, project_name, project_version):
     logging.info(
         "Using: {project_name} {project_version}".format(
@@ -99,29 +98,46 @@ def compare_command_logic(args, project_name, project_version):
         _,
         _,
         _,
-        _,
+        testcases_metric_context_path_setname,
         _,
         _,
         _,
         _,
         _,
     ) = get_overall_dashboard_keynames(tf_github_org, tf_github_repo, tf_triggering_env)
     test_names = []
+    used_key = testcases_setname
+    test_filter = "test_name"
+
+    if args.use_metric_context_path:
+        test_filter = "test_name:metric_context_path"
+        used_key = testcases_metric_context_path_setname
+
+    tags_regex_string = re.compile(args.testname_regex)
+
     try:
-        test_names = rts.smembers(testcases_setname)
+        test_names = rts.smembers(used_key)
         test_names = list(test_names)
         test_names.sort()
+        final_test_names = []
+        for test_name in test_names:
+            test_name = test_name.decode()
+            match_obj = re.search(tags_regex_string, test_name)
+            if match_obj is not None:
+                final_test_names.append(test_name)
+        test_names = final_test_names
+
     except redis.exceptions.ResponseError as e:
         logging.warning(
             "Error while trying to fetch test cases set (key={}) {}. ".format(
-                testcases_setname, e.__str__()
+                used_key, e.__str__()
             )
         )
         pass
 
     logging.warning(
-        "Based on test-cases set (key={}) we have {} distinct benchmarks. ".format(
-            testcases_setname, len(test_names)
+        "Based on test-cases set (key={}) we have {} comparison points. ".format(
+            used_key, len(test_names)
         )
     )
     profilers_artifacts_matrix = []
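
To make the new filtering concrete, here is the test-name selection logic from the hunk above in isolation, run against made-up set members (the byte strings stand in for what rts.smembers returns; the pattern is a hypothetical --testname_regex value):

import re

test_names = [b"memtier-100Kkeys-string-get", b"memtier-100Kkeys-string-set", b"tsbs-devops-cpu-max"]
tags_regex_string = re.compile("memtier.*")  # hypothetical --testname_regex value

final_test_names = []
for test_name in test_names:
    test_name = test_name.decode()
    # re.search accepts a precompiled pattern and matches anywhere in the name
    if re.search(tags_regex_string, test_name) is not None:
        final_test_names.append(test_name)

print(final_test_names)  # ['memtier-100Kkeys-string-get', 'memtier-100Kkeys-string-set']

With the default ".*" every member passes, so existing callers see no change in behavior.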
@@ -131,31 +147,42 @@
     total_unstable = 0
     total_regressions = 0
     noise_waterline = 2.5
+    progress = tqdm(unit="benchmark time-series", total=len(test_names))
     for test_name in test_names:
+        filters_baseline = [
+            "{}={}".format(by_str, baseline_str),
+            "metric={}".format(metric_name),
+            "{}={}".format(test_filter, test_name),
+            "deployment_type={}".format(deployment_type),
+            "deployment_name={}".format(deployment_name),
+        ]
+        filters_comparison = [
+            "{}={}".format(by_str, comparison_str),
+            "metric={}".format(metric_name),
+            "{}={}".format(test_filter, test_name),
+            "deployment_type={}".format(deployment_type),
+            "deployment_name={}".format(deployment_name),
+        ]
+        baseline_timeseries = rts.ts().queryindex(filters_baseline)
+        comparison_timeseries = rts.ts().queryindex(filters_comparison)
+        progress.update()
+        if len(baseline_timeseries) != 1:
+            if args.verbose:
+                logging.warning(
+                    "Baseline timeseries {}".format(len(baseline_timeseries))
+                )
+            continue
+        else:
+            ts_name_baseline = baseline_timeseries[0]
+        if len(comparison_timeseries) != 1:
+            if args.verbose:
+                logging.warning(
+                    "Comparison timeseries {}".format(len(comparison_timeseries))
+                )
+            continue
+        else:
+            ts_name_comparison = comparison_timeseries[0]
 
-        test_name = test_name.decode()
-        ts_name_baseline = get_ts_metric_name(
-            "by.{}".format(by_str),
-            baseline_str,
-            tf_github_org,
-            tf_github_repo,
-            deployment_name,
-            deployment_type,
-            test_name,
-            tf_triggering_env,
-            metric_name,
-        )
-        ts_name_comparison = get_ts_metric_name(
-            "by.{}".format(by_str),
-            comparison_str,
-            tf_github_org,
-            tf_github_repo,
-            deployment_name,
-            deployment_type,
-            test_name,
-            tf_triggering_env,
-            metric_name,
-        )
         baseline_v = "N/A"
         comparison_v = "N/A"
         baseline_nsamples = 0
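
The substantive change in this hunk is discovery: rather than composing a time-series key name with get_ts_metric_name, the command now asks RedisTimeSeries for matching series via its secondary index (TS.QUERYINDEX), which is what makes OSS-produced data comparable as long as it carries the same labels. A minimal sketch of that lookup, assuming a local Redis with the RedisTimeSeries module loaded and illustrative label values:

import redis

rts = redis.Redis(host="localhost", port=6379)

# Each entry is a label=value filter; a series must match all of them.
filters_baseline = [
    "branch=master",                          # "{}={}".format(by_str, baseline_str)
    "metric=Ops/sec",                         # illustrative metric name
    "test_name=memtier-100Kkeys-string-get",  # or test_name:metric_context_path=...
    "deployment_type=oss-standalone",         # illustrative deployment labels
    "deployment_name=oss-standalone",
]

# TS.QUERYINDEX returns the key names of every series matching the filters.
baseline_timeseries = rts.ts().queryindex(filters_baseline)
if len(baseline_timeseries) == 1:
    ts_name_baseline = baseline_timeseries[0]

Requiring exactly one match keeps each comparison unambiguous; anything else is skipped, and logged when --verbose is set.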
@@ -223,14 +250,14 @@ def compare_command_logic(args, project_name, project_version):
             unstable = True
             if baseline_pct_change > 10.0:
                 stamp_b = "UNSTABLE"
-            baseline_v_str = " {:.3f} +- {:.1f}% {}".format(
-                baseline_v, baseline_pct_change, stamp_b
+            baseline_v_str = " {:.0f} +- {:.1f}% {} ({} datapoints)".format(
+                baseline_v, baseline_pct_change, stamp_b, baseline_nsamples
             )
             stamp_c = ""
             if comparison_pct_change > 10.0:
                 stamp_c = "UNSTABLE"
-            comparison_v_str = " {:.3f} +- {:.1f}% {}".format(
-                comparison_v, comparison_pct_change, stamp_c
+            comparison_v_str = " {:.0f} +- {:.1f}% {} ({} datapoints)".format(
+                comparison_v, comparison_pct_change, stamp_c, comparison_nsamples
             )
             if metric_mode == "higher-better":
                 percentage_change = (
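
The string-format change is easiest to see with concrete numbers; given illustrative values, the old and new renderings of an unstable baseline cell:

baseline_v = 123456.789
baseline_pct_change = 12.3
stamp_b = "UNSTABLE"
baseline_nsamples = 30

old_str = " {:.3f} +- {:.1f}% {}".format(baseline_v, baseline_pct_change, stamp_b)
new_str = " {:.0f} +- {:.1f}% {} ({} datapoints)".format(
    baseline_v, baseline_pct_change, stamp_b, baseline_nsamples
)
print(old_str)  #  123456.789 +- 12.3% UNSTABLE
print(new_str)  #  123457 +- 12.3% UNSTABLE (30 datapoints)

Dropping the decimals makes room for the sample count, which helps when judging whether an UNSTABLE stamp reflects genuine noise or simply too few datapoints.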
@@ -252,6 +279,8 @@
                     note = note + " REGRESSION"
                 elif percentage_change < -noise_waterline:
                     note = note + " potential REGRESSION"
+                else:
+                    note = note + " -- no change --"
                 detected_regressions.append(test_name)
             if percentage_change > 0.0 and not unstable:
                 if percentage_change > waterline:
@@ -260,6 +289,8 @@
                     note = note + " IMPROVEMENT"
                 elif percentage_change > noise_waterline:
                     note = note + " potential IMPROVEMENT"
+                else:
+                    note = note + " -- no change --"
 
             if (
                 detected_improvement is False
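
Taken together with the existing thresholds, the note-building now covers the full range: beyond the waterline is a firm call, beyond the fixed noise_waterline of 2.5% is a potential one, and, new in this commit, anything closer to zero is explicitly marked as no change. A condensed, hypothetical restatement of that ladder for the higher-is-better case (the waterline default of 5.0 is illustrative; in the tool it comes from arguments such as --regressions-percent-lower-limit shown in args.py):

def classify(percentage_change, waterline=5.0, noise_waterline=2.5):
    # Negative change on a higher-is-better metric reads as a regression.
    if percentage_change < 0.0:
        if percentage_change < -waterline:
            return "REGRESSION"
        elif percentage_change < -noise_waterline:
            return "potential REGRESSION"
        return "-- no change --"
    if percentage_change > waterline:
        return "IMPROVEMENT"
    elif percentage_change > noise_waterline:
        return "potential IMPROVEMENT"
    return "-- no change --"

print(classify(-7.0))  # REGRESSION
print(classify(-3.0))  # potential REGRESSION
print(classify(1.0))   # -- no change --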
