Skip to content

Bug 1964128 - Enable monitor-only alerting for performance tests with email notifications. #8670

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions schemas/performance-artifact.json
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,21 @@
"type": "integer",
"minimum": 1,
"maximum": 255
},
"monitor": {
"description": "Enable non-sheriffed alerting (ignores shouldAlert settings)",
"title": "Monitor",
"type": "boolean"
},
"alertNotifyEmails": {
"type": "array",
"title": "Notify these emails for any alerts produced",
"items": {
"type": "string",
"maxLength": 100
},
"uniqueItems": true,
"maxItems": 8
}
},
"required": ["name", "subtests"],
Expand Down
7 changes: 7 additions & 0 deletions tests/etl/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,10 @@ def perf_job(perf_push, failure_classifications, generic_reference_data):
return create_generic_job(
"myfunguid", perf_push.repository, perf_push.id, generic_reference_data
)


@pytest.fixture
def perf_job_nonsheriffed(perf_push, failure_classifications, generic_reference_data):
    """Return a generic performance job at tier 3.

    Tier 3 is below the sheriffed-tier threshold, so alerts produced for this
    job are only generated when a signature opts in via ``monitor``.
    """
    return create_generic_job(
        "myfunguid", perf_push.repository, perf_push.id, generic_reference_data, tier=3
    )
47 changes: 47 additions & 0 deletions tests/etl/test_perf_data_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import json
import operator
import time
from unittest import mock

import pytest
from django.core.management import call_command
Expand Down Expand Up @@ -234,6 +235,52 @@ def test_default_ingest_workflow(
_verify_datum(suite["name"], subtest["name"], subtest["value"], perf_push.time)


@mock.patch("treeherder.etl.perf.generate_alerts")
def test_monitored_perf_data(
    mocked_generate_alerts,
    test_repository,
    perf_push,
    later_perf_push,
    perf_job_nonsheriffed,
    generic_reference_data,
    sample_perf_artifact,
):
    """Monitored suites must schedule alert generation even on a tier-3 job."""
    # Flag every suite in the artifact as monitored.
    for monitored_suite in sample_perf_artifact["blob"]["suites"]:
        monitored_suite["monitor"] = True
    _, submit_datum = _prepare_test_data(sample_perf_artifact)

    store_performance_artifact(perf_job_nonsheriffed, submit_datum)

    assert PerformanceSignature.objects.all().count() == DATA_PER_ARTIFACT
    assert PerformanceFramework.objects.all().count() == 1

    # Despite the job being tier 3 (non-sheriffable), alert generation must
    # have been scheduled because monitor is set on the suites.
    mocked_generate_alerts.apply_async.assert_called()


@mock.patch("treeherder.etl.perf.generate_alerts")
def test_unmonitored_perf_data(
    mocked_generate_alerts,
    test_repository,
    perf_push,
    later_perf_push,
    perf_job_nonsheriffed,
    generic_reference_data,
    sample_perf_artifact,
):
    """Without monitor set, a tier-3 job must not schedule alert generation."""
    # The sample artifact is ingested as-is: no suite carries monitor=True.
    _, submit_datum = _prepare_test_data(sample_perf_artifact)

    store_performance_artifact(perf_job_nonsheriffed, submit_datum)

    assert PerformanceSignature.objects.all().count() == DATA_PER_ARTIFACT
    assert PerformanceFramework.objects.all().count() == 1

    # A tier-3 job is not sheriffable, and no suite opted into monitoring,
    # so alert generation must not have been scheduled.
    mocked_generate_alerts.apply_async.assert_not_called()


def test_hash_remains_unchanged_for_default_ingestion_workflow(
test_repository, perf_job, sample_perf_artifact
):
Expand Down
99 changes: 99 additions & 0 deletions tests/perfalert/test_alerts.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime
import time
from unittest import mock

import pytest

Expand Down Expand Up @@ -341,3 +342,101 @@ def test_alert_change_type_absolute(

assert PerformanceAlert.objects.count() == expected_num_alerts
assert PerformanceAlertSummary.objects.count() == expected_num_alerts


def test_alert_monitor_no_sheriff(
    test_repository,
    test_issue_tracker,
    failure_classifications,
    generic_reference_data,
    test_perf_signature,
):
    """An alert from a monitored signature must be created as non-sheriffed,
    even when should_alert is set."""
    # modify the test signature to have it as a monitored signature, but not sheriffed
    test_perf_signature.monitor = True
    test_perf_signature.should_alert = True
    test_perf_signature.save()

    base_time = time.time()  # generate it based off current time
    interval = 60
    _generate_performance_data(
        test_repository,
        test_perf_signature,
        base_time,
        1,
        0.5,
        int(interval / 2),
    )
    _generate_performance_data(
        test_repository,
        test_perf_signature,
        base_time,
        int(interval / 2) + 1,
        1.0,
        int(interval / 2),
    )

    generate_new_alerts_in_series(test_perf_signature)

    assert PerformanceAlert.objects.count() == 1
    assert PerformanceAlertSummary.objects.count() == 1

    # When monitor is true, then alert should not be sheriffed
    # regardless of should_alert settings.
    # NOTE: use all(); asserting a list comprehension is always truthy for a
    # non-empty queryset, so the original form could never fail here.
    assert all(not alert.sheriffed for alert in PerformanceAlert.objects.all())


@mock.patch("treeherder.perf.alerts.taskcluster")
def test_alert_emails(
    mocked_taskcluster,
    test_repository,
    test_issue_tracker,
    failure_classifications,
    generic_reference_data,
    test_perf_signature,
):
    """Each address in alert_notify_emails must receive exactly one email
    when a new alert is generated."""
    mocked_email_client = mock.MagicMock()
    mocked_taskcluster.notify_client_factory.return_value = mocked_email_client

    emails = "[email protected] [email protected]"
    test_perf_signature.alert_notify_emails = emails
    test_perf_signature.save()

    base_time = time.time()  # generate it based off current time
    interval = 60
    _generate_performance_data(
        test_repository,
        test_perf_signature,
        base_time,
        1,
        0.5,
        int(interval / 2),
    )
    _generate_performance_data(
        test_repository,
        test_perf_signature,
        base_time,
        int(interval / 2) + 1,
        1.0,
        int(interval / 2),
    )

    generate_new_alerts_in_series(test_perf_signature)

    assert PerformanceAlert.objects.count() == 1
    assert PerformanceAlertSummary.objects.count() == 1

    # When monitor is False, then the alerts should be sheriffed.
    # NOTE: use all(); asserting a list comprehension is always truthy for a
    # non-empty queryset, so the original form could never fail here.
    assert all(alert.sheriffed for alert in PerformanceAlert.objects.all())

    # Make sure the email service was called correctly for 2 emails
    assert mocked_taskcluster.notify_client_factory.call_count == 1
    assert mocked_email_client.email.call_count == 2

    # Ensure that each email specified has an email sent to it
    # (generator expression: no need to materialize a list for any())
    for email in emails.split():
        assert any(
            email in call_arg[0][0]["address"]
            for call_arg in mocked_email_client.email.call_args_list
        )
8 changes: 6 additions & 2 deletions treeherder/etl/perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def _suite_should_alert_based_on(
and new_datum_ingested
and job.repository.performance_alerts_enabled
and job.tier_is_sheriffable
)
) or (signature.monitor and job.repository.name not in ("try",))


def _test_should_alert_based_on(
Expand All @@ -114,7 +114,7 @@ def _test_should_alert_based_on(
and new_datum_ingested
and job.repository.performance_alerts_enabled
and job.tier_is_sheriffable
)
) or (signature.monitor and job.repository.name not in ("try",))


def _test_should_gather_replicates_based_on(
Expand Down Expand Up @@ -205,6 +205,8 @@ def _load_perf_datum(job: Job, perf_datum: dict):
# these properties below can be either True, False, or null
# (None). Null indicates no preference has been set.
"should_alert": suite.get("shouldAlert"),
"monitor": suite.get("monitor"),
"alert_notify_emails": _order_and_concat(suite.get("alertNotifyEmails", [])),
"alert_change_type": PerformanceSignature._get_alert_change_type(
suite.get("alertChangeType")
),
Expand Down Expand Up @@ -272,6 +274,8 @@ def _load_perf_datum(job: Job, perf_datum: dict):
# null (None). Null indicates no preference has been
# set.
"should_alert": subtest.get("shouldAlert"),
"monitor": suite.get("monitor"),
"alert_notify_emails": _order_and_concat(suite.get("alertNotifyEmails", [])),
"alert_change_type": PerformanceSignature._get_alert_change_type(
subtest.get("alertChangeType")
),
Expand Down
19 changes: 18 additions & 1 deletion treeherder/perf/alerts.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,29 @@
from django.conf import settings
from django.db import transaction

from treeherder.perf.email import AlertNotificationWriter
from treeherder.perf.models import (
PerformanceAlert,
PerformanceAlertSummary,
PerformanceDatum,
PerformanceSignature,
)
from treeherder.perfalert.perfalert import RevisionDatum, detect_changes
from treeherder.services import taskcluster

logger = logging.getLogger(__name__)


def send_alert_emails(emails, alert, alert_summary):
    """Send one notification email per address about a newly created alert.

    :param emails: iterable of recipient address strings
    :param alert: the PerformanceAlert the notification refers to
    :param alert_summary: the PerformanceAlertSummary grouping that alert
    """
    notify_client = taskcluster.notify_client_factory()

    # One writer per recipient; its prepare_new_email() mutates internal state.
    for address in emails:
        # Lazy %-formatting: the message is only built if the record is emitted.
        logger.info("Sending alert email to %s", address)
        notification_writer = AlertNotificationWriter()
        # Don't shadow the loop variable: `address` is the recipient,
        # `email` is the prepared payload handed to the notify service.
        email = notification_writer.prepare_new_email(address, alert, alert_summary)
        notify_client.email(email)


def geomean(iterable):
# Returns a geomean of a list of values.
a = np.array(iterable)
Expand Down Expand Up @@ -138,6 +150,7 @@ def generate_new_alerts_in_series(signature):
framework=signature.framework,
push_id=cur.push_id,
prev_push_id=prev.push_id,
sheriffed=not signature.monitor,
defaults={
"manually_created": False,
"created": datetime.utcfromtimestamp(cur.push_timestamp),
Expand All @@ -150,9 +163,10 @@ def generate_new_alerts_in_series(signature):
if t_value == float("inf"):
t_value = 1000

PerformanceAlert.objects.update_or_create(
alert, _ = PerformanceAlert.objects.update_or_create(
summary=summary,
series_signature=signature,
sheriffed=not signature.monitor,
defaults={
"noise_profile": noise_profile,
"is_regression": alert_properties.is_regression,
Expand All @@ -163,3 +177,6 @@ def generate_new_alerts_in_series(signature):
"t_value": t_value,
},
)

if signature.alert_notify_emails:
send_alert_emails(signature.alert_notify_emails.split(), alert, summary)
44 changes: 44 additions & 0 deletions treeherder/perf/email.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,3 +292,47 @@ def _write_content(self, must_mention: list[PerformanceSignature]):
content.include_signatures(must_mention)

self._email.content = str(content)


class AlertNotificationContent:
    """Builds the markdown body of an alert-notification email."""

    DESCRIPTION = "Perfherder has detected a performance change on the test **{test}** and produced an alert. See {alert_summary} for more information."
    ALERT_SUMMARY_LINK = "https://treeherder.mozilla.org/perfherder/alerts?id={alert_summary_id}"

    def __init__(self):
        # Stays None until include_alert_summary() fills in the body.
        self._raw_content = None

    def include_alert_summary(self, test: str, alert_summary_id: str):
        """Render the body for *test*, linking to its alert summary page."""
        summary_url = self.ALERT_SUMMARY_LINK.format(alert_summary_id=alert_summary_id)
        summary_markdown = f"[this alert summary]({summary_url})"
        self._raw_content = self.DESCRIPTION.format(
            test=test,
            alert_summary=summary_markdown,
        )

    def __str__(self):
        if self._raw_content is None:
            raise ValueError("No content set")
        return self._raw_content


class AlertNotificationWriter(EmailWriter):
    """Composes the notification email for a newly generated performance alert."""

    def prepare_new_email(
        self, email: str, alert: PerformanceAlert, alert_summary: PerformanceAlertSummary
    ) -> dict:
        """Fill in address, subject and body; return the assembled email payload."""
        test_name = alert.series_signature.test
        self._write_address(email)
        self._write_subject(test_name)
        self._write_content(test_name, str(alert_summary.id))
        return self.email

    def _write_address(self, email: str):
        # Single recipient per email.
        self._email.address = email

    def _write_subject(self, test: str):
        self._email.subject = f"Alert: Test {test} detected a performance change"

    def _write_content(self, test: str, alert_summary_id: str):
        notification_body = AlertNotificationContent()
        notification_body.include_alert_summary(test, alert_summary_id)
        self._email.content = str(notification_body)
Loading