Skip to content

Commit 50992dd

Browse files
committed
Bug 1942860 - Add prototype for telemetry alerting to perf_sheriff task.
This patch adds the ability to produce telemetry alerts. It runs through the perf_sheriff task in newrelic. This is a prototype, so all of it is hidden behind a catch-all try/except so it doesn't impact any existing capabilities (this includes the imports). The alerts are detected through mozdetect. They are then added into the PerformanceTelemetryAlert and PerformanceTelemetryAlertSummary models, and associated with specific mozilla-central pushes.
1 parent ba78a6a commit 50992dd

File tree

8 files changed

+643
-29
lines changed

8 files changed

+643
-29
lines changed

docker-compose.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ services:
3131
- BUGZILLA_API_URL=${BUGZILLA_API_URL:-}
3232
- BUG_FILER_API_KEY=${BUG_FILER_API_KEY:-}
3333
- TLS_CERT_PATH=${TLS_CERT_PATH:-}
34+
- GOOGLE_APPLICATION_CREDENTIALS=${GOOGLE_APPLICATION_CREDENTIALS:+/bq-credentials/credentials.json}
3435
entrypoint: './docker/entrypoint.sh'
3536
# We *ONLY* initialize the data when we're running the backend
3637
command: './initialize_data.sh ./manage.py runserver 0.0.0.0:8000'
@@ -40,6 +41,8 @@ services:
4041
shm_size: 2g # 2 Gig seems like a good size
4142
volumes:
4243
- .:/app
44+
- ${GCLOUD_DIR:-.}:/home/.config/gcloud
45+
- ${GOOGLE_APPLICATION_CREDENTIALS:-.}:/bq-credentials/credentials.json
4346
ports:
4447
- '8000:8000'
4548
depends_on:

docker/entrypoint.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@ function check_service () {
1616
echo "-----> $name service is available"
1717
}
1818

19+
# Google's client libraries treat an empty GOOGLE_APPLICATION_CREDENTIALS as a
# (broken) credentials path, so drop the variable when it is unset or empty and
# let the libraries fall back to the mounted gcloud configuration instead.
# ${VAR:+x} expands to "x" only when VAR is set AND non-empty. The previous
# ${VAR+} form expanded to "" in every case, so the guard was always true and
# the variable was unconditionally unset, even when real credentials were set.
if [ -z "${GOOGLE_APPLICATION_CREDENTIALS:+x}" ]; then
    unset GOOGLE_APPLICATION_CREDENTIALS
fi
22+
1923
# Keep these in sync with DATABASE_URL.
2024
echo "Checking database status at $DATABASE_URL"
2125
if [[ ${DATABASE_URL:0:27} == *"@host.docker.internal"* ]]; then

requirements/common.in

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,3 +49,6 @@ statsd==4.0.1
4949

5050
#installed for OpenAPI schema support
5151
inflection==0.5.1
52+
53+
# Change detection tooling
54+
mozdetect==0.0.8

requirements/common.txt

Lines changed: 353 additions & 26 deletions
Large diffs are not rendered by default.

treeherder/perf/auto_perf_sheriffing/sherlock.py

Lines changed: 266 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,16 @@
1+
import importlib.util
12
import logging
2-
from datetime import datetime, timedelta
3-
from json import JSONDecodeError
3+
import os
4+
import requests
5+
import shutil
6+
import tempfile
7+
import traceback
8+
import yaml
9+
from datetime import datetime, timedelta, timezone
10+
from json import loads, JSONDecodeError
411
from logging import INFO, WARNING
12+
from time import strptime
13+
from pathlib import Path
514

615
from django.conf import settings
716
from django.db.models import QuerySet
@@ -17,13 +26,33 @@
1726
BackfillNotificationRecord,
1827
BackfillRecord,
1928
BackfillReport,
29+
PerformanceFramework,
30+
PerformanceTelemetrySignature,
31+
PerformanceTelemetryAlert,
32+
PerformanceTelemetryAlertSummary,
33+
Push,
34+
Repository,
2035
)
2136

2237
logger = logging.getLogger(__name__)
2338

2439
CLIENT_ID = settings.PERF_SHERIFF_BOT_CLIENT_ID
2540
ACCESS_TOKEN = settings.PERF_SHERIFF_BOT_ACCESS_TOKEN
2641

42+
BUILDID_MAPPING = "https://hg.mozilla.org/mozilla-central/json-firefoxreleases"
43+
REVISION_INFO = "https://hg.mozilla.org/mozilla-central/json-log/%s"
44+
45+
INITIAL_PROBES = (
46+
"memory_ghost_windows",
47+
"cycle_collector_time",
48+
"mouseup_followed_by_click_present_latency",
49+
"network_tcp_connection",
50+
"network_tls_handshake",
51+
"networking_http_channel_page_open_to_first_sent",
52+
"performance_pageload_fcp",
53+
"perf_largest_contentful_paint",
54+
)
55+
2756

2857
class Sherlock:
2958
"""
@@ -49,6 +78,7 @@ def __init__(
4978

5079
self.supported_platforms = supported_platforms or settings.SUPPORTED_PLATFORMS
5180
self._wake_up_time = datetime.now()
81+
self._buildid_mappings = {}
5282

5383
def sheriff(self, since: datetime, frameworks: list[str], repositories: list[str]):
5484
logger.info("Sherlock: Validating settings...")
@@ -215,3 +245,237 @@ def __get_data_points_to_backfill(context: list[dict]) -> list[dict]:
215245
start = 1
216246

217247
return context[start:]
248+
249+
def telemetry_alert(self):
    """Detect telemetry (Glean probe) changes and file alerts for them.

    Prototype entry point: fetches probe definitions from the telemetry
    dictionaries, runs a mozdetect change-detection technique over each
    probe's metric table per platform, and records any detections as
    ``PerformanceTelemetryAlert``/``PerformanceTelemetryAlertSummary``
    rows tied to mozilla-central pushes.  Per-probe/platform failures
    are swallowed and logged so one bad probe cannot abort the rest.

    Raises:
        Exception: when no GOOGLE_APPLICATION_CREDENTIALS are available
            outside of local testing (SITE_HOSTNAME == "backend").
    """
    # Imported lazily so environments without mozdetect installed can
    # still run the rest of the sheriffing task.
    import mozdetect
    from mozdetect.telemetry_query import get_metric_table

    # BigQuery access needs explicit credentials in production; local
    # testing may instead rely on a mounted gcloud configuration.
    if (
        not os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
        and settings.SITE_HOSTNAME != "backend"
    ):
        raise Exception(
            "GOOGLE_APPLICATION_CREDENTIALS must be defined in production. "
            "Use GCLOUD_DIR for local testing."
        )

    ts_detectors = mozdetect.get_timeseries_detectors()

    metric_definitions = self._get_metric_definitions()

    repository = Repository.objects.get(name="mozilla-central")
    framework = PerformanceFramework.objects.get(name="telemetry")
    for metric_info in metric_definitions:
        # Prototype: restrict detection to a hand-picked set of probes.
        if metric_info["name"] not in INITIAL_PROBES:
            continue
        logger.info(f"Running detection for {metric_info['name']}")
        # A probe may select its technique via its "monitor" definition;
        # otherwise fall back to the cdf_squared detector.
        cdf_ts_detector = ts_detectors[
            metric_info["data"]
            .get("monitor", {})
            .get("change-detection-technique", "cdf_squared")
        ]

        for platform in ("Windows", "Darwin", "Linux"):
            # NOTE(review): "Mobile" is never in the platform tuple, so
            # mobile-only probes are always skipped and the android=
            # branch below is unreachable — presumably scaffolding until
            # mobile support is enabled; confirm this is intentional.
            if metric_info["platform"] == "mobile" and platform != "Mobile":
                continue
            elif metric_info["platform"] == "desktop" and platform == "Mobile":
                continue
            logger.info(f"On Platform {platform}")
            try:
                data = get_metric_table(
                    metric_info["name"],
                    platform,
                    android=(platform == "Mobile"),
                    use_fog=True,
                )
                if data.empty:
                    logger.info("No data found")
                    continue

                timeseries = mozdetect.TelemetryTimeSeries(data)

                ts_detector = cdf_ts_detector(timeseries)
                detections = ts_detector.detect_changes()

                for detection in detections:
                    # Only get buildids if there might be a detection
                    if not self._buildid_mappings:
                        self._make_buildid_to_date_mapping()
                    self._create_detection_alert(
                        detection, metric_info, platform, repository, framework
                    )
            except Exception:
                # Catch-all by design (prototype): log and move on so a
                # failure for one probe/platform can't break the task.
                logger.info(f"Failed: {traceback.format_exc()}")
def _create_detection_alert(
    self,
    # "Detection" is quoted because mozdetect is imported lazily inside
    # telemetry_alert(); an unquoted annotation would be evaluated at
    # class-definition time and raise NameError on module import.
    detection: "Detection",
    probe_info: dict,
    platform: str,
    repository: Repository,
    framework: PerformanceFramework,
):
    """Persist one mozdetect detection as a telemetry alert.

    Maps the detection date onto the surrounding nightly builds/pushes,
    reuses an existing alert summary within +/- 1 day of the detection
    push (or creates one), then upserts the alert itself with the
    detection statistics.

    Args:
        detection: mozdetect detection result (location, values, etc.).
        probe_info: metric definition dict from _get_metric_definitions.
        platform: one of "Windows"/"Darwin"/"Linux" (telemetry OS name).
        repository: the repository the pushes belong to.
        framework: the "telemetry" performance framework.
    """
    # Get, or create the signature
    # TODO: Allow multiple channels, legacy probes, and different apps
    probe_signature, _ = PerformanceTelemetrySignature.objects.update_or_create(
        channel="Nightly",
        platform=platform,
        probe=probe_info["name"],
        probe_type="Glean",
        application="Firefox",
    )

    detection_date = str(detection.location)
    if detection_date not in self._buildid_mappings[platform]:
        # TODO: See if we should expand the range in this situation
        # NOTE(review): _find_closest_build_date can return None when all
        # known builds are newer than the detection; the KeyError below is
        # then swallowed by telemetry_alert's catch-all.
        detection_date = self._find_closest_build_date(detection_date, platform)

    detection_build = self._buildid_mappings[platform][detection_date]
    prev_build = self._buildid_mappings[platform][detection_build["prev_build"]]
    next_build = self._buildid_mappings[platform][detection_build["next_build"]]

    # Get the pushes for these builds
    detection_push = Push.objects.get(
        revision=detection_build["node"], repository__name=repository.name
    )
    prev_push = Push.objects.get(revision=prev_build["node"], repository__name=repository.name)
    next_push = Push.objects.get(revision=next_build["node"], repository__name=repository.name)

    # Check that an alert summary doesn't already exist around this point (+/- 1 day)
    latest_timestamp = next_push.time + timedelta(days=1)
    oldest_timestamp = next_push.time - timedelta(days=1)
    try:
        detection_summary = PerformanceTelemetryAlertSummary.objects.filter(
            repository=repository,
            framework=framework,
            push__time__gte=oldest_timestamp,
            push__time__lte=latest_timestamp,
        ).latest("push__time")
    except PerformanceTelemetryAlertSummary.DoesNotExist:
        detection_summary = None

    if not detection_summary:
        # Create an alert summary to capture all alerts
        # that occurred on the same date range
        detection_summary, _ = PerformanceTelemetryAlertSummary.objects.get_or_create(
            repository=repository,
            framework=framework,
            prev_push=prev_push,
            push=next_push,
            original_push=detection_push,
            defaults={
                "manually_created": False,
                "created": datetime.now(timezone.utc),
            },
        )

    # Upsert the alert so re-running detection does not duplicate it.
    detection_alert, _ = PerformanceTelemetryAlert.objects.update_or_create(
        summary_id=detection_summary.id,
        series_signature=probe_signature,
        defaults={
            "is_regression": True,
            "amount_pct": round(
                (100.0 * abs(detection.new_value - detection.previous_value))
                / float(detection.previous_value),
                2,
            ),
            "amount_abs": abs(detection.new_value - detection.previous_value),
            "sustained": True,
            "direction": detection.direction,
            "confidence": detection.confidence,
            "prev_value": detection.previous_value,
            "new_value": detection.new_value,
            "prev_median": detection.optional_detection_info["Interpolated Median"][0],
            "new_median": detection.optional_detection_info["Interpolated Median"][1],
            # NOTE(review): the p90 fields are filled from "Interpolated
            # p05" — confirm whether this is intentional or a copy/paste
            # slip before relying on these columns.
            "prev_p90": detection.optional_detection_info["Interpolated p05"][0],
            "new_p90": detection.optional_detection_info["Interpolated p05"][1],
            "prev_p95": detection.optional_detection_info["Interpolated p95"][0],
            "new_p95": detection.optional_detection_info["Interpolated p95"][1],
        },
    )
def _get_metric_definitions(self) -> list[dict]:
    """Fetch Glean metric definitions from the telemetry dictionaries.

    Pulls the desktop (firefox_desktop) and mobile (fenix) probe indexes
    and flattens them into a list of dicts with keys "name" (dots
    replaced by underscores), "data" (the raw metric definition) and
    "platform" ("desktop" or "mobile").  A fetch or JSON-parse failure
    for one source is logged and that source is skipped.
    """
    sources = (
        ("https://dictionary.telemetry.mozilla.org/data/firefox_desktop/index.json", "desktop"),
        ("https://dictionary.telemetry.mozilla.org/data/fenix/index.json", "mobile"),
    )

    definitions = []

    for url, source_platform in sources:
        try:
            logger.info(f"Getting probes from {url}")
            resp = requests.get(url)
            resp.raise_for_status()

            probes = resp.json().get("metrics", [])
            for probe in probes:
                definitions.append(
                    {
                        "name": probe["name"].replace(".", "_"),
                        "data": probe,
                        "platform": source_platform,
                    }
                )

            logger.info(f"Found {len(probes)} probes")
        except requests.RequestException as e:
            logger.info(f"Failed to fetch from {url}: {e}")
        except ValueError:
            logger.info(f"Invalid JSON from {url}")

    return definitions
def _make_buildid_to_date_mapping(self):
431+
# Always returned in order of newest to oldest, only capture
432+
# the newest build for each day, and ignore others. This can
433+
# differ between platforms too (e.g. failed builds)
434+
buildid_mappings = self._get_buildid_mappings()
435+
436+
prev_date = {}
437+
for build in buildid_mappings["builds"]:
438+
platform = self._replace_platform_build_name(build["platform"])
439+
if not platform:
440+
continue
441+
curr_date = str(datetime.strptime(build["buildid"][:8], "%Y%m%d").date())
442+
443+
platform_builds = self._buildid_mappings.setdefault(platform, {})
444+
if curr_date not in platform_builds:
445+
platform_builds[curr_date] = build
446+
447+
if prev_date.get(platform):
448+
platform_builds[prev_date[platform]]["prev_build"] = curr_date
449+
platform_builds[curr_date]["next_build"] = prev_date[platform]
450+
else:
451+
platform_builds[curr_date]["next_build"] = curr_date
452+
453+
prev_date[platform] = curr_date
454+
455+
def _get_buildid_mappings(self) -> dict:
    """Download the nightly buildid/revision listing from hg.mozilla.org.

    Returns the parsed JSON payload ({"builds": [...]}).

    Raises:
        Exception: when the download fails — detections cannot be mapped
            to pushes without this data, so the failure is fatal.
    """
    try:
        # Bound the request so a hung connection can't stall the task;
        # a Timeout is a RequestException and is re-raised below.
        response = requests.get(BUILDID_MAPPING, timeout=60)
        response.raise_for_status()
        return loads(response.content)
    except requests.RequestException as e:
        # Chain the original error to keep the root cause in tracebacks.
        raise Exception(
            f"Failed to download buildid mappings, cannot produce detections: {e}"
        ) from e
def _replace_platform_build_name(self, platform: str) -> str:
464+
if platform == "win64":
465+
return "Windows"
466+
if platform == "linux64":
467+
return "Linux"
468+
if platform == "mac":
469+
return "Darwin"
470+
return ""
471+
472+
def _find_closest_build_date(self, detection_date: str, platform: str) -> str:
473+
# Get the closest date to the detection date
474+
prev_date = None
475+
476+
for date in sorted(list(self._buildid_mappings[platform].keys())):
477+
if date > detection_date:
478+
break
479+
prev_date = date
480+
481+
return prev_date

treeherder/perf/fixtures/performance_framework.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,5 +94,13 @@
9494
"name": "fxrecord",
9595
"enabled": true
9696
}
97+
},
98+
{
99+
"pk": 17,
100+
"model": "perf.PerformanceFramework",
101+
"fields": {
102+
"name": "telemetry",
103+
"enabled": true
104+
}
97105
}
98106
]

treeherder/perf/management/commands/perf_sheriff.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import logging
2+
import traceback
23
from datetime import datetime, timedelta
34

45
from django.core.management.base import BaseCommand
@@ -59,6 +60,10 @@ def handle(self, *args, **options):
5960
sherlock = sherlock_factory(days_to_lookup)
6061
try:
6162
sherlock.sheriff(since, frameworks, repositories)
63+
try:
64+
sherlock.telemetry_alert()
65+
except Exception as e:
66+
logging.warning("Failed to run telemetry alerting\n" + traceback.format_exc())
6267
except MaxRuntimeExceededError as ex:
6368
logging.info(ex)
6469

treeherder/perf/models.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -497,7 +497,7 @@ class PerformanceTelemetryAlertSummary(PerformanceAlertSummaryBase):
497497
)
498498

499499
def autodetermine_status(self, alert_model=None):
    # Telemetry summaries must compute their status from
    # PerformanceTelemetryAlert rows (not the base PerformanceAlert model),
    # so the alert_model parameter from the base signature is deliberately
    # overridden here.
    return super().autodetermine_status(alert_model=PerformanceTelemetryAlert)
501501

502502
class Meta:
503503
db_table = "performance_telemetry_alert_summary"

0 commit comments

Comments
 (0)