1
- from treeherder . model . models import Group , GroupStatus , Job
1
+ import logging
2
2
3
+ from treeherder .model .models import Group , GroupStatus , Job , Push
4
+
5
+ logger = logging .getLogger (__name__ )
6
+
7
+ # TODO: test
8
+ # - p1:t1: fail on g1, p2:t1: pass on g1 - result: p1:t1: intermittent
9
+ # - p1:t1: fail on leak (all groups pass), p2:t1: pass - result p1:t1: still default
10
+ # - p1:t1: fail on g1, p1:t1.2: pass on g1 - result p1:t1: intermittent
11
+ # - p1:t1: fail on g1, p1:t1-cf: pass on g1 - result p1:t1: intermittent
12
+ # - p1:t1: fail on g1, p1:t1-cf: fail on g1 - result p1:t1: still default
3
13
4
14
# Repository id that the "if we are not on try" check compares against.
_TRY_REPOSITORY_ID = 4
# failure_classification_id written to jobs judged intermittent.
_INTERMITTENT_CLASSIFICATION_ID = 4
# not classified, intermittent, new_failure; TODO: consider 7 == autoclassified
_CONSIDERED_CLASSIFICATION_IDS = [1, 4, 6]
# How many pushes of history are consulted on non-try repositories.
_PUSH_HISTORY_LIMIT = 20
_CF_SUFFIX = "-cf"


def _base_job_type_name(name):
    """Return *name* without a trailing ``-cf`` and without a trailing ``-<int>`` part."""
    # str.strip("-cf") removes any of the characters '-', 'c', 'f' from BOTH
    # ends, which corrupts names; only drop the exact suffix.
    if name.endswith(_CF_SUFFIX):
        name = name[: -len(_CF_SUFFIX)]

    head, sep, tail = name.rpartition("-")
    if not sep:
        return name
    try:
        int(tail)
    except ValueError:
        # last dash-separated part is not a number: nothing more to remove
        return name
    return head


def check_and_mark_intermittent(job_id):
    """Mark earlier failed jobs as intermittent based on group results.

    Loads the job ``job_id`` and collects group results (OK / ERROR) for jobs
    whose job type name reduces to the same base name (see
    ``_base_job_type_name``). On the try repository only the job's own push is
    inspected; on any other repository up to ``_PUSH_HISTORY_LIMIT`` recent
    pushes of that repository are inspected as well.

    A group counts as "passing" when at least 50% of its collected results are
    OK. Every other collected job whose failing groups are all "passing" gets
    ``failure_classification_id`` set to ``_INTERMITTENT_CLASSIFICATION_ID``,
    unless its result is ``"success"`` or it already has that classification.
    The job ``job_id`` itself is never updated here.

    Args:
        job_id: primary key of the job that just produced new data.

    Returns:
        None.

    Raises:
        Job.DoesNotExist: if no job with ``job_id`` exists (from ``Job.objects.get``).
    """
    current_job = Job.objects.get(id=job_id)
    current_push_id = current_job.push.id
    repository_id = current_job.repository.id
    jtname = _base_job_type_name(current_job.job_type.name)

    ids = [current_push_id]
    # if we are not on try, look at recent history
    if repository_id != _TRY_REPOSITORY_ID:
        # Materialise an explicitly ordered list: without order_by the slice is
        # not guaranteed to be the most recent pushes, and a list avoids
        # re-running the query for len() and using a LIMITed IN-subquery.
        ids = list(
            Push.objects.filter(repository__id=repository_id)
            .order_by("-time")
            .values_list("id", flat=True)[:_PUSH_HISTORY_LIMIT]
        )
        if current_push_id not in ids:
            # the current push must always be part of the evaluated data
            ids.append(current_push_id)

    all_groups = (
        Group.objects.filter(
            job_logs__job__push__id__in=ids,
            job_logs__job__push__repository__id=repository_id,
            job_logs__job__job_type__name__startswith=jtname,
            job_logs__job__failure_classification__id__in=_CONSIDERED_CLASSIFICATION_IDS,
            # primarily ignore retry/usercancel
            job_logs__job__result__in=["success", "testfailed"],
            group_result__status__in=[GroupStatus.OK, GroupStatus.ERROR],
        )
        .values(
            "name",
            "job_logs__job__id",
            "group_result__status",
            "job_logs__job__job_type__name",
            "job_logs__job__push__id",
        )
        .order_by("-job_logs__job__push__time")
    )

    # push id -> {"groups": {group name: {job id: status}},
    #             "jobs":   {job id: {group name: status}}}
    mappings = {}
    for item in all_groups:
        if _base_job_type_name(item["job_logs__job__job_type__name"]) != jtname:
            # we have a variant (startswith matched a different job type)
            continue

        push_data = mappings.setdefault(
            item["job_logs__job__push__id"], {"groups": {}, "jobs": {}}
        )
        name = item["name"]
        job = item["job_logs__job__id"]
        status = item["group_result__status"]
        # the first status seen for a (group, job) pair wins
        push_data["groups"].setdefault(name, {}).setdefault(job, status)
        push_data["jobs"].setdefault(job, {}).setdefault(name, status)

    current_data = mappings.get(current_push_id)
    if current_data is None:
        # no group data reported for the current push: nothing to evaluate
        return

    # multi push support - want to look back in history now that we have "future" data
    # a previous job can only change if ALL failing groups have future passing data
    #
    # current job has new data, let's find all groups that pass often enough as a result
    # TODO: handle new regressions - historical rate might be broken, then we need to wait for more future data
    changed_groups = set()
    for group in current_data["groups"]:
        # never empty: the group has at least one result in the current push
        all_data = [
            status
            for push_data in mappings.values()
            for status in push_data["groups"].get(group, {}).values()
        ]
        pass_rate = sum(1 for s in all_data if s == GroupStatus.OK) / len(all_data)
        if pass_rate >= 0.5:
            changed_groups.add(group)

    # all changed_groups need to be evaluated on previous 'failed' jobs to ensure
    # all groups in that task are 'passing'
    multi_push = len(ids) > 1
    for push_id, push_data in mappings.items():
        if multi_push and push_id == current_push_id:
            continue

        for job, job_groups in push_data["jobs"].items():
            if job == current_job.id:
                # current job will need future data to turn green
                continue

            all_green = all(
                status != GroupStatus.ERROR or group in changed_groups
                for group, status in job_groups.items()
            )
            if not all_green:
                continue

            target_jobs = Job.objects.filter(id=job)
            target = target_jobs.first()
            if target is None:
                continue

            # edge case is all groups originally pass and then shutdown leaks cause 'testfailed'.
            # also we ignore infra/leaks that don't report group failures in errorsummary files
            if (
                target.result != "success"
                and target.failure_classification_id != _INTERMITTENT_CLASSIFICATION_ID
            ):
                logger.info("JMAHER: updating classification to 4 for job id: %s", job)
                target_jobs.update(failure_classification_id=_INTERMITTENT_CLASSIFICATION_ID)
121
+
0 commit comments