1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
content / test / gpu / bad_machine_finder / detection.py [blame]
# Copyright 2024 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Code for detecting bad machines from Swarming task data."""
import collections
import decimal
import functools
import logging
import math
import statistics
from typing import Generator, Iterable, Optional, Set, Tuple
from bad_machine_finder import tasks
class BadMachineList:
"""Stores bad Swarming bots and the reasons for why they were deemed bad."""
def __init__(self):
self.bad_machines = collections.defaultdict(list)
def AddBadMachine(self, bot_id: str, reason: str) -> None:
"""Adds a bad machine to the list with the given reason.
Args:
bot_id: The bad machine name
reason: The reason why the machine was deemed bad
"""
self.bad_machines[bot_id].append(reason)
def Merge(self, other: 'BadMachineList') -> None:
"""Merges another BadMachineList into this one.
Args:
other: The BadMachineList to get data from.
"""
for bot_id, reasons in other.bad_machines.items():
self.bad_machines[bot_id].extend(reasons)
def RemoveLowConfidenceMachines(self, num_detections: int) -> None:
"""Removes machines that weren't flagged at least |num_detections| times.
Args:
num_detections: The minimum number times a machine needs to be flagged as
bad in order to be kept.
"""
trimmed_machines = collections.defaultdict(list)
for bot_id, reasons in self.bad_machines.items():
if len(reasons) >= num_detections:
trimmed_machines[bot_id] = reasons
else:
logging.debug(
'Bot %s removed because it was only flagged by %d detection '
'method(s)', bot_id, len(reasons))
self.bad_machines = trimmed_machines
def IterMarkdown(self) -> Generator[Tuple[str, str], None, None]:
"""Iterates over contents, producing Markdown for each machine.
Yields:
A tuple (bot_id, markdown). |bot_id| is the ID of the Swarming bot,
|markdown| is a Markdown string describing why the bot is bad.
"""
# Keep a consistent order.
bot_ids = sorted(list(self.bad_machines.keys()))
for b in bot_ids:
markdown_components = []
markdown_components.append(f' * {b}')
reasons = self.bad_machines[b]
for r in reasons:
markdown_components.append(f' * {r}')
yield b, '\n'.join(markdown_components)
class MixinGroupedBadMachines:
"""Stores BadMachineLists for multiple mixins."""
def __init__(self):
self._bad_machines_by_mixin = {}
def AddMixinData(self, mixin_name: str,
bad_machine_list: 'BadMachineList') -> None:
"""Adds a BadMachineList for |mixin_name|.
Args:
mixin_name: The name of the mixin
bad_machine_list: The BadMachineList for the mixin
"""
if mixin_name in self._bad_machines_by_mixin:
raise ValueError(
f'Bad machines for mixin {mixin_name} were already added')
self._bad_machines_by_mixin[mixin_name] = bad_machine_list
def GetAllBadMachineNames(self) -> Set[str]:
"""Gets all unique bad machine names across all stored mixins.
Returns:
A set containing all bad machine names.
"""
bad_machine_names = set()
for _, bad_machine_list in self._bad_machines_by_mixin.items():
for bot_id in bad_machine_list.bad_machines.keys():
bad_machine_names.add(bot_id)
return bad_machine_names
def GenerateMarkdown(self,
bots_to_skip: Optional[Iterable[str]] = None) -> str:
"""Generates a Markdown string describing the object's contents.
Args:
bots_to_skip: An optional iterable of bot names to not include in the
Markdown content. If not provided, all bots will be included.
Returns:
A Markdown string describing each bad machine for each mixin.
"""
bots_to_skip = bots_to_skip or []
markdown_components = []
# Guarantee a consistent ordering.
for mixin_name in sorted(self._bad_machines_by_mixin.keys()):
bad_machine_list = self._bad_machines_by_mixin[mixin_name]
mixin_report_components = [
f'Bad machines for {mixin_name}',
]
for bot_id, markdown in bad_machine_list.IterMarkdown():
if bot_id in bots_to_skip:
continue
mixin_report_components.append(markdown)
if len(mixin_report_components) > 1:
markdown_components.append('\n'.join(mixin_report_components))
return '\n\n'.join(markdown_components)
def DetectViaStdDevOutlier(mixin_stats: tasks.MixinStats,
stddev_multiplier: float) -> 'BadMachineList':
"""Detects bad machines by looking for machines whose task failure rate is
more than the given number of standard deviations from the fleet-wide mean.
Args:
mixin_stats: A tasks.MixinStats containing the task stats for the hardware
fleet being checked.
stddev_multiplier: A multiplier to apply to the standard deviation. If a
machine's failure rate is greater than (fleet-wide mean) + (stddev *
stddev_multiplier), it is considered bad.
Returns:
A BadMachineList containing any bad machines found.
"""
if mixin_stats.total_tasks <= 0:
raise ValueError('mixin_stats needs to contain data')
if stddev_multiplier < 0:
raise ValueError('stddev_multiplier must be non-negative')
bad_machines = BadMachineList()
failure_rates = mixin_stats.GetOverallFailureRates()
mean = statistics.mean(failure_rates)
stddev = statistics.pstdev(failure_rates)
threshold = mean + stddev_multiplier * stddev
for bot_id, bot_stats in mixin_stats.IterBots():
bot_failure_rate = bot_stats.overall_failure_rate
if bot_failure_rate <= threshold:
continue
reason = (f'Had a failure rate of {bot_failure_rate} despite a fleet-wide '
f'average of {mean} and a standard deviation of {stddev}.')
bad_machines.AddBadMachine(bot_id, reason)
return bad_machines
def DetectViaRandomChance(mixin_stats: tasks.MixinStats,
probability_threshold: float) -> 'BadMachineList':
"""Detects bad machines by looking for cases where it is very unlikely that
a machine got as many failed tasks as it did through random chance. If it is
unlikely that the failures happened due to random chance, then that means that
the machine is bad and contributing to failures.
Args:
mixin_stats: A tasks.MixinStats containing the task stats for the hardware
fleet being checked.
probability_threshold: How unlikely it can be that a machine got its
failures randomly and still be considered good.
Returns:
A BadMachineList containing any bad machines found.
"""
if mixin_stats.total_tasks <= 0:
raise ValueError('mixin_stats needs to contain data')
if probability_threshold <= 0:
raise ValueError('probability_threshold must be positive')
if probability_threshold > 1:
raise ValueError('probability_threshold must be <= 1')
bad_machines = BadMachineList()
average_failure_rate = (decimal.Decimal(mixin_stats.failed_tasks) /
decimal.Decimal(mixin_stats.total_tasks))
for bot_id, bot_stats in mixin_stats.IterBots():
p = _ChanceOfNOrMoreIndependentEvents(average_failure_rate,
bot_stats.total_tasks,
bot_stats.failed_tasks)
if p >= probability_threshold:
continue
reason = (f'{bot_stats.failed_tasks} of {bot_stats.total_tasks} tasks '
f'failed despite a fleet-wide average failed task rate of '
f'{average_failure_rate}. The probability of this happening '
f'randomly is {p}.')
bad_machines.AddBadMachine(bot_id, reason)
return bad_machines
def DetectViaInterquartileRange(mixin_stats: tasks.MixinStats, mixin_name: str,
iqr_multiplier: float) -> 'BadMachineList':
"""Detects bad machines by looking for for bots whose failure rate is above
Q3 + |iqr_multiplier| * IQR, which is a standard way of looking for outliers
in data.
See https://en.wikipedia.org/wiki/Interquartile_range#Outliers.
Args:
mixin_stats: A tasks.MixinStats containing the task stats for the hardware
fleet being checked.
mixin_name: The name of the mixin being checked. For debugging purposes.
iqr_multiplier: How many multiples of the IQR above the third quartile a
failure rate has to be to be considered an outlier.
Returns:
A BadMachineList containing any bad machines found.
"""
if mixin_stats.total_tasks <= 0:
raise ValueError('mixin_stats needs to contain data')
if iqr_multiplier <= 0:
raise ValueError('iqr_multiplier must be positive')
bad_machines = BadMachineList()
failure_rates = mixin_stats.GetOverallFailureRates()
if len(failure_rates) <= 4:
logging.info(
'Quartiles require at least 5 samples to be meaningful. Mixin %s only '
'provided %d samples.', mixin_name, len(failure_rates))
return bad_machines
quartiles = statistics.quantiles(failure_rates, n=4, method='inclusive')
iqr = quartiles[2] - quartiles[0]
if iqr == 0:
logging.info(
'Mixin %s resulted in an IQR of 0, which is not useful for detecting '
'outliers.', mixin_name)
return bad_machines
upper_bound = quartiles[2] + iqr_multiplier * iqr
for bot_id, bot_stats in mixin_stats.IterBots():
bot_failure_rate = bot_stats.overall_failure_rate
if bot_failure_rate <= upper_bound:
continue
reason = (f'Failure rate of {bot_failure_rate} is above the IQR-based '
f'upper bound of {upper_bound}.')
bad_machines.AddBadMachine(bot_id, reason)
return bad_machines
@functools.lru_cache(maxsize=None)
def _ChanceOfNOrMoreIndependentEvents(event_probability: decimal.Decimal,
total_events: int, n: int) -> float:
"""Calculates the probability of getting |n| or more cases of independent
events.
Args:
event_probability: The probability of the event happening.
total_events: The number of times we check whether the event happened or
not.
n: The minimum number of times the event happened.
"""
cumulative_probability = decimal.Decimal(0)
for current_n in range(n, total_events + 1):
cumulative_probability += _ChanceOfExactlyNIndependentEvents(
event_probability, total_events, current_n)
return float(cumulative_probability)
@functools.lru_cache(maxsize=None)
def _ChanceOfExactlyNIndependentEvents(event_probability: decimal.Decimal,
total_events: int,
n: int) -> decimal.Decimal:
"""Calculates the probability of getting exactly |n| cases of independent
events.
Args:
event_probability: The probability of the event happening.
total_events: The number of times we check whether the event happened or
not.
n: The number of times the event happened.
Returns:
A decimal.Decimal object containing the chance that the event happened
exactly |n| times.
"""
# We use decimal.Decimal instead of float since math.comb() can produce
# numbers that are too large to store in a float.
combinations = decimal.Decimal(math.comb(total_events, n))
chance_of_one_permutation = (event_probability**n *
(1 - event_probability)**(total_events - n))
return combinations * chance_of_one_permutation