Commit 68504de
committed
[WIP] OSD-28525 - Initial implementation for MachineHealthCheckUnterminatedShortCircuitSRE alert
1 parent c6ad107 commit 68504de

6 files changed: +399 -4 lines changed
@@ -0,0 +1,262 @@
/*
machinehealthcheckunterminatedshortcircuitsre defines the investigation logic for the MachineHealthCheckUnterminatedShortCircuitSRE alert
*/
package machinehealthcheckunterminatedshortcircuitsre

import (
    "context"
    "fmt"
    "strings"
    "time"

    "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
    k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s"
    "github.com/openshift/configuration-anomaly-detection/pkg/logging"
    "github.com/openshift/configuration-anomaly-detection/pkg/notewriter"

    corev1 "k8s.io/api/core/v1"
    "sigs.k8s.io/controller-runtime/pkg/client"

    machinev1beta1 "github.com/openshift/api/machine/v1beta1"
)
const (
    alertname = "MachineHealthCheckUnterminatedShortCircuitSRE"
    // remediationName must match the name of this investigation's directory, so it can be looked up via the backplane-api
    remediationName = "machineHealthCheckUnterminatedShortCircuitSRE"

    machineNamespace  = "openshift-machine-api"
    machineRoleLabel  = "machine.openshift.io/cluster-api-machine-role"
    machineRoleWorker = "worker"
)

type Investigation struct {
    kclient client.Client
    notes   *notewriter.NoteWriter
}

// setup initializes the backplane-based kubernetes client and the investigation's notewriter
func (i *Investigation) setup(r *investigation.Resources) error {
    k, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, remediationName)
    if err != nil {
        return fmt.Errorf("failed to initialize kubernetes client: %w", err)
    }
    i.kclient = k
    i.notes = notewriter.New(r.Name, logging.RawLogger)

    return nil
}

// cleanup tears down the backplane remediation created for this investigation
func (i *Investigation) cleanup(r *investigation.Resources) error {
    return k8sclient.Cleanup(r.Cluster.ID(), r.OcmClient, remediationName)
}

// Run investigates the MachineHealthCheckUnterminatedShortCircuitSRE alert
func (i *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
    result := investigation.InvestigationResult{}

    // Setup & teardown
    err := i.setup(r)
    if err != nil {
        return result, fmt.Errorf("failed to setup investigation: %w", err)
    }
    defer func(r *investigation.Resources) {
        err := i.cleanup(r)
        if err != nil {
            logging.Errorf("failed to cleanup investigation: %v", err)
        }
    }(r)

    // Execute investigation
    recommendations := []summary{}

    // Examine machines - in addition to broken nodes, machines in the 'Failing' phase are counted against a machinehealthcheck's maxUnhealthy count:
    // https://github.com/openshift/machine-api-operator/blob/e4bd10f78bada4cc8b36236e9b0b1c1332e5ef88/pkg/controller/machinehealthcheck/machinehealthcheck_controller.go#L764
    failedMachines, err := i.getFailingMachines()
    if err != nil {
        logging.Errorf("failed to retrieve machines: %v", err)
        i.notes.AppendWarning("failed to retrieve machines: %v", err)
    }
    for _, machine := range failedMachines {
        // Confirm only worker machines are failing - if Red Hat-managed machines are affected, forward to Primary
        role, err := i.getMachineRole(machine)
        if err != nil {
            // Failing to determine whether a machine is Red Hat-managed warrants human investigation
            logging.Errorf("failed to determine machine role: %v", err)
            i.notes.AppendWarning("failed to determine machine role: %v\nEscalating to Primary", err)
            i.notes.AppendWarning("Primary: one or more machines was detected as missing the %q label, which can impact machine-api functionality. Please investigate the issue and take any appropriate action to address this.", machineRoleLabel)
            return result, r.PdClient.EscalateIncidentWithNote(i.notes.String())
        }
        if role != machineRoleWorker {
            logging.Errorf("found non-worker machine in %q state; escalating incident to Primary", *machine.Status.Phase)
            i.notes.AppendWarning("found non-worker machine in %q state; escalating incident to Primary", *machine.Status.Phase)
            i.notes.AppendWarning("Primary: one or more Red Hat-managed machines was detected to have a .Status.Phase of %q, which can impact SLOs. Please investigate the issue and take any appropriate action to address this.", *machine.Status.Phase)
            return result, r.PdClient.EscalateIncidentWithNote(i.notes.String())
        }

        recommendation, err := i.machineRecommendation(machine)
        if err != nil {
            logging.Errorf("failed to make recommendation for machine %q: %v", machine.Name, err)
            i.notes.AppendWarning("failed to make recommendation for machine %q: %v", machine.Name, err)
        } else {
            recommendations = append(recommendations, recommendation)
        }
    }

    // Examine nodes
    notReadyNodes, err := i.getNotReadyNodes()
    if err != nil {
        logging.Errorf("failed to retrieve nodes: %v", err)
        i.notes.AppendWarning("failed to retrieve nodes: %v", err)
    }
    for _, node := range notReadyNodes {
        if i.nodeMachineRemediated(node, failedMachines) {
            // Don't bother double checking nodes whose machine we've already investigated
            continue
        }

        recommendation, err := i.nodeRecommendation(node)
        if err != nil {
            logging.Errorf("failed to make recommendation for node %q: %v", node.Name, err)
            i.notes.AppendWarning("failed to make recommendation for node %q: %v", node.Name, err)
        } else {
            recommendations = append(recommendations, recommendation)
        }
    }

    recommendationMsg := "the following action(s) are recommended:"
    for _, recommendation := range recommendations {
        recommendationMsg = fmt.Sprintf("%s\n - %s", recommendationMsg, recommendation)
    }
    i.notes.AppendWarning(recommendationMsg)
    return result, r.PdClient.EscalateIncidentWithNote(i.notes.String())
}

// nodeMachineRemediated indicates whether the given node is owned by one of the machines already flagged for remediation
func (i *Investigation) nodeMachineRemediated(node corev1.Node, remediatedMachines []machinev1beta1.Machine) bool {
    for _, machine := range remediatedMachines {
        if machine.Status.NodeRef != nil && machine.Status.NodeRef.Name == node.Name {
            return true
        }
    }
    return false
}
// getMachineRole returns the value of the machine's role label, if present
func (i *Investigation) getMachineRole(machine machinev1beta1.Machine) (string, error) {
    role, found := machine.Labels[machineRoleLabel]
    if !found {
        return "", fmt.Errorf("expected label %q not found", machineRoleLabel)
    }
    return role, nil
}

// getFailingMachines returns the machines that are in the 'Failed' phase or have an ErrorReason set
func (i *Investigation) getFailingMachines() ([]machinev1beta1.Machine, error) {
    machines := &machinev1beta1.MachineList{}
    listOptions := &client.ListOptions{Namespace: machineNamespace}
    err := i.kclient.List(context.TODO(), machines, listOptions)
    if err != nil {
        return []machinev1beta1.Machine{}, fmt.Errorf("failed to retrieve machines from cluster: %w", err)
    }

    failed := []machinev1beta1.Machine{}
    for _, machine := range machines.Items {
        if (machine.Status.Phase != nil && *machine.Status.Phase == machinev1beta1.PhaseFailed) || machine.Status.ErrorReason != nil {
            failed = append(failed, machine)
        }
    }
    return failed, nil
}

// summary provides a simple structure to pair each problem found with a recommended solution
type summary struct {
    issue          string
    recommendation string
}

func (s summary) String() string {
    return fmt.Sprintf("issue: %s\nrecommendation: %s\n", s.issue, s.recommendation)
}

// machineRecommendation determines the recommended course of action for a machine
func (i *Investigation) machineRecommendation(machine machinev1beta1.Machine) (summary, error) {
    summary := summary{}
    if machine.Status.ErrorReason == nil {
        summary.issue = "no .Status.ErrorReason found for machine"
        summary.recommendation = fmt.Sprintf("manual investigation for machine %s required", machine.Name)
        return summary, nil
    }
    switch *machine.Status.ErrorReason {
    case machinev1beta1.IPAddressInvalidReason:
        summary.issue = fmt.Sprintf("invalid IP address: %q", *machine.Status.ErrorMessage)
        summary.recommendation = fmt.Sprintf("deleting the machine may allow the cloud provider to assign a valid IP address:\n\n  oc delete machine -n %s %s", machine.Namespace, machine.Name)
    case machinev1beta1.CreateMachineError:
        summary.issue = fmt.Sprintf("machine failed to create: %q", *machine.Status.ErrorMessage)
        summary.recommendation = fmt.Sprintf("deleting the machine may bypass any transient issue with the cloud provider:\n\n  oc delete machine -n %s %s", machine.Namespace, machine.Name)
    case machinev1beta1.InvalidConfigurationMachineError:
        summary.issue = fmt.Sprintf("machine configuration is invalid: %q", *machine.Status.ErrorMessage)
        summary.recommendation = "check the cluster's audit history to determine whether a third-party has modified the machine or its machineset"
    default:
        summary.issue = fmt.Sprintf("unhandled .Status.ErrorReason %q for machine", *machine.Status.ErrorReason)
        summary.recommendation = fmt.Sprintf("manual investigation for machine %s required", machine.Name)
    }
    return summary, nil
}

// getNotReadyNodes returns the nodes whose Ready condition is missing or not 'True'
func (i *Investigation) getNotReadyNodes() ([]corev1.Node, error) {
    nodes := &corev1.NodeList{}
    err := i.kclient.List(context.TODO(), nodes)
    if err != nil {
        return []corev1.Node{}, fmt.Errorf("failed to retrieve nodes from cluster: %w", err)
    }

    notReady := []corev1.Node{}
    for _, node := range nodes.Items {
        readyCondition, found := i.findReadyCondition(node)
        // Interpret no Ready condition as "unknown", though in reality this shouldn't ever happen
        if !found || readyCondition.Status != corev1.ConditionTrue {
            notReady = append(notReady, node)
        }
    }
    return notReady, nil
}

// findReadyCondition returns the node's Ready condition, if one exists
func (i *Investigation) findReadyCondition(node corev1.Node) (corev1.NodeCondition, bool) {
    for _, condition := range node.Status.Conditions {
        if condition.Type == corev1.NodeReady {
            return condition, true
        }
    }
    return corev1.NodeCondition{}, false
}

// nodeRecommendation determines the recommended course of action for a node
func (i *Investigation) nodeRecommendation(node corev1.Node) (summary, error) {
    // TODO
    summary := summary{}
    ready, found := i.findReadyCondition(node)
    if !found {
        summary.issue = "node has no Ready condition set"
        summary.recommendation = fmt.Sprintf("manual investigation required to determine why node %q does not contain a Ready .Status.Condition", node.Name)
        return summary, nil
    }

    lastCheckinElapsed := time.Since(ready.LastHeartbeatTime.Time)
    summary.issue = fmt.Sprintf("node %q has been %q for %s", node.Name, ready.Status, lastCheckinElapsed)
    summary.recommendation = "manual investigation required"
    return summary, nil
}

func (i *Investigation) Name() string {
    return alertname
}

func (i *Investigation) Description() string {
    return fmt.Sprintf("Investigates '%s' alerts", alertname)
}

func (i *Investigation) IsExperimental() bool {
    return true
}

func (i *Investigation) ShouldInvestigateAlert(alert string) bool {
    return strings.Contains(alert, alertname)
}
@@ -0,0 +1,70 @@
# Testing MachineHealthCheckUnterminatedShortCircuitSRE

The `MachineHealthCheckUnterminatedShortCircuitSRE` alert is derived from the `MachineHealthCheck` objects in the `openshift-machine-api` namespace on the cluster. Specifically, it is triggered when the number of nodes matching one of the `.spec.unhealthyConditions` meets or exceeds the `.spec.maxUnhealthy` value for a [duration of time](https://github.com/openshift/managed-cluster-config/blob/3338dd375fa6517d7768eca985c3ca115bbc1484/deploy/sre-prometheus/100-machine-health-check-unterminated-short-circuit.PrometheusRule.yaml#L16).
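
To see which thresholds a cluster is currently enforcing, a read-only query along these lines can be useful; the `custom-columns` field paths are illustrative and may need adjusting for your cluster version:

```sh
# List each MachineHealthCheck with its short-circuit threshold and current health counts
oc get machinehealthcheck -n openshift-machine-api \
  -o custom-columns=NAME:.metadata.name,MAXUNHEALTHY:.spec.maxUnhealthy,EXPECTED:.status.expectedMachines,HEALTHY:.status.currentHealthy
```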

## Setup

Before applying any test configuration, be sure to [pause hive syncsetting](https://github.com/openshift/ops-sop/blob/master/v4/knowledge_base/pause-syncset.md) to avoid having these changes overwritten mid-test.

Then, apply the following changes to the `MachineHealthCheck` object(s) you wish to test against, in order to make testing a little easier:
- Reducing the `.spec.maxUnhealthy` value lowers the number of nodes that need to be "broken" to short-circuit the machine-api-operator and halt remediation
- Reducing the `.spec.unhealthyConditions` timeouts ensures that the machine-api short-circuits much more quickly after the nodes are modified

A patched version of the default `MachineHealthCheck/srep-worker-healthcheck` object is pre-configured [here](./srep-worker-healthcheck_machinehealthcheck.yaml). Use the following command to apply it to your test cluster:

```sh
ocm backplane elevate "testing CAD" -- replace -f ./srep-worker-healthcheck_machinehealthcheck.yaml
```
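
To confirm the patched values took effect before breaking any nodes, a quick read-back (illustrative, not part of the original test steps) is:

```sh
# Print the patched maxUnhealthy value and unhealthy conditions
oc get machinehealthcheck srep-worker-healthcheck -n openshift-machine-api \
  -o jsonpath='{.spec.maxUnhealthy}{"\n"}{.spec.unhealthyConditions}{"\n"}'
```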

## Test Cases

Because it has a fairly broad definition, a `MachineHealthCheckUnterminatedShortCircuitSRE` alert could fire as a result of several different scenarios. A few are outlined below, along with methods to reproduce them.

### Nodes `NotReady`, machines `Running`

While the `machine-api-operator` owns and operates the `machine` object-type, it's important to note that its `MachineHealthCheck` objects actually utilize the `.status` of the corresponding **node** to determine whether a machine is healthy. This is because a machine's status only reflects whether the VM in the cloud provider is running, while the node's status indicates whether the instance is a functional part of the cluster. Therefore, it's possible for a `MachineHealthCheckUnterminatedShortCircuitSRE` alert to fire while all `machines` have a `.status.phase` of `Running`.

The simplest way to reproduce this is to log in to multiple nodes at once and stop the `kubelet.service` on each.

This can be done via the debug command:

```sh
ocm backplane elevate "testing CAD" -- debug node/$NODE
```

Inside the debug container, run:

```sh
chroot /host
systemctl stop kubelet.service
```

This should automatically remove the debug pod, and the node's status should flip to `NotReady` shortly thereafter.
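
One way to confirm the scenario described above - unhealthy nodes alongside healthy-looking machines - is to compare the two views with standard read-only commands:

```sh
# Nodes should report NotReady while their machines remain in the Running phase
oc get nodes
oc get machines -n openshift-machine-api -o wide
```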
45+
46+
### Machines in `Failed` phase
47+
48+
Having several machines in a `Failed` state still violates a `MachineHealthCheck`'s `.status.maxUnhealthy`, despite the machines not having any corresponding nodes to check against.
49+
50+
One method to simulate this is to edit the machineset so it contains invalid configurations. The following patch updates all worker machinesets to use the `fakeinstancetype` machine-type, for example:
51+
52+
```sh
53+
ocm backplane elevate "testing CAD" -- patch machinesets $MACHINESET -n openshift-machine-api --type merge -p '{"spec": {"template": {"spec": {"providerSpec": {"value": {"instanceType": "fakeinstancetype"}}}}}}'
54+
oc delete machine -n openshift-machine-api -l machine.openshift.io/cluster-api-machineset=$MACHINESET
55+
```
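
After deleting the machines, the replacements created from the broken machineset should fail to provision; watching them is a simple way to confirm they settle in the `Failed` phase:

```sh
# Watch the replacement machines; they should land in Failed rather than Running
oc get machines -n openshift-machine-api -w
```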

### Machines with no phase

TODO - does this trigger a healthcheck alert?

Remove the operator role from the AWS IAM > Roles page and see if it triggers the alert.

## Additional Resources

The following pages may be useful if the information in this guide is insufficient or has become stale.

- [Machine API Brief](https://github.com/openshift/machine-api-operator/blob/main/docs/user/machine-api-operator-overview.md)
- [Machine API FAQ](https://github.com/openshift/machine-api-operator/blob/main/FAQ.md)
- [MachineHealthCheck documentation](https://docs.redhat.com/en/documentation/openshift_container_platform/4.17/html/machine_management/deploying-machine-health-checks#machine-health-checks-resource_deploying-machine-health-checks)
- [Alert SOP](https://github.com/openshift/ops-sop/blob/master/v4/alerts/MachineHealthCheckUnterminatedShortCircuitSRE.md)
@@ -0,0 +1,53 @@
apiVersion: machine.openshift.io/v1beta1
kind: MachineHealthCheck
metadata:
  name: srep-worker-healthcheck
  namespace: openshift-machine-api
spec:
  maxUnhealthy: 0
  nodeStartupTimeout: 25m
  selector:
    matchExpressions:
      - key: machine.openshift.io/cluster-api-machine-role
        operator: NotIn
        values:
          - infra
          - master
      - key: machine.openshift.io/cluster-api-machineset
        operator: Exists
      - key: machine.openshift.io/instance-type
        operator: NotIn
        values:
          - m5.metal
          - m5d.metal
          - m5n.metal
          - m5dn.metal
          - m5zn.metal
          - m6a.metal
          - m6i.metal
          - m6id.metal
          - r5.metal
          - r5d.metal
          - r5n.metal
          - r5dn.metal
          - r6a.metal
          - r6i.metal
          - r6id.metal
          - x2iezn.metal
          - z1d.metal
          - c5.metal
          - c5d.metal
          - c5n.metal
          - c6a.metal
          - c6i.metal
          - c6id.metal
          - i3.metal
          - i3en.metal
          - r7i.48xlarge
  unhealthyConditions:
    - status: "False"
      timeout: 10s
      type: Ready
    - status: Unknown
      timeout: 10s
      type: Ready
