|
/*
Package machinehealthcheckunterminatedshortcircuitsre defines the investigation logic for the MachineHealthCheckUnterminatedShortCircuitSRE alert.
*/
| 4 | +package machinehealthcheckunterminatedshortcircuitsre |
| 5 | + |
| 6 | +import ( |
| 7 | + "context" |
| 8 | + "fmt" |
| 9 | + "strings" |
| 10 | + "time" |
| 11 | + |
| 12 | + //"time" |
| 13 | + |
| 14 | + "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" |
| 15 | + k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s" |
| 16 | + "github.com/openshift/configuration-anomaly-detection/pkg/logging" |
| 17 | + "github.com/openshift/configuration-anomaly-detection/pkg/notewriter" |
| 18 | + |
| 19 | + corev1 "k8s.io/api/core/v1" |
| 20 | + "sigs.k8s.io/controller-runtime/pkg/client" |
| 21 | + |
| 22 | + //metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" |
| 23 | + |
| 24 | + machinev1beta1 "github.com/openshift/api/machine/v1beta1" |
| 25 | +) |
| 26 | + |
const (
	// alertname is the name of the alert handled by this investigation; it is returned by Name and matched by ShouldInvestigateAlert.
	alertname = "MachineHealthCheckUnterminatedShortCircuitSRE"
	// remediationName must match the name of this investigation's directory, so it can be looked up via the backplane-api
	remediationName = "machineHealthCheckUnterminatedShortCircuitSRE"

	// machineNamespace is the namespace machines are listed from.
	machineNamespace = "openshift-machine-api"
	// machineRoleLabel is the machine label read to determine a machine's role.
	machineRoleLabel = "machine.openshift.io/cluster-api-machine-role"
	// machineRoleWorker is the role value that identifies a worker machine; machines with any other role are escalated.
	machineRoleWorker = "worker"
)
| 36 | + |
// Investigation holds the state used while investigating a
// MachineHealthCheckUnterminatedShortCircuitSRE alert.
type Investigation struct {
	// kclient is the client used to list machines and nodes; it is assigned in setup.
	kclient client.Client
	// notes accumulates the warnings that are sent along when the incident is escalated.
	notes *notewriter.NoteWriter
}
| 41 | + |
| 42 | +func (i *Investigation) setup(r *investigation.Resources) error { |
| 43 | + // Setup investigation |
| 44 | + k, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, remediationName) |
| 45 | + if err != nil { |
| 46 | + return fmt.Errorf("failed to initialize kubernetes client: %w", err) |
| 47 | + } |
| 48 | + i.kclient = k |
| 49 | + i.notes = notewriter.New(r.Name, logging.RawLogger) |
| 50 | + |
| 51 | + return nil |
| 52 | +} |
| 53 | + |
// cleanup calls k8sclient.Cleanup with the same cluster ID, OCM client and
// remediation name that setup passes to k8sclient.New, and returns its error.
func (i *Investigation) cleanup(r *investigation.Resources) error {
	return k8sclient.Cleanup(r.Cluster.ID(), r.OcmClient, remediationName)
}
| 57 | + |
| 58 | +// Run investigates the MachineHealthCheckUnterminatedShortCircuitSRE alert |
| 59 | +func (i *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) { |
| 60 | + result := investigation.InvestigationResult{} |
| 61 | + |
| 62 | + // Setup & teardown |
| 63 | + err := i.setup(r) |
| 64 | + if err != nil { |
| 65 | + return result, fmt.Errorf("failed to setup investigation: %w", err) |
| 66 | + } |
| 67 | + defer func(r *investigation.Resources) { |
| 68 | + err := i.cleanup(r) |
| 69 | + if err != nil { |
| 70 | + logging.Errorf("failed to cleanup investigation: %w", err) |
| 71 | + } |
| 72 | + }(r) |
| 73 | + |
| 74 | + // Execute investigation |
| 75 | + |
| 76 | + recommendations := []summary{} |
| 77 | + |
| 78 | + // Examine machines - in addition to broken nodes, machines in the 'Failing' phase are counted against a machinehealthcheck's maxUnhealthy count: |
| 79 | + // https://github.com/openshift/machine-api-operator/blob/e4bd10f78bada4cc8b36236e9b0b1c1332e5ef88/pkg/controller/machinehealthcheck/machinehealthcheck_controller.go#L764 |
| 80 | + failedMachines, err := i.getFailingMachines() |
| 81 | + if err != nil { |
| 82 | + logging.Errorf("failed to retrieve machines: %w", err) |
| 83 | + i.notes.AppendWarning("failed to retrieve machines: %v", err) |
| 84 | + } |
| 85 | + for _, machine := range failedMachines { |
| 86 | + // Confirm only worker machines are failing - if Red Hat-managed machines are affected, forward to Primary |
| 87 | + role, err := i.getMachineRole(machine) |
| 88 | + if err != nil { |
| 89 | + // Failing to determine whether a machine is Red Hat-managed warrants human investigation |
| 90 | + logging.Error("failed to determine machine role: %w", err) |
| 91 | + i.notes.AppendWarning("failed to determine machine role: %v\nEscalating to Primary", err) |
| 92 | + i.notes.AppendWarning("Primary: one or more machines was detected as missing the %q label, which can impact machine-api functionality. Please investigate the issue and take any appropriate action to address this.", machineRoleLabel) |
| 93 | + return result, r.PdClient.EscalateIncidentWithNote(i.notes.String()) |
| 94 | + } |
| 95 | + if role != machineRoleWorker { |
| 96 | + logging.Error("found non-worker machine in %q state; escalating incident to Primary", err) |
| 97 | + i.notes.AppendWarning("found non-worker machine in %q state; escalating incident to Primary", *machine.Status.Phase) |
| 98 | + i.notes.AppendWarning("Primary: one or more Red Hat-managed machines was detected to have a .Status.Phase of %q, which can impact SLOs. Please investigate the issue and take any appropriate action to address this.", *machine.Status.Phase) |
| 99 | + return result, r.PdClient.EscalateIncidentWithNote(i.notes.String()) |
| 100 | + } |
| 101 | + |
| 102 | + recommendation, err := i.machineRecommendation(machine) |
| 103 | + if err != nil { |
| 104 | + logging.Error("failed to make recommendation for node %q: %w", machine.Name, err) |
| 105 | + i.notes.AppendWarning("failed to make recommendation for node %q: %v", machine.Name, err) |
| 106 | + } else { |
| 107 | + recommendations = append(recommendations, recommendation) |
| 108 | + } |
| 109 | + } |
| 110 | + |
| 111 | + // Examine nodes |
| 112 | + notReadyNodes, err := i.getNotReadyNodes() |
| 113 | + if err != nil { |
| 114 | + logging.Error("failed to retrieve nodes: %w", err) |
| 115 | + i.notes.AppendWarning("failed to retrieve nodes: %v", err) |
| 116 | + } |
| 117 | + for _, node := range notReadyNodes { |
| 118 | + if i.nodeMachineRemediated(node, failedMachines) { |
| 119 | + // Don't bother double checking nodes whose machine we've already investigated |
| 120 | + continue |
| 121 | + } |
| 122 | + |
| 123 | + recommendation, err := i.nodeRecommendation(node) |
| 124 | + if err != nil { |
| 125 | + logging.Errorf("failed to make recommendation for node %q: %w", node.Name, err) |
| 126 | + i.notes.AppendWarning("failed to make recommendation for node %q: %v", node.Name, err) |
| 127 | + } else { |
| 128 | + recommendations = append(recommendations, recommendation) |
| 129 | + } |
| 130 | + } |
| 131 | + |
| 132 | + recommendationMsg := "the following action(s) are recommended:" |
| 133 | + for _, recommendation := range recommendations { |
| 134 | + recommendationMsg = fmt.Sprintf("%s\n - %s", recommendationMsg, recommendation) |
| 135 | + } |
| 136 | + i.notes.AppendWarning(recommendationMsg) |
| 137 | + return result, r.PdClient.EscalateIncidentWithNote(i.notes.String()) |
| 138 | +} |
| 139 | + |
| 140 | +func (i *Investigation) nodeMachineRemediated(node corev1.Node, remediatedMachines []machinev1beta1.Machine) bool { |
| 141 | + for _, machine := range remediatedMachines { |
| 142 | + if machine.Status.NodeRef != nil && machine.Status.NodeRef.Name == node.Name { |
| 143 | + return true |
| 144 | + } |
| 145 | + } |
| 146 | + return false |
| 147 | +} |
| 148 | + |
| 149 | +func (i *Investigation) getMachineRole(machine machinev1beta1.Machine) (string, error) { |
| 150 | + role, found := machine.Labels[machineRoleLabel] |
| 151 | + if !found { |
| 152 | + return "", fmt.Errorf("expected label '%s' not found", machineRoleLabel) |
| 153 | + } |
| 154 | + return role, nil |
| 155 | +} |
| 156 | + |
| 157 | +func (i *Investigation) getFailingMachines() ([]machinev1beta1.Machine, error) { |
| 158 | + machines := &machinev1beta1.MachineList{} |
| 159 | + listOptions := &client.ListOptions{Namespace: machineNamespace} |
| 160 | + err := i.kclient.List(context.TODO(), machines, listOptions) |
| 161 | + if err != nil { |
| 162 | + return []machinev1beta1.Machine{}, fmt.Errorf("failed to retrieve machines from cluster: %w", err) |
| 163 | + } |
| 164 | + |
| 165 | + failed := []machinev1beta1.Machine{} |
| 166 | + for _, machine := range machines.Items { |
| 167 | + if *machine.Status.Phase == machinev1beta1.PhaseFailed || machine.Status.ErrorReason != nil { |
| 168 | + failed = append(failed, machine) |
| 169 | + } |
| 170 | + } |
| 171 | + return failed, nil |
| 172 | +} |
| 173 | + |
// summary provides a simple structure to pair each problem found with a recommended solution
type summary struct {
	// issue describes the problem that was detected.
	issue string
	// recommendation describes the suggested action for the issue.
	recommendation string
}

// String renders the summary as an "issue:" line followed by a
// "recommendation:" line, terminated by a blank line so consecutive summaries
// stay visually separated.
func (s summary) String() string {
	return fmt.Sprintf("issue: %s\nrecommendation: %s\n\n", s.issue, s.recommendation)
}
| 183 | + |
| 184 | +// machineRecommendation determines the recommended course of action for a machine |
| 185 | +func (i *Investigation) machineRecommendation(machine machinev1beta1.Machine) (summary, error) { |
| 186 | + summary := summary{} |
| 187 | + switch *machine.Status.ErrorReason { |
| 188 | + case machinev1beta1.IPAddressInvalidReason: |
| 189 | + summary.issue = fmt.Sprintf("invalid IP address: %q", *machine.Status.ErrorMessage) |
| 190 | + summary.recommendation = fmt.Sprintf("deleting the machine may allow the cloud provider to assign a valid IP address:\n\n oc delete machine -n %s %s", machine.Namespace, machine.Name) |
| 191 | + case machinev1beta1.CreateMachineError: |
| 192 | + summary.issue = fmt.Sprintf("machine failed to create: %q", *machine.Status.ErrorMessage) |
| 193 | + summary.recommendation = fmt.Sprintf("deleteing the machine may bypass any transient issue with the cloud provider:\n\n oc delete machine -n %s %s", machine.Namespace, machine.Name) |
| 194 | + case machinev1beta1.InvalidConfigurationMachineError: |
| 195 | + summary.issue = fmt.Sprintf("machine configuration is invalid: %q", *machine.Status.ErrorMessage) |
| 196 | + summary.recommendation = fmt.Sprintf("check audit history for cluster to determine whether a third-party has modified the machine or its machineset") |
| 197 | + default: |
| 198 | + summary.issue = "no .Status.ErrorReason found for machine" |
| 199 | + summary.recommendation = fmt.Sprintf("manual investigation for machine %s required", machine.Name) |
| 200 | + } |
| 201 | + return summary, nil |
| 202 | +} |
| 203 | + |
| 204 | +func (i *Investigation) getNotReadyNodes() ([]corev1.Node, error) { |
| 205 | + nodes := &corev1.NodeList{} |
| 206 | + err := i.kclient.List(context.TODO(), nodes) |
| 207 | + if err != nil { |
| 208 | + return []corev1.Node{}, fmt.Errorf("failed to retrieve nodes from cluster: %w", err) |
| 209 | + } |
| 210 | + |
| 211 | + notReady := []corev1.Node{} |
| 212 | + for _, node := range nodes.Items { |
| 213 | + readyCondition, found := i.findReadyCondition(node) |
| 214 | + // Interpret no Ready condition as "unknown", though in reality this shouldn't ever happen |
| 215 | + if !found || readyCondition.Status != corev1.ConditionTrue { |
| 216 | + notReady = append(notReady, node) |
| 217 | + } |
| 218 | + } |
| 219 | + return notReady, nil |
| 220 | +} |
| 221 | + |
| 222 | +func (i *Investigation) findReadyCondition(node corev1.Node) (corev1.NodeCondition, bool) { |
| 223 | + for _, condition := range node.Status.Conditions { |
| 224 | + if condition.Type == corev1.NodeReady { |
| 225 | + return condition, true |
| 226 | + } |
| 227 | + } |
| 228 | + return corev1.NodeCondition{}, false |
| 229 | +} |
| 230 | + |
| 231 | +//func (i *Investigation) nodeRecommendation(node corev1.Node) (string, error) { |
| 232 | +func (i *Investigation) nodeRecommendation(node corev1.Node) (summary, error) { |
| 233 | + // TODO |
| 234 | + summary := summary{} |
| 235 | + ready, found := i.findReadyCondition(node) |
| 236 | + if !found { |
| 237 | + summary.issue = "node has no Ready condition set" |
| 238 | + summary.recommendation = "manual investigation required to determine why node %q does not contain a Ready .Status.Condition" |
| 239 | + return summary, nil |
| 240 | + } |
| 241 | + |
| 242 | + lastCheckinElapsed := time.Since(ready.LastHeartbeatTime.Time) |
| 243 | + summary.issue = fmt.Sprintf("node %q has been %q for %s", node.Name, ready.Status, lastCheckinElapsed) |
| 244 | + summary.recommendation = "manual investigation required" |
| 245 | + return summary, nil |
| 246 | +} |
| 247 | + |
// Name returns the name of the alert this investigation handles.
func (i *Investigation) Name() string {
	return alertname
}
| 251 | + |
// Description returns a one-line description of the investigation that names
// the alert it handles.
func (i *Investigation) Description() string {
	return fmt.Sprintf("Investigates '%s' alerts", alertname)
}
| 255 | + |
// IsExperimental always returns true for this investigation.
// NOTE(review): how callers treat experimental investigations is not visible here.
func (i *Investigation) IsExperimental() bool {
	return true
}
| 259 | + |
// ShouldInvestigateAlert reports whether alert contains this investigation's
// alert name as a substring.
func (i *Investigation) ShouldInvestigateAlert(alert string) bool {
	return strings.Contains(alert, alertname)
}
0 commit comments