Commit 68504de
committed
[WIP] OSD-28525 - Initial implementation for MachineHealthCheckUnterminatedShortCircuitSRE alert
1 parent c6ad107 commit 68504de

6 files changed: +399 -4 lines changed
@@ -0,0 +1,262 @@
/*
machinehealthcheckunterminatedshortcircuitsre defines the investigation logic for the MachineHealthCheckUnterminatedShortCircuitSRE alert
*/
package machinehealthcheckunterminatedshortcircuitsre

import (
    "context"
    "fmt"
    "strings"
    "time"

    "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
    k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s"
    "github.com/openshift/configuration-anomaly-detection/pkg/logging"
    "github.com/openshift/configuration-anomaly-detection/pkg/notewriter"

    corev1 "k8s.io/api/core/v1"
    "sigs.k8s.io/controller-runtime/pkg/client"

    machinev1beta1 "github.com/openshift/api/machine/v1beta1"
)
const (
    alertname = "MachineHealthCheckUnterminatedShortCircuitSRE"
    // remediationName must match the name of this investigation's directory, so it can be looked up via the backplane-api
    remediationName = "machineHealthCheckUnterminatedShortCircuitSRE"

    machineNamespace  = "openshift-machine-api"
    machineRoleLabel  = "machine.openshift.io/cluster-api-machine-role"
    machineRoleWorker = "worker"
)

type Investigation struct {
    kclient client.Client
    notes   *notewriter.NoteWriter
}

// setup initializes the backplane-based kubernetes client and the investigation's notewriter
func (i *Investigation) setup(r *investigation.Resources) error {
    k, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, remediationName)
    if err != nil {
        return fmt.Errorf("failed to initialize kubernetes client: %w", err)
    }
    i.kclient = k
    i.notes = notewriter.New(r.Name, logging.RawLogger)

    return nil
}

// cleanup tears down the backplane remediation created for this investigation
func (i *Investigation) cleanup(r *investigation.Resources) error {
    return k8sclient.Cleanup(r.Cluster.ID(), r.OcmClient, remediationName)
}

// Run investigates the MachineHealthCheckUnterminatedShortCircuitSRE alert
func (i *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
    result := investigation.InvestigationResult{}

    // Setup & teardown
    err := i.setup(r)
    if err != nil {
        return result, fmt.Errorf("failed to setup investigation: %w", err)
    }
    defer func(r *investigation.Resources) {
        err := i.cleanup(r)
        if err != nil {
            logging.Errorf("failed to cleanup investigation: %v", err)
        }
    }(r)

    // Execute investigation
    recommendations := []summary{}

    // Examine machines - in addition to broken nodes, machines in the 'Failing' phase are counted against a machinehealthcheck's maxUnhealthy count:
    // https://github.com/openshift/machine-api-operator/blob/e4bd10f78bada4cc8b36236e9b0b1c1332e5ef88/pkg/controller/machinehealthcheck/machinehealthcheck_controller.go#L764
    failedMachines, err := i.getFailingMachines()
    if err != nil {
        logging.Errorf("failed to retrieve machines: %v", err)
        i.notes.AppendWarning("failed to retrieve machines: %v", err)
    }
    for _, machine := range failedMachines {
        // Confirm only worker machines are failing - if Red Hat-managed machines are affected, forward to Primary
        role, err := i.getMachineRole(machine)
        if err != nil {
            // Failing to determine whether a machine is Red Hat-managed warrants human investigation
            logging.Errorf("failed to determine machine role: %v", err)
            i.notes.AppendWarning("failed to determine machine role: %v\nEscalating to Primary", err)
            i.notes.AppendWarning("Primary: one or more machines was detected as missing the %q label, which can impact machine-api functionality. Please investigate the issue and take any appropriate action to address this.", machineRoleLabel)
            return result, r.PdClient.EscalateIncidentWithNote(i.notes.String())
        }
        if role != machineRoleWorker {
            logging.Errorf("found non-worker machine in %q state; escalating incident to Primary", *machine.Status.Phase)
            i.notes.AppendWarning("found non-worker machine in %q state; escalating incident to Primary", *machine.Status.Phase)
            i.notes.AppendWarning("Primary: one or more Red Hat-managed machines was detected to have a .Status.Phase of %q, which can impact SLOs. Please investigate the issue and take any appropriate action to address this.", *machine.Status.Phase)
            return result, r.PdClient.EscalateIncidentWithNote(i.notes.String())
        }

        recommendation, err := i.machineRecommendation(machine)
        if err != nil {
            logging.Errorf("failed to make recommendation for machine %q: %v", machine.Name, err)
            i.notes.AppendWarning("failed to make recommendation for machine %q: %v", machine.Name, err)
        } else {
            recommendations = append(recommendations, recommendation)
        }
    }

    // Examine nodes
    notReadyNodes, err := i.getNotReadyNodes()
    if err != nil {
        logging.Errorf("failed to retrieve nodes: %v", err)
        i.notes.AppendWarning("failed to retrieve nodes: %v", err)
    }
    for _, node := range notReadyNodes {
        if i.nodeMachineRemediated(node, failedMachines) {
            // Don't bother double checking nodes whose machine we've already investigated
            continue
        }

        recommendation, err := i.nodeRecommendation(node)
        if err != nil {
            logging.Errorf("failed to make recommendation for node %q: %v", node.Name, err)
            i.notes.AppendWarning("failed to make recommendation for node %q: %v", node.Name, err)
        } else {
            recommendations = append(recommendations, recommendation)
        }
    }

    recommendationMsg := "the following action(s) are recommended:"
    for _, recommendation := range recommendations {
        recommendationMsg = fmt.Sprintf("%s\n - %s", recommendationMsg, recommendation)
    }
    i.notes.AppendWarning(recommendationMsg)
    return result, r.PdClient.EscalateIncidentWithNote(i.notes.String())
}

// nodeMachineRemediated indicates whether the given node is owned by one of the machines already flagged for remediation
func (i *Investigation) nodeMachineRemediated(node corev1.Node, remediatedMachines []machinev1beta1.Machine) bool {
    for _, machine := range remediatedMachines {
        if machine.Status.NodeRef != nil && machine.Status.NodeRef.Name == node.Name {
            return true
        }
    }
    return false
}
// getMachineRole returns the value of the machine's role label, if present
func (i *Investigation) getMachineRole(machine machinev1beta1.Machine) (string, error) {
    role, found := machine.Labels[machineRoleLabel]
    if !found {
        return "", fmt.Errorf("expected label %q not found", machineRoleLabel)
    }
    return role, nil
}

// getFailingMachines returns the machines that are in the 'Failed' phase or have an ErrorReason set
func (i *Investigation) getFailingMachines() ([]machinev1beta1.Machine, error) {
    machines := &machinev1beta1.MachineList{}
    listOptions := &client.ListOptions{Namespace: machineNamespace}
    err := i.kclient.List(context.TODO(), machines, listOptions)
    if err != nil {
        return []machinev1beta1.Machine{}, fmt.Errorf("failed to retrieve machines from cluster: %w", err)
    }

    failed := []machinev1beta1.Machine{}
    for _, machine := range machines.Items {
        if (machine.Status.Phase != nil && *machine.Status.Phase == machinev1beta1.PhaseFailed) || machine.Status.ErrorReason != nil {
            failed = append(failed, machine)
        }
    }
    return failed, nil
}

// summary provides a simple structure to pair each problem found with a recommended solution
type summary struct {
    issue          string
    recommendation string
}

func (s summary) String() string {
    return fmt.Sprintf("issue: %s\nrecommendation: %s\n", s.issue, s.recommendation)
}

// machineRecommendation determines the recommended course of action for a machine
func (i *Investigation) machineRecommendation(machine machinev1beta1.Machine) (summary, error) {
    summary := summary{}
    if machine.Status.ErrorReason == nil {
        summary.issue = "no .Status.ErrorReason found for machine"
        summary.recommendation = fmt.Sprintf("manual investigation for machine %s required", machine.Name)
        return summary, nil
    }
    switch *machine.Status.ErrorReason {
    case machinev1beta1.IPAddressInvalidReason:
        summary.issue = fmt.Sprintf("invalid IP address: %q", *machine.Status.ErrorMessage)
        summary.recommendation = fmt.Sprintf("deleting the machine may allow the cloud provider to assign a valid IP address:\n\n  oc delete machine -n %s %s", machine.Namespace, machine.Name)
    case machinev1beta1.CreateMachineError:
        summary.issue = fmt.Sprintf("machine failed to create: %q", *machine.Status.ErrorMessage)
        summary.recommendation = fmt.Sprintf("deleting the machine may bypass any transient issue with the cloud provider:\n\n  oc delete machine -n %s %s", machine.Namespace, machine.Name)
    case machinev1beta1.InvalidConfigurationMachineError:
        summary.issue = fmt.Sprintf("machine configuration is invalid: %q", *machine.Status.ErrorMessage)
        summary.recommendation = "check the cluster's audit history to determine whether a third-party has modified the machine or its machineset"
    default:
        summary.issue = fmt.Sprintf("unhandled .Status.ErrorReason %q for machine", *machine.Status.ErrorReason)
        summary.recommendation = fmt.Sprintf("manual investigation for machine %s required", machine.Name)
    }
    return summary, nil
}

// getNotReadyNodes returns the nodes whose Ready condition is missing or not 'True'
func (i *Investigation) getNotReadyNodes() ([]corev1.Node, error) {
    nodes := &corev1.NodeList{}
    err := i.kclient.List(context.TODO(), nodes)
    if err != nil {
        return []corev1.Node{}, fmt.Errorf("failed to retrieve nodes from cluster: %w", err)
    }

    notReady := []corev1.Node{}
    for _, node := range nodes.Items {
        readyCondition, found := i.findReadyCondition(node)
        // Interpret no Ready condition as "unknown", though in reality this shouldn't ever happen
        if !found || readyCondition.Status != corev1.ConditionTrue {
            notReady = append(notReady, node)
        }
    }
    return notReady, nil
}

// findReadyCondition returns the node's Ready condition, if one exists
func (i *Investigation) findReadyCondition(node corev1.Node) (corev1.NodeCondition, bool) {
    for _, condition := range node.Status.Conditions {
        if condition.Type == corev1.NodeReady {
            return condition, true
        }
    }
    return corev1.NodeCondition{}, false
}

// nodeRecommendation determines the recommended course of action for a node
func (i *Investigation) nodeRecommendation(node corev1.Node) (summary, error) {
    // TODO
    summary := summary{}
    ready, found := i.findReadyCondition(node)
    if !found {
        summary.issue = "node has no Ready condition set"
        summary.recommendation = fmt.Sprintf("manual investigation required to determine why node %q does not contain a Ready .Status.Condition", node.Name)
        return summary, nil
    }

    lastCheckinElapsed := time.Since(ready.LastHeartbeatTime.Time)
    summary.issue = fmt.Sprintf("node %q has been %q for %s", node.Name, ready.Status, lastCheckinElapsed)
    summary.recommendation = "manual investigation required"
    return summary, nil
}

func (i *Investigation) Name() string {
    return alertname
}

func (i *Investigation) Description() string {
    return fmt.Sprintf("Investigates '%s' alerts", alertname)
}

func (i *Investigation) IsExperimental() bool {
    return true
}

func (i *Investigation) ShouldInvestigateAlert(alert string) bool {
    return strings.Contains(alert, alertname)
}
@@ -0,0 +1,70 @@
# Testing MachineHealthCheckUnterminatedShortCircuitSRE

The `MachineHealthCheckUnterminatedShortCircuitSRE` alert is derived from the `MachineHealthCheck` objects in the `openshift-machine-api` namespace on the cluster. Specifically, it is triggered when the number of nodes matching one of the `.spec.unhealthyConditions` meets or exceeds the `.spec.maxUnhealthy` value for a [duration of time](https://github.com/openshift/managed-cluster-config/blob/3338dd375fa6517d7768eca985c3ca115bbc1484/deploy/sre-prometheus/100-machine-health-check-unterminated-short-circuit.PrometheusRule.yaml#L16).
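
To see which thresholds a cluster is currently enforcing, a read-only query along these lines can be useful; the `custom-columns` field paths are illustrative and may need adjusting for your cluster version:

```sh
# List each MachineHealthCheck with its short-circuit threshold and current health counts
oc get machinehealthcheck -n openshift-machine-api \
  -o custom-columns=NAME:.metadata.name,MAXUNHEALTHY:.spec.maxUnhealthy,EXPECTED:.status.expectedMachines,HEALTHY:.status.currentHealthy
```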

## Setup

Before applying any test configuration, be sure to [pause hive syncsetting](https://github.com/openshift/ops-sop/blob/master/v4/knowledge_base/pause-syncset.md) to avoid having these changes overwritten mid-test.

Then, apply the following changes to the `MachineHealthCheck` object(s) you wish to test against, in order to make testing a little easier:
- Reducing the `.spec.maxUnhealthy` value lowers the number of nodes that need to be "broken" to short-circuit the machine-api-operator and halt remediation
- Reducing the `.spec.unhealthyConditions` timeouts ensures that the machine-api short-circuits much more quickly after the nodes are modified

A patched version of the default `MachineHealthCheck/srep-worker-healthcheck` object is pre-configured [here](./srep-worker-healthcheck_machinehealthcheck.yaml). Use the following command to apply it to your test cluster:

```sh
ocm backplane elevate "testing CAD" -- replace -f ./srep-worker-healthcheck_machinehealthcheck.yaml
```
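
To confirm the patched values took effect before breaking any nodes, a quick read-back (illustrative, not part of the original test steps) is:

```sh
# Print the patched maxUnhealthy value and unhealthy conditions
oc get machinehealthcheck srep-worker-healthcheck -n openshift-machine-api \
  -o jsonpath='{.spec.maxUnhealthy}{"\n"}{.spec.unhealthyConditions}{"\n"}'
```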

## Test Cases

Because it has a fairly broad definition, a `MachineHealthCheckUnterminatedShortCircuitSRE` alert could fire as a result of several different scenarios. A few are outlined below, along with methods to reproduce them.

### Nodes `NotReady`, machines `Running`

While the `machine-api-operator` owns and operates the `machine` object-type, it's important to note that its `MachineHealthCheck` objects actually utilize the `.status` of the corresponding **node** to determine whether a machine is healthy. This is because a machine's status only reflects whether the VM in the cloud provider is running, while the node's status indicates whether the instance is a functional part of the cluster. Therefore, it's possible for a `MachineHealthCheckUnterminatedShortCircuitSRE` alert to fire while all `machines` have a `.status.phase` of `Running`.

The simplest way to reproduce this is to log in to multiple nodes at once and stop the `kubelet.service` on each.

This can be done via the debug command:

```sh
ocm backplane elevate "testing CAD" -- debug node/$NODE
```

Inside the debug container, run:

```sh
chroot /host
systemctl stop kubelet.service
```

This should automatically remove the debug pod, and the node's status should flip to `NotReady` shortly thereafter.
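
One way to confirm the scenario described above - unhealthy nodes alongside healthy-looking machines - is to compare the two views with standard read-only commands:

```sh
# Nodes should report NotReady while their machines remain in the Running phase
oc get nodes
oc get machines -n openshift-machine-api -o wide
```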
45+
46+
### Machines in `Failed` phase
47+
48+
Having several machines in a `Failed` state still violates a `MachineHealthCheck`'s `.status.maxUnhealthy`, despite the machines not having any corresponding nodes to check against.
49+
50+
One method to simulate this is to edit the machineset so it contains invalid configurations. The following patch updates all worker machinesets to use the `fakeinstancetype` machine-type, for example:
51+
52+
```sh
53+
ocm backplane elevate "testing CAD" -- patch machinesets $MACHINESET -n openshift-machine-api --type merge -p '{"spec": {"template": {"spec": {"providerSpec": {"value": {"instanceType": "fakeinstancetype"}}}}}}'
54+
oc delete machine -n openshift-machine-api -l machine.openshift.io/cluster-api-machineset=$MACHINESET
55+
```
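
After deleting the machines, the replacements created from the broken machineset should fail to provision; watching them is a simple way to confirm they settle in the `Failed` phase:

```sh
# Watch the replacement machines; they should land in Failed rather than Running
oc get machines -n openshift-machine-api -w
```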

### Machines with no phase

TODO - does this trigger a healthcheck alert?

Remove the operator role from the AWS IAM > Roles page and see if it triggers the alert.

## Additional Resources

The following pages may be useful if the information in this guide is insufficient or has become stale.

- [Machine API Brief](https://github.com/openshift/machine-api-operator/blob/main/docs/user/machine-api-operator-overview.md)
- [Machine API FAQ](https://github.com/openshift/machine-api-operator/blob/main/FAQ.md)
- [MachineHealthCheck documentation](https://docs.redhat.com/en/documentation/openshift_container_platform/4.17/html/machine_management/deploying-machine-health-checks#machine-health-checks-resource_deploying-machine-health-checks)
- [Alert SOP](https://github.com/openshift/ops-sop/blob/master/v4/alerts/MachineHealthCheckUnterminatedShortCircuitSRE.md)
@@ -0,0 +1,53 @@
apiVersion: machine.openshift.io/v1beta1
kind: MachineHealthCheck
metadata:
  name: srep-worker-healthcheck
  namespace: openshift-machine-api
spec:
  maxUnhealthy: 0
  nodeStartupTimeout: 25m
  selector:
    matchExpressions:
      - key: machine.openshift.io/cluster-api-machine-role
        operator: NotIn
        values:
          - infra
          - master
      - key: machine.openshift.io/cluster-api-machineset
        operator: Exists
      - key: machine.openshift.io/instance-type
        operator: NotIn
        values:
          - m5.metal
          - m5d.metal
          - m5n.metal
          - m5dn.metal
          - m5zn.metal
          - m6a.metal
          - m6i.metal
          - m6id.metal
          - r5.metal
          - r5d.metal
          - r5n.metal
          - r5dn.metal
          - r6a.metal
          - r6i.metal
          - r6id.metal
          - x2iezn.metal
          - z1d.metal
          - c5.metal
          - c5d.metal
          - c5n.metal
          - c6a.metal
          - c6i.metal
          - c6id.metal
          - i3.metal
          - i3en.metal
          - r7i.48xlarge
  unhealthyConditions:
    - status: "False"
      timeout: 10s
      type: Ready
    - status: Unknown
      timeout: 10s
      type: Ready
