diff --git a/go.mod b/go.mod index 879b1b32..3186ca58 100644 --- a/go.mod +++ b/go.mod @@ -22,12 +22,12 @@ require ( github.com/prometheus/client_golang v1.22.0 github.com/prometheus/common v0.63.0 github.com/spf13/cobra v1.8.1 - github.com/stretchr/testify v1.10.0 go.uber.org/mock v0.4.0 go.uber.org/zap v1.27.0 gopkg.in/yaml.v2 v2.4.0 k8s.io/api v0.31.3 k8s.io/apimachinery v0.31.3 + k8s.io/client-go v0.31.3 sigs.k8s.io/controller-runtime v0.19.0 ) @@ -123,7 +123,6 @@ require ( github.com/peterbourgon/diskv v2.0.1+incompatible // indirect github.com/pkg/browser v0.0.0-20210911075715-681adbf594b8 // indirect github.com/pkg/errors v0.9.1 // indirect - github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_model v0.6.1 // indirect github.com/prometheus/procfs v0.15.1 // indirect github.com/sagikazarmark/locafero v0.7.0 // indirect @@ -155,7 +154,6 @@ require ( gopkg.in/ini.v1 v1.67.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/cli-runtime v0.30.3 // indirect - k8s.io/client-go v0.31.3 // indirect k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 // indirect diff --git a/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/machinehealthcheckunterminatedshortcircuitsre.go b/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/machinehealthcheckunterminatedshortcircuitsre.go new file mode 100644 index 00000000..c4b5af4a --- /dev/null +++ b/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/machinehealthcheckunterminatedshortcircuitsre.go @@ -0,0 +1,329 @@ +/* +machinehealthcheckunterminatedshortcircuitsre defines the investigation logic for the MachineHealthCheckUnterminatedShortCircuitSRE alert +*/ +package machinehealthcheckunterminatedshortcircuitsre + +import ( + "context" + "errors" + "fmt" + "slices" + "strings" + "time" + + machinev1beta1 
"github.com/openshift/api/machine/v1beta1" + + "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" + machineutil "github.com/openshift/configuration-anomaly-detection/pkg/investigations/utils/machine" + nodeutil "github.com/openshift/configuration-anomaly-detection/pkg/investigations/utils/node" + k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s" + "github.com/openshift/configuration-anomaly-detection/pkg/logging" + "github.com/openshift/configuration-anomaly-detection/pkg/notewriter" + + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + alertname = "MachineHealthCheckUnterminatedShortCircuitSRE" + // remediationName must match the name of this investigation's directory, so it can be looked up via the backplane-api + remediationName = "machineHealthCheckUnterminatedShortCircuitSRE" +) + +type Investigation struct { + // kclient provides access to on-cluster resources + kclient client.Client + // notes holds the messages that will be shared with Primary upon completion + notes *notewriter.NoteWriter + // recommendations holds the set of actions CAD recommends primary to take + recommendations investigationRecommendations +} + +func (i *Investigation) setup(r *investigation.Resources) error { + // Setup investigation + k, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, remediationName) + if err != nil { + return fmt.Errorf("failed to initialize kubernetes client: %w", err) + } + i.kclient = k + i.notes = notewriter.New(r.Name, logging.RawLogger) + i.recommendations = investigationRecommendations{} + + return nil +} + +// Run investigates the MachineHealthCheckUnterminatedShortCircuitSRE alert +// +// The investigation seeks to provide exactly one recommended action per affected machine/node pair. +// The machine object is evaluated first, as it represents a lower-level object that could affect the health of the node. 
+// If the investigation determines that the breakage is occurring at the machine-level, the corresponding node is *not* investigated. +// After investigating all affected machines, potentially affected nodes are investigated. +func (i *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) { + ctx := context.Background() + result := investigation.InvestigationResult{} + + // Setup & teardown + err := i.setup(r) + if err != nil { + return result, fmt.Errorf("failed to setup investigation: %w", err) + } + defer func(r *investigation.Resources) { + err := k8sclient.Cleanup(r.Cluster.ID(), r.OcmClient, remediationName) + if err != nil { + logging.Errorf("failed to cleanup investigation: %v", err) + } + }(r) + + targetMachines, err := i.getMachinesFromFailingMHC(ctx) + if err != nil { + i.notes.AppendWarning("failed to retrieve one or more target machines: %v", err) + } + if len(targetMachines) == 0 { + i.notes.AppendWarning("no machines found for short-circuited machinehealthcheck objects") + return result, r.PdClient.EscalateIncidentWithNote(i.notes.String()) + } + + problemMachines, err := i.InvestigateMachines(ctx, targetMachines) + if err != nil { + i.notes.AppendWarning("failed to investigate machines: %v", err) + } + + // Trim out the machines that we've already investigated and know have problems + + // + + // The purpose of this is to avoid re-investigating nodes whose machines were already investigated.
Any node-level issue on a failing machine + // is most likely related to the machine itself, and providing duplicate/conflicting advice will only prove confusing to the responder + targetMachines = slices.DeleteFunc(targetMachines, func(targetMachine machinev1beta1.Machine) bool { + for _, problemMachine := range problemMachines { + if problemMachine.Name == targetMachine.Name { + return true + } + } + return false + }) + + // If one or more machines managed by the machinehealthcheck have not yet been identified as a problem, check on the machine's + // node to determine if there are node-level problems that need remediating + if len(targetMachines) > 0 { + targetNodes, err := machineutil.GetNodesForMachines(ctx, i.kclient, targetMachines) + if err != nil { + i.notes.AppendWarning("failed to retrieve one or more target nodes: %v", err) + } + if len(targetNodes) > 0 { + i.InvestigateNodes(targetNodes) + } + } + + // Summarize recommendations from investigation in PD notes, if any found + if len(i.recommendations) > 0 { + i.notes.AppendWarning(i.recommendations.summarize()) + } else { + i.notes.AppendSuccess("no recommended actions to take against cluster") + } + + return result, r.PdClient.EscalateIncidentWithNote(i.notes.String()) +} + +func (i *Investigation) getMachinesFromFailingMHC(ctx context.Context) ([]machinev1beta1.Machine, error) { + healthchecks := machinev1beta1.MachineHealthCheckList{} + err := i.kclient.List(ctx, &healthchecks, &client.ListOptions{Namespace: machineutil.MachineNamespace}) + if err != nil { + return []machinev1beta1.Machine{}, fmt.Errorf("failed to retrieve machinehealthchecks from cluster: %w", err) + } + + targets := []machinev1beta1.Machine{} + for _, healthcheck := range healthchecks.Items { + if !machineutil.HealthcheckRemediationAllowed(healthcheck) { + machines, err := machineutil.GetMachinesForMHC(ctx, i.kclient, healthcheck) + if err != nil { + i.notes.AppendWarning("failed to retrieve machines from machinehealthcheck %q: %v", 
healthcheck.Name, err) + continue + } + targets = append(targets, machines...) + } + } + + return targets, nil +} + +// InvestigateMachines evaluates the state of the machines in the cluster and returns a list of the failing machines, along with a categorized set of recommendations based on the failure state of +// each machine +func (i *Investigation) InvestigateMachines(ctx context.Context, targets []machinev1beta1.Machine) ([]machinev1beta1.Machine, error) { + investigatedMachines := []machinev1beta1.Machine{} + var errs error + for _, machine := range targets { + if machine.DeletionTimestamp != nil { + err := i.investigateDeletingMachine(ctx, machine) + investigatedMachines = append(investigatedMachines, machine) + if err != nil { + errs = errors.Join(errs, fmt.Errorf("failed to investigate deleting machine: %w", err)) + } + continue + } + + if (machine.Status.Phase != nil && *machine.Status.Phase == machinev1beta1.PhaseFailed) || machine.Status.ErrorReason != nil { + err := i.investigateFailingMachine(machine) + investigatedMachines = append(investigatedMachines, machine) + if err != nil { + errs = errors.Join(errs, fmt.Errorf("failed to investigate failing machine: %w", err)) + } + } + } + + if len(investigatedMachines) == 0 { + i.notes.AppendSuccess("no deleting or Failed machines found") + } + return investigatedMachines, errs +} + +// investigateFailingMachine evaluates a machine whose .Status.Phase is failed, and provides a recommendation based on the cause of the failure +func (i *Investigation) investigateFailingMachine(machine machinev1beta1.Machine) error { + role, err := machineutil.GetRole(machine) + if err != nil { + // Failing to determine whether a machine is Red Hat-managed warrants manual investigation + notes := fmt.Sprintf("unable to determine machine role: %v", err) + i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes) + return fmt.Errorf("failed to determine role for machine %q: %w", machine.Name, err)
+ } + + // Safely dereference status fields to avoid panics in cases where machines' statuses haven't been fully populated + var errorMsg string + if machine.Status.ErrorMessage != nil { + errorMsg = *machine.Status.ErrorMessage + } + + var errorReason machinev1beta1.MachineStatusError + if machine.Status.ErrorReason != nil { + errorReason = *machine.Status.ErrorReason + } + + if role != machineutil.WorkerRoleLabelValue { + // If machine is Red-Hat owned, always require manual investigation + notes := fmt.Sprintf("Red Hat-owned machine in state %q due to %q", errorReason, errorMsg) + i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes) + return nil + } + + switch errorReason { + case machinev1beta1.IPAddressInvalidReason: + notes := fmt.Sprintf("invalid IP address: %q. Deleting machine may allow the cloud provider to assign a valid IP address", errorMsg) + i.recommendations.addRecommendation(recommendationDeleteMachine, machine.Name, notes) + + case machinev1beta1.CreateMachineError: + notes := fmt.Sprintf("machine failed to create: %q. Deleting machine may resolve any transient issues with the cloud provider", errorMsg) + i.recommendations.addRecommendation(recommendationDeleteMachine, machine.Name, notes) + + case machinev1beta1.InvalidConfigurationMachineError: + notes := fmt.Sprintf("the machine configuration is invalid: %q. 
Checking splunk audit logs may indicate whether the customer has modified the machine or its machineset", errorMsg) + i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes) + + case machinev1beta1.DeleteMachineError: + notes := fmt.Sprintf("the machine's node could not be gracefully terminated automatically: %q", errorMsg) + i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes) + + case machinev1beta1.InsufficientResourcesMachineError: + notes := fmt.Sprintf("a servicelog should be sent because there is insufficient quota to provision the machine: %q", errorMsg) + i.recommendations.addRecommendation(recommendationQuotaServiceLog, machine.Name, notes) + + default: + notes := "no .Status.ErrorReason found for machine" + i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes) + } + return nil +} + +// investigateDeletingMachine evaluates a machine which is being deleted, to determine if & why it is blocked, along with a recommendation on how to unblock it +func (i *Investigation) investigateDeletingMachine(ctx context.Context, machine machinev1beta1.Machine) error { + if machine.Status.NodeRef == nil { + notes := "machine stuck deleting with no node" + i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes) + return nil + } + node, err := machineutil.GetNodeForMachine(ctx, i.kclient, machine) + if err != nil { + notes := "machine's node could not be determined" + i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes) + return fmt.Errorf("failed to retrieve node for machine %q: %w", machine.Name, err) + } + + stuck, duration := checkForStuckDrain(node) + if stuck { + notes := fmt.Sprintf("node %q found to be stuck draining for %s", node.Name, duration.Truncate(time.Second).String()) + i.recommendations.addRecommendation(recommendationInvestigateNode, node.Name, notes) + return nil + } +
+ notes := "unable to determine why machine is stuck deleting" + i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes) + return nil +} + +// checkForStuckDrain makes a best-effort approximation at whether a node is stuck draining a specific pod +func checkForStuckDrain(node corev1.Node) (bool, *time.Duration) { + if len(node.Spec.Taints) == 0 { + return false, nil + } + + taint, found := nodeutil.FindNoScheduleTaint(node) + if !found { + return false, nil + } + + // TODO - Once CAD can access on-cluster metrics, we can query the `pods_preventing_node_drain` metric from prometheus + // to more accurately gauge if a node is truly stuck deleting, and what pod is causing it + drainDuration := time.Since(taint.TimeAdded.Time) + if drainDuration > 10*time.Minute { + return true, &drainDuration + } + + return false, nil +} + +// InvestigateNodes examines the provided nodes and returns recommended actions, if any are needed +func (i *Investigation) InvestigateNodes(nodes []corev1.Node) { + for _, node := range nodes { + i.InvestigateNode(node) + } +} + +// InvestigateNode examines a node and determines if further investigation is required +func (i *Investigation) InvestigateNode(node corev1.Node) { + roleLabel, found := nodeutil.GetRole(node) + if !found { + notes := fmt.Sprintf("no role label containing %q found for node", nodeutil.RoleLabelPrefix) + i.recommendations.addRecommendation(recommendationInvestigateNode, node.Name, notes) + return + } else if !strings.Contains(roleLabel, nodeutil.WorkerRoleSuffix) { + notes := "non-worker node affected" + i.recommendations.addRecommendation(recommendationInvestigateNode, node.Name, notes) + return + } + + ready, found := nodeutil.FindReadyCondition(node) + if !found { + notes := "found no 'Ready' .Status.Condition for the node" + i.recommendations.addRecommendation(recommendationInvestigateNode, node.Name, notes) + return + } + + if ready.Status != corev1.ConditionTrue { + 
lastCheckinElapsed := time.Since(ready.LastHeartbeatTime.Time).Truncate(time.Second) + notes := fmt.Sprintf("node has been %q for %s", ready.Status, lastCheckinElapsed) + i.recommendations.addRecommendation(recommendationInvestigateNode, node.Name, notes) + } +} + +func (i *Investigation) Name() string { + return alertname +} + +func (i *Investigation) Description() string { + return fmt.Sprintf("Investigates '%s' alerts", alertname) +} + +func (i *Investigation) IsExperimental() bool { + return true +} + +func (i *Investigation) ShouldInvestigateAlert(alert string) bool { + return strings.Contains(alert, alertname) +} diff --git a/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/machinehealthcheckunterminatedshortcircuitsre_test.go b/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/machinehealthcheckunterminatedshortcircuitsre_test.go new file mode 100644 index 00000000..4016f327 --- /dev/null +++ b/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/machinehealthcheckunterminatedshortcircuitsre_test.go @@ -0,0 +1,573 @@ +package machinehealthcheckunterminatedshortcircuitsre + +import ( + "context" + "fmt" + "slices" + "strings" + "testing" + "time" + + machineutil "github.com/openshift/configuration-anomaly-detection/pkg/investigations/utils/machine" + nodeutil "github.com/openshift/configuration-anomaly-detection/pkg/investigations/utils/node" + "github.com/openshift/configuration-anomaly-detection/pkg/logging" + "github.com/openshift/configuration-anomaly-detection/pkg/notewriter" + + machinev1beta1 "github.com/openshift/api/machine/v1beta1" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestInvestigation_getMachinesFromFailingMHC(t *testing.T) { + // Test objects + mhc := newFailingMachineHealthCheck() + + mhcOwnedMachine1 := 
newWorkerMachine("mhc-owned-1") + mhcOwnedMachine2 := newWorkerMachine("mhc-owned-2") + irrelevantMachine := newWorkerMachine("irrelevant") + // clear the labels from this machine so that it's no longer owned by the mhc + irrelevantMachine.Labels = map[string]string{} + + tests := []struct { + name string + objects []client.Object + want []string + }{ + // test cases + { + name: "one valid machine", + objects: []client.Object{ + mhc, + mhcOwnedMachine1, + }, + want: []string{ + mhcOwnedMachine1.Name, + }, + }, + { + name: "no valid machines", + objects: []client.Object{ + mhc, + }, + want: []string{}, + }, + { + name: "two valid machines", + objects: []client.Object{ + mhc, + mhcOwnedMachine1, + mhcOwnedMachine2, + }, + want: []string{ + mhcOwnedMachine1.Name, + mhcOwnedMachine2.Name, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Setup test + i, err := newTestInvestigation(tt.objects...) + if err != nil { + t.Errorf("failed to create test investigation: %v", err) + return + } + + // Execute test + got, err := i.getMachinesFromFailingMHC(context.TODO()) + if err != nil { + // Never expect error on this test + t.Errorf("Investigation.getTargetMachines() error = %v, expected no error", err) + return + } + + // Check results + for _, wantedMachineName := range tt.want { + found := slices.ContainsFunc(got, func(gotMachine machinev1beta1.Machine) bool { + return gotMachine.Name == wantedMachineName + }) + if !found { + t.Errorf("expected machine %q in returned list. 
Got the following instead: %#v", wantedMachineName, got) + } + } + }) + } +} + +func TestInvestigation_investigateFailingMachine(t *testing.T) { + // Test cases + type result struct { + err bool + action recommendedAction + notes string + } + + tests := []struct { + name string + machine func() *machinev1beta1.Machine + want result + }{ + { + name: "investigate when no role", + machine: func() *machinev1beta1.Machine { + machine := newWorkerMachine("role-less") + // clear the machine's labels to unassign it from the worker role + machine.Labels = map[string]string{} + return machine + }, + want: result{ + err: true, + action: recommendationInvestigateMachine, + notes: "unable to determine machine role", + }, + }, + { + name: "investigate when infra failing", + machine: func() *machinev1beta1.Machine { + machine := newWorkerMachine("infra") + machine.Labels[machineutil.RoleLabelKey] = "infra" + return machine + }, + want: result{ + err: false, + action: recommendationInvestigateMachine, + notes: "Red Hat-owned machine in state", + }, + }, + { + name: "delete when invalid IP", + machine: func() *machinev1beta1.Machine { + machine := newWorkerMachine("bad-ip") + reason := machinev1beta1.IPAddressInvalidReason + machine.Status.ErrorReason = &reason + return machine + }, + want: result{ + err: false, + action: recommendationDeleteMachine, + notes: "invalid IP address", + }, + }, + { + name: "delete when create failed", + machine: func() *machinev1beta1.Machine { + machine := newWorkerMachine("create-failed") + reason := machinev1beta1.CreateMachineError + machine.Status.ErrorReason = &reason + return machine + }, + want: result{ + err: false, + action: recommendationDeleteMachine, + notes: "machine failed to create", + }, + }, + { + name: "investigate when invalid configuration", + machine: func() *machinev1beta1.Machine { + machine := newWorkerMachine("invalid-config") + reason := machinev1beta1.InvalidConfigurationMachineError + machine.Status.ErrorReason = &reason + 
return machine + }, + want: result{ + err: false, + action: recommendationInvestigateMachine, + notes: "machine configuration is invalid", + }, + }, + { + name: "investigate when delete failed", + machine: func() *machinev1beta1.Machine { + machine := newWorkerMachine("delete-failed") + reason := machinev1beta1.DeleteMachineError + machine.Status.ErrorReason = &reason + return machine + }, + want: result{ + err: false, + action: recommendationInvestigateMachine, + notes: "the machine's node could not be gracefully terminated", + }, + }, + { + name: "servicelog when insufficient resources", + machine: func() *machinev1beta1.Machine { + machine := newWorkerMachine("insufficient-resources") + reason := machinev1beta1.InsufficientResourcesMachineError + machine.Status.ErrorReason = &reason + return machine + }, + want: result{ + err: false, + action: recommendationQuotaServiceLog, + notes: "a servicelog should be sent", + }, + }, + { + name: "investigate when missing .Status.ErrorReason", + machine: func() *machinev1beta1.Machine { + machine := newWorkerMachine("no-status") + machine.Status = machinev1beta1.MachineStatus{} + return machine + }, + want: result{ + err: false, + action: recommendationInvestigateMachine, + notes: "no .Status.ErrorReason found for machine", + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Setup test + machine := tt.machine() + i, err := newTestInvestigation(machine) + if err != nil { + t.Errorf("failed to create test investigation: %v", err) + } + + // Execute test + err = i.investigateFailingMachine(*machine) + + // Verify results + if (err != nil) != tt.want.err { + t.Errorf("unexpected result in Investigation.investigateFailingMachine(): wanted error = %t, returned err = %v", tt.want.err, err) + } + + recs := i.recommendations[tt.want.action] + if len(recs) != 1 { + t.Errorf("expected exactly one recommendation: %q, got %d", tt.want.action, len(recs)) + } + if !strings.Contains(recs[0].notes, 
tt.want.notes) { + t.Errorf("unexpected note in investigation recommendation: expected %q, got %q", tt.want.notes, recs[0].notes) + } + }) + } +} + +func TestInvestigation_investigateDeletingMachine(t *testing.T) { + type result struct { + err bool + action recommendedAction + notes string + } + + tests := []struct { + name string + objects func() (*machinev1beta1.Machine, *corev1.Node) + want result + }{ + { + name: "investigate when no .Status.NodeRef", + objects: func() (*machinev1beta1.Machine, *corev1.Node) { + machine := newWorkerMachine("no-noderef") + machine.Status = machinev1beta1.MachineStatus{} + return machine, &corev1.Node{} + }, + want: result{ + err: false, + action: recommendationInvestigateMachine, + notes: "machine stuck deleting with no node", + }, + }, + { + name: "investigate when no node for machine", + objects: func() (*machinev1beta1.Machine, *corev1.Node) { + machine := newWorkerMachine("lonely") + machine.Status.NodeRef = &corev1.ObjectReference{ + Kind: "Node", + Name: "missing", + } + return machine, &corev1.Node{} + }, + want: result{ + err: true, + action: recommendationInvestigateMachine, + notes: "machine's node could not be determined", + }, + }, + { + name: "investigate when machine deletion reason unknown", + objects: func() (*machinev1beta1.Machine, *corev1.Node) { + machine := newWorkerMachine("stuck-deleting") + node := newWorkerNode("stuck-deleting") + machine.Status.NodeRef = &corev1.ObjectReference{ + Kind: node.Kind, + Name: node.Name, + } + return machine, node + }, + want: result{ + err: false, + action: recommendationInvestigateMachine, + notes: "unable to determine why machine is stuck deleting", + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Setup test + machine, node := tt.objects() + i, err := newTestInvestigation(machine, node) + if err != nil { + t.Errorf("failed to create test investigation: %v", err) + } + + // Execute test + err = 
i.investigateDeletingMachine(context.TODO(), *machine) + + // Verify results + if (err != nil) != tt.want.err { + t.Errorf("unexpected result in Investigation.investigateDeletingMachine(): wanted error = %t, returned err = %v", tt.want.err, err) + } + + recs := i.recommendations[tt.want.action] + if len(recs) != 1 { + t.Errorf("expected exactly one recommendation: %q, got %d", tt.want.action, len(recs)) + } + if !strings.Contains(recs[0].notes, tt.want.notes) { + t.Errorf("unexpected note in investigation recommendation: expected %q, got %q", tt.want.notes, recs[0].notes) + } + }) + } +} + +func Test_checkForStuckDrain(t *testing.T) { + tests := []struct { + name string + node func() corev1.Node + stuck bool + }{ + { + name: "not stuck when no taints", + node: func() corev1.Node { + node := newWorkerNode("taintless") + node.Spec.Taints = []corev1.Taint{} + return *node + }, + stuck: false, + }, + { + name: "not stuck when no NoSchedule taint", + node: func() corev1.Node { + node := newWorkerNode("scheduleable") + node.Spec.Taints = []corev1.Taint{ + { + Effect: corev1.TaintEffectNoExecute, + }, + { + Effect: corev1.TaintEffectPreferNoSchedule, + }, + } + return *node + }, + stuck: false, + }, + { + name: "not stuck when short-lived NoSchedule taint", + node: func() corev1.Node { + node := newWorkerNode("recently-unscheduleable") + node.Spec.Taints = []corev1.Taint{ + { + Effect: corev1.TaintEffectNoSchedule, + TimeAdded: &metav1.Time{ + Time: time.Now(), + }, + }, + } + return *node + }, + stuck: false, + }, + { + name: "stuck when long-lived NoSchedule taint", + node: func() corev1.Node { + node := newWorkerNode("actually-stuck") + node.Spec.Taints = []corev1.Taint{ + { + Effect: corev1.TaintEffectNoSchedule, + TimeAdded: &metav1.Time{ + Time: time.Now().Add(-1 * time.Hour), + }, + }, + } + return *node + }, + stuck: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, duration := checkForStuckDrain(tt.node()) + if got != 
tt.stuck { + t.Errorf("unexpected result: expected stuck = %t, got %t", tt.stuck, got) + } + if got && duration == nil { + t.Errorf("unexpected result: got stuck = %t, but duration = %v", got, duration) + } + }) + } +} + +func TestInvestigation_InvestigateNode(t *testing.T) { + type result struct { + action recommendedAction + notes string + } + + tests := []struct { + name string + node func() *corev1.Node + want result + }{ + { + name: "investigate when no role label", + node: func() *corev1.Node { + node := newWorkerNode("labelless") + node.Labels = map[string]string{} + return node + }, + want: result{ + action: recommendationInvestigateNode, + notes: "no role label", + }, + }, + { + name: "investigate when bad role label", + node: func() *corev1.Node { + node := newWorkerNode("infra") + node.Labels = map[string]string{fmt.Sprintf("%s/infra", nodeutil.RoleLabelPrefix): ""} + return node + }, + want: result{ + action: recommendationInvestigateNode, + notes: "non-worker node affected", + }, + }, + { + name: "investigate when no Ready condition", + node: func() *corev1.Node { + node := newWorkerNode("unReady") + node.Status.Conditions = []corev1.NodeCondition{} + return node + }, + want: result{ + action: recommendationInvestigateNode, + notes: "found no 'Ready' .Status.Condition", + }, + }, + { + name: "investigate when not Ready", + node: func() *corev1.Node { + node := newWorkerNode("notReady") + node.Status.Conditions = []corev1.NodeCondition{ + { + Type: corev1.NodeReady, + Status: corev1.ConditionFalse, + }, + } + return node + }, + want: result{ + action: recommendationInvestigateNode, + notes: "node has been", + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Setup test + node := tt.node() + i, err := newTestInvestigation(node) + if err != nil { + t.Errorf("failed to create new test investigation: %v", err) + } + + // Execute test + i.InvestigateNode(*node) + + // Validate results + recs := 
i.recommendations[tt.want.action] + if len(recs) != 1 { + t.Errorf("expected exactly one recommendation: %q, got %d", tt.want.action, len(recs)) + } + if !strings.Contains(recs[0].notes, tt.want.notes) { + t.Errorf("unexpected note in investigation recommendation: expected %q, got %q", tt.want.notes, recs[0].notes) + } + }) + } +} + +func newFailingMachineHealthCheck() *machinev1beta1.MachineHealthCheck { + mhc := &machinev1beta1.MachineHealthCheck{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-mhc", + Namespace: machineutil.MachineNamespace, + }, + Spec: machinev1beta1.MachineHealthCheckSpec{ + Selector: metav1.LabelSelector{ + MatchLabels: map[string]string{"mhc-name": "test-mhc"}, + }, + }, + Status: machinev1beta1.MachineHealthCheckStatus{ + Conditions: []machinev1beta1.Condition{ + { + Type: machinev1beta1.RemediationAllowedCondition, + Status: corev1.ConditionFalse, + }, + }, + }, + } + return mhc +} + +func newWorkerMachine(name string) *machinev1beta1.Machine { + m := &machinev1beta1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: machineutil.MachineNamespace, + Labels: map[string]string{"mhc-name": "test-mhc", machineutil.RoleLabelKey: machineutil.WorkerRoleLabelValue}, + }, + } + return m +} + +func newWorkerNode(name string) *corev1.Node { + n := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{fmt.Sprintf("%s/%s", nodeutil.RoleLabelPrefix, nodeutil.WorkerRoleSuffix): ""}, + }, + } + return n +} + +func newFakeClient(objs ...client.Object) (client.Client, error) { + s := scheme.Scheme + err := machinev1beta1.AddToScheme(s) + if err != nil { + return nil, err + } + + client := fake.NewClientBuilder().WithScheme(s).WithObjects(objs...).Build() + return client, nil +} + +func newTestInvestigation(testObjects ...client.Object) (Investigation, error) { + fakeClient, err := newFakeClient(testObjects...) 
+ if err != nil { + return Investigation{}, err + } + + i := Investigation{ + kclient: fakeClient, + notes: notewriter.New("testing", logging.RawLogger), + recommendations: investigationRecommendations{}, + } + return i, nil +} diff --git a/pkg/investigations/machineHealthCheckUnterminatedShortCircuitSRE/metadata.yaml b/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/metadata.yaml similarity index 94% rename from pkg/investigations/machineHealthCheckUnterminatedShortCircuitSRE/metadata.yaml rename to pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/metadata.yaml index 58056751..b8ec447e 100644 --- a/pkg/investigations/machineHealthCheckUnterminatedShortCircuitSRE/metadata.yaml +++ b/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/metadata.yaml @@ -19,4 +19,4 @@ rbac: - "" resources: - "nodes" -customerDataAccess: true +customerDataAccess: false diff --git a/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/recommendation.go b/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/recommendation.go new file mode 100644 index 00000000..467566b9 --- /dev/null +++ b/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/recommendation.go @@ -0,0 +1,73 @@ +/* +machinehealthcheckunterminatedshortcircuitsre defines the investigation logic for the MachineHealthCheckUnterminatedShortCircuitSRE alert +*/ +package machinehealthcheckunterminatedshortcircuitsre + +import ( + "fmt" + + "github.com/openshift/configuration-anomaly-detection/pkg/investigations/utils/machine" +) + +// machineRecommendations categorizes each machine's individual investigation summary into a recommended course of action +type investigationRecommendations map[recommendedAction][]investigationResult + +func (r investigationRecommendations) addRecommendation(action recommendedAction, object string, notes string) { + recommendation := investigationResult{ + object: object, + notes: notes, + } + r[action] = 
append(r[action], recommendation) +} + +// summarize prints the machine investigationRecommendations into a human-readable format. +func (r investigationRecommendations) summarize() string { + msg := "" + for recommendation, investigations := range r { + msg += fmt.Sprintf("%s:\n", recommendation) + + if recommendation == recommendationDeleteMachine { + // Consolidate all machine deletion requests into a single oc command for ease of use + deleteCmd := fmt.Sprintf("oc delete machine -n %s", machine.MachineNamespace) + for _, investigation := range investigations { + msg += fmt.Sprintf("- %s\n", investigation.String()) + deleteCmd += " " + investigation.object + } + msg += fmt.Sprintf("to delete these machines, run:\n\n%s\n", deleteCmd) + } else { + for _, investigation := range investigations { + msg += fmt.Sprintf("- %s\n", investigation.String()) + } + } + + msg += "\n" + } + return msg +} + +type investigationResult struct { + // object indicates which object was investigated + object string + // notes provides a high-level summary of the investigation results + notes string +} + +func (s *investigationResult) String() string { + msg := fmt.Sprintf("%q: %s", s.object, s.notes) + return msg +} + +// recommendedAction acts as both a key in the investigationRecommendations map, as well as a header for pagerduty notes when summarize()-ing +type recommendedAction string + +const ( + // recommendationDeleteMachine indicates that the machine(s) in question should be deleted so the machine-api can reprovision them + recommendationDeleteMachine recommendedAction = "delete the following machines" + // recommendationInvestigateMachine indicates that the machine(s) in question need to be manually investigated + recommendationInvestigateMachine recommendedAction = "investigate the following machines" + // recommendationQuotaServiceLog indicates that the machine(s) in question need to be remediated by the customer, and SRE should notify them + // of that fact via servicelog + 
recommendationQuotaServiceLog recommendedAction = "send a service log regarding quota issues for the following machines" + // recommendationInvestigateNode indicates that the machine's node object is reporting problems which require human intervention to resolve + recommendationInvestigateNode recommendedAction = "investigate the following nodes" +) diff --git a/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/testing/README.md b/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/testing/README.md new file mode 100644 index 00000000..bb0846c0 --- /dev/null +++ b/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/testing/README.md @@ -0,0 +1,83 @@ +# Testing MachineHealthCheckUnterminatedShortCircuitSRE + +The `MachineHealthCheckUnterminatedShortCircuitSRE` alert is derived from the `MachineHealthCheck` objects in the `openshift-machine-api` namespace on the cluster. Specifically, it is triggered when the number of nodes matching one of the `.spec.unhealthyConditions` meets or exceeds the `.spec.maxUnhealthy` value for a [duration of time](https://github.com/openshift/managed-cluster-config/blob/3338dd375fa6517d7768eca985c3ca115bbc1484/deploy/sre-prometheus/100-machine-health-check-unterminated-short-circuit.PrometheusRule.yaml#L16). + + +## Setup + +Before applying any test configuration, be sure to [pause hive syncsetting](https://github.com/openshift/ops-sop/blob/master/v4/knowledge_base/pause-syncset.md), to avoid having these changes overwritten mid-test. 
+ +Then, apply the following changes to the `MachineHealthCheck` object(s) you wish to check against, in order to make testing a little easier: +- Reducing the `.spec.maxUnhealthy` value will lower the number of nodes that need to be "broken" to short-circuit the machine-api-operator and halt remediation +- Reducing the `.spec.unhealthyConditions` timeouts ensures that the machine-api short-circuits much more quickly after modifying the nodes + +A patched version of the default `MachineHealthCheck/srep-worker-healthcheck` object is pre-configured [here](./srep-worker-healthcheck_machinehealthcheck.yaml). Use the following command to apply it to your test cluster: + +```sh +ocm backplane elevate "testing CAD" -- replace -f ./srep-worker-healthcheck_machinehealthcheck.yaml +``` + +## Test Cases + +Because it has a fairly broad definition, a `MachineHealthCheckUnterminatedShortCircuitSRE` alert could fire as a result of several different scenarios. A few are outlined below, along with methods to reproduce them. + +### nodes `NotReady`, machines `Running` + +While the `machine-api-operator` owns and operates the `machine` object-type, it's important to note that its `MachineHealthCheck` objects actually utilize the `.status` of the corresponding **node** to determine if a machine is healthy. This is because a machine's status only reflects whether the VM in the cloud provider is running or not, while the node's status indicates whether the instance is a functional part of the cluster. Therefore, it's possible for a `MachineHealthCheckUnterminatedShortCircuitSRE` alert to fire while all `machines` have a `.status.phase` of `Running`. + +The simplest way to reproduce this is to log in and stop the `kubelet.service` on multiple nodes at once. 
+ + +This can be done via the debug command: + +```sh +ocm backplane elevate "testing CAD" -- debug node/$NODE +``` + +inside the container, run: + +```sh +chroot /host +systemctl stop kubelet.service +``` + +This should automatically remove the debug pod. The node status should flip to `NotReady` shortly thereafter. + +### Nodes stuck deleting due to customer workloads not draining + +Components like the cluster-autoscaler will try to automatically manage cluster machines via scale-up/scale-down actions based on overall resource utilization. If workloads cannot be drained during a scale-down operation, several nodes can get stuck in a `SchedulingDisabled` phase concurrently, triggering the alert. + +To simulate this, create a pod that will have to be drained prior to deleting the machine, alongside an (incorrectly configured) PDB that will prevent it from being drained: + +```sh +ocm backplane elevate "testing CAD" -- create -f ./unstoppable_workload.yaml -f ./unstoppable_pdb.yaml +``` + +Next, simulate a scale-down by patching the machineset the pod is running on. *NOTE*: Just deleting the machine will result in it being replaced, and will not trigger a MachineHealthCheckUnterminatedShortCircuitSRE alert. +```sh +# Get pod's node +NODE=$(oc get po -n default -l app=test-cad -o jsonpath="{.items[].spec.nodeName}") +# Get node's machineset +MACHINESET=$(oc get machines -A -o json | jq --arg NODE "${NODE}" -r '.items[] | select(.status.nodeRef.name == $NODE) | .metadata.labels["machine.openshift.io/cluster-api-machineset"]') +# Scale node's machineset +oc scale machineset/${MACHINESET} --replicas=0 -n openshift-machine-api +``` + +### Machines in `Failed` phase + +Having several machines in a `Failed` state still violates a `MachineHealthCheck`'s `.spec.maxUnhealthy`, despite the machines not having any corresponding nodes to check against. + +One method to simulate this is to edit the machineset so it contains invalid configurations. 
The following patch updates a worker machineset to use the `fakeinstancetype` machine-type, for example: + +```sh +ocm backplane elevate "testing CAD" -- patch machinesets $MACHINESET -n openshift-machine-api --type merge -p '{"spec": {"template": {"spec": {"providerSpec": {"value": {"instanceType": "fakeinstancetype"}}}}}}' +oc delete machine -n openshift-machine-api -l machine.openshift.io/cluster-api-machineset=$MACHINESET +``` + +## Additional Resources +The following pages may be useful if the information in this guide is insufficient or has become stale. + +- [Machine API Brief](https://github.com/openshift/machine-api-operator/blob/main/docs/user/machine-api-operator-overview.md) +- [Machine API FAQ](https://github.com/openshift/machine-api-operator/blob/main/FAQ.md) +- [MachineHealthCheck documentation](https://docs.redhat.com/en/documentation/openshift_container_platform/4.17/html/machine_management/deploying-machine-health-checks#machine-health-checks-resource_deploying-machine-health-checks) +- [Alert SOP](https://github.com/openshift/ops-sop/blob/master/v4/alerts/MachineHealthCheckUnterminatedShortCircuitSRE.md) diff --git a/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/testing/srep-worker-healthcheck_machinehealthcheck.yaml b/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/testing/srep-worker-healthcheck_machinehealthcheck.yaml new file mode 100644 index 00000000..9fd7da3b --- /dev/null +++ b/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/testing/srep-worker-healthcheck_machinehealthcheck.yaml @@ -0,0 +1,53 @@ +apiVersion: machine.openshift.io/v1beta1 +kind: MachineHealthCheck +metadata: + name: srep-worker-healthcheck + namespace: openshift-machine-api +spec: + maxUnhealthy: 0 + nodeStartupTimeout: 25m + selector: + matchExpressions: + - key: machine.openshift.io/cluster-api-machine-role + operator: NotIn + values: + - infra + - master + - key: machine.openshift.io/cluster-api-machineset + 
operator: Exists + - key: machine.openshift.io/instance-type + operator: NotIn + values: + - m5.metal + - m5d.metal + - m5n.metal + - m5dn.metal + - m5zn.metal + - m6a.metal + - m6i.metal + - m6id.metal + - r5.metal + - r5d.metal + - r5n.metal + - r5dn.metal + - r6a.metal + - r6i.metal + - r6id.metal + - x2iezn.metal + - z1d.metal + - c5.metal + - c5d.metal + - c5n.metal + - c6a.metal + - c6i.metal + - c6id.metal + - i3.metal + - i3en.metal + - r7i.48xlarge + unhealthyConditions: + - status: "False" + timeout: 10s + type: Ready + - status: Unknown + timeout: 10s + type: Ready diff --git a/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/testing/unstoppable_pdb.yaml b/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/testing/unstoppable_pdb.yaml new file mode 100644 index 00000000..13d4a4d4 --- /dev/null +++ b/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/testing/unstoppable_pdb.yaml @@ -0,0 +1,11 @@ +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: test-cad + namespace: default +spec: + maxUnavailable: 0 + selector: + matchLabels: + app: "test-cad" + unhealthyPodEvictionPolicy: AlwaysAllow diff --git a/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/testing/unstoppable_workload.yaml b/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/testing/unstoppable_workload.yaml new file mode 100644 index 00000000..9d7d3af8 --- /dev/null +++ b/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre/testing/unstoppable_workload.yaml @@ -0,0 +1,33 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app: "test-cad" + name: test-cad + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: "test-cad" + template: + metadata: + labels: + app: "test-cad" + spec: + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - preference: + matchExpressions: + - key: node-role.kubernetes.io/worker + operator: Exists + weight: 1 
+ containers: + - command: + - "sleep" + - "infinity" + image: "quay.io/app-sre/ubi8-ubi:latest" + imagePullPolicy: IfNotPresent + name: test + restartPolicy: Always diff --git a/pkg/investigations/registry.go b/pkg/investigations/registry.go index a278b40b..bc0402bd 100644 --- a/pkg/investigations/registry.go +++ b/pkg/investigations/registry.go @@ -7,6 +7,7 @@ import ( "github.com/openshift/configuration-anomaly-detection/pkg/investigations/cpd" "github.com/openshift/configuration-anomaly-detection/pkg/investigations/insightsoperatordown" "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" + "github.com/openshift/configuration-anomaly-detection/pkg/investigations/machinehealthcheckunterminatedshortcircuitsre" "github.com/openshift/configuration-anomaly-detection/pkg/investigations/upgradeconfigsyncfailureover4hr" ) @@ -18,6 +19,7 @@ var availableInvestigations = []investigation.Investigation{ &cpd.Investigation{}, &insightsoperatordown.Investigation{}, &upgradeconfigsyncfailureover4hr.Investigation{}, + &machinehealthcheckunterminatedshortcircuitsre.Investigation{}, } // GetInvestigation returns the first Investigation that applies to the given alert title. 
diff --git a/pkg/investigations/utils/machine/machine.go b/pkg/investigations/utils/machine/machine.go new file mode 100644 index 00000000..079fb116 --- /dev/null +++ b/pkg/investigations/utils/machine/machine.go @@ -0,0 +1,97 @@ +package machine + +import ( + "context" + "fmt" + + machinev1beta1 "github.com/openshift/api/machine/v1beta1" + "github.com/openshift/configuration-anomaly-detection/pkg/investigations/utils/node" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + MachineNamespace = "openshift-machine-api" + RoleLabelKey = "machine.openshift.io/cluster-api-machine-role" + WorkerRoleLabelValue = "worker" +) + +// HealthcheckRemediationAllowed searches the status conditions for the machinehealthcheck object and determines if remediation is allowed +func HealthcheckRemediationAllowed(healthcheck machinev1beta1.MachineHealthCheck) bool { + for _, condition := range healthcheck.Status.Conditions { + if condition.Type == machinev1beta1.RemediationAllowedCondition && condition.Status == corev1.ConditionTrue { + // Only rule out that the mhc is failing if we can both find the condition and determine its current status + return true + } + } + return false +} + +// GetMachinesForMHC retrieves the machines managed by the given MachineHealthCheck object +func GetMachinesForMHC(ctx context.Context, kclient client.Client, healthcheck machinev1beta1.MachineHealthCheck) ([]machinev1beta1.Machine, error) { + machines := machinev1beta1.MachineList{} + selector, err := metav1.LabelSelectorAsSelector(&healthcheck.Spec.Selector) + if err != nil { + return []machinev1beta1.Machine{}, fmt.Errorf("failed to convert machinehealthcheck %q .spec.selector: %w", healthcheck.Name, err) + } + err = kclient.List(ctx, &machines, client.MatchingLabelsSelector{Selector: selector}, &client.ListOptions{Namespace: MachineNamespace}) + if err != nil { + return 
[]machinev1beta1.Machine{}, fmt.Errorf("failed to retrieve machines from machinehealthcheck %q: %w", healthcheck.Name, err) + } + return machines.Items, nil +} + +// GetRole returns the role of the given machine, if present. If not found, an error is returned +func GetRole(machine machinev1beta1.Machine) (string, error) { + role, found := machine.Labels[RoleLabelKey] + if !found { + return "", fmt.Errorf("expected label key %q not found", RoleLabelKey) + } + return role, nil +} + +// GetNodesForMachines retrieves the nodes for the given machines. Errors encountered are joined, but do not block the retrieval of other machines +func GetNodesForMachines(ctx context.Context, kclient client.Client, machines []machinev1beta1.Machine) ([]corev1.Node, error) { + // Retrieving all nodes initially & filtering out irrelevant objects results in fewer API calls + nodes, err := node.GetAll(ctx, kclient) + if err != nil { + return []corev1.Node{}, fmt.Errorf("failed to retrieve nodes: %w", err) + } + + matches := []corev1.Node{} + for _, machine := range machines { + node, found := findMatchingNode(machine, nodes) + if found { + matches = append(matches, node) + } + } + return matches, nil +} + +// findMatchingNode retrieves the node owned by the provided machine, if one exists, along with a boolean indicating whether +// the search succeeded +func findMatchingNode(machine machinev1beta1.Machine, nodes []corev1.Node) (corev1.Node, bool) { + if machine.Status.NodeRef == nil || machine.Status.NodeRef.Name == "" { + return corev1.Node{}, false + } + for _, node := range nodes { + if machine.Status.NodeRef.Name == node.Name { + return node, true + } + } + + return corev1.Node{}, false +} + +// GetNodeForMachine retrieves the node for the given machine. 
If the provided machine's .Status.NodeRef is empty, +// an error is returned +func GetNodeForMachine(ctx context.Context, kclient client.Client, machine machinev1beta1.Machine) (corev1.Node, error) { + if machine.Status.NodeRef == nil || machine.Status.NodeRef.Name == "" { + return corev1.Node{}, fmt.Errorf("no .Status.NodeRef defined for machine %q", machine.Name) + } + node := &corev1.Node{} + err := kclient.Get(ctx, types.NamespacedName{Name: machine.Status.NodeRef.Name}, node) + return *node, err +} diff --git a/pkg/investigations/utils/machine/machine_test.go b/pkg/investigations/utils/machine/machine_test.go new file mode 100644 index 00000000..8011c21e --- /dev/null +++ b/pkg/investigations/utils/machine/machine_test.go @@ -0,0 +1,339 @@ +package machine + +import ( + "context" + "testing" + + machinev1beta1 "github.com/openshift/api/machine/v1beta1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func Test_HealthcheckRemediationAllowed(t *testing.T) { + // Test objects + failingMhc := newFailingMachineHealthCheck() + + // modify the status conditions on this mhc to indicate it's actually working + workingMhc := newFailingMachineHealthCheck() + workingMhc.Status.Conditions = []machinev1beta1.Condition{ + { + Type: machinev1beta1.RemediationAllowedCondition, + Status: corev1.ConditionTrue, + }, + } + + missingConditionMhc := newFailingMachineHealthCheck() + missingConditionMhc.Status.Conditions = []machinev1beta1.Condition{} + + // Test cases + tests := []struct { + name string + mhc machinev1beta1.MachineHealthCheck + want bool + }{ + { + name: "failing condition present", + mhc: *failingMhc, + want: false, + }, + { + name: "no condition present", + mhc: *missingConditionMhc, + want: false, + }, + { + name: "working condition present", + mhc: *workingMhc, + want: true, + }, + } + for _, tt 
:= range tests { + t.Run(tt.name, func(t *testing.T) { + if got := HealthcheckRemediationAllowed(tt.mhc); got != tt.want { + t.Errorf("Investigation.healthcheckRemediationAllowed() = %v, want %v", got, tt.want) + } + }) + } +} + +func Test_GetMachineRole(t *testing.T) { + type result struct { + err bool + role string + } + + tests := []struct { + name string + machine func() *machinev1beta1.Machine + want result + }{ + { + name: "error when no role label", + machine: func() *machinev1beta1.Machine { + machine := newWorkerMachine("role-less") + machine.Labels = map[string]string{} + return machine + }, + want: result{ + err: true, + }, + }, + { + name: "role returned when present", + machine: func() *machinev1beta1.Machine { + machine := newWorkerMachine("worker") + machine.Labels = map[string]string{RoleLabelKey: WorkerRoleLabelValue} + return machine + }, + want: result{ + err: false, + role: WorkerRoleLabelValue, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + machine := tt.machine() + got, err := GetRole(*machine) + + if (err != nil) != tt.want.err { + t.Errorf("unexpected test result: wanted err = %t, returned err = %v", tt.want.err, err) + } + if got != tt.want.role { + t.Errorf("unexpected test result: wanted role = %q, got role = %q", tt.want.role, got) + } + }) + } +} + +func Test_findMatchingNode(t *testing.T) { + type result struct { + name string + found bool + } + + tests := []struct { + name string + objects func() (*machinev1beta1.Machine, []corev1.Node) + want result + }{ + { + name: "false when machine missing noderef", + objects: func() (*machinev1beta1.Machine, []corev1.Node) { + machine := newWorkerMachine("no-node") + machine.Status = machinev1beta1.MachineStatus{} + return machine, []corev1.Node{} + }, + want: result{ + found: false, + }, + }, + { + name: "false when machine nodeRef has no name", + objects: func() (*machinev1beta1.Machine, []corev1.Node) { + machine := newWorkerMachine("no-node") + 
machine.Status.NodeRef = &corev1.ObjectReference{ + Name: "", + } + return machine, []corev1.Node{} + }, + want: result{ + found: false, + }, + }, + { + name: "false when no matching node exists", + objects: func() (*machinev1beta1.Machine, []corev1.Node) { + machine := newWorkerMachine("lonely") + machine.Status.NodeRef = &corev1.ObjectReference{ + Name: "imaginary", + } + + nodes := []corev1.Node{*newWorkerNode("node1"), *newWorkerNode("node2")} + return machine, nodes + }, + want: result{ + found: false, + }, + }, + { + name: "found when node exists", + objects: func() (*machinev1beta1.Machine, []corev1.Node) { + machine := newWorkerMachine("machine1") + machine.Status.NodeRef = &corev1.ObjectReference{ + Name: "node1", + } + + nodes := []corev1.Node{*newWorkerNode("node1"), *newWorkerNode("node2")} + return machine, nodes + }, + want: result{ + found: true, + name: "node1", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Setup + machine, nodes := tt.objects() + objects := []client.Object{machine} + for _, node := range nodes { + objects = append(objects, &node) + } + + // Execute test + got, found := findMatchingNode(*machine, nodes) + + // Validate results + if found != tt.want.found { + t.Errorf("unexpected test result: expected found = %t, got = %t", tt.want.found, found) + } + if got.Name != tt.want.name { + t.Errorf("unexpected test result: expected node named = %#v, got = %#v", tt.want.name, got) + } + }) + } +} + +func Test_GetNodeForMachine(t *testing.T) { + type result struct { + err bool + name string + } + tests := []struct { + name string + objects func() (*machinev1beta1.Machine, []*corev1.Node) + want result + }{ + { + name: "error when machine missing nodeRef", + objects: func() (*machinev1beta1.Machine, []*corev1.Node) { + machine := newWorkerMachine("missing-noderef") + machine.Status = machinev1beta1.MachineStatus{} + return machine, []*corev1.Node{} + }, + want: result{ + err: true, + }, + }, + { + 
name: "error when node does not exist", + objects: func() (*machinev1beta1.Machine, []*corev1.Node) { + machine := newWorkerMachine("invalid-noderef") + machine.Status.NodeRef = &corev1.ObjectReference{ + Name: "missing-node", + } + + node1 := newWorkerNode("node1") + node2 := newWorkerNode("node2") + return machine, []*corev1.Node{node1, node2} + }, + want: result{ + err: true, + }, + }, + { + name: "return node when exists", + objects: func() (*machinev1beta1.Machine, []*corev1.Node) { + machine := newWorkerMachine("valid") + node1 := newWorkerNode("node1") + node2 := newWorkerNode("node2") + machine.Status.NodeRef = &corev1.ObjectReference{ + Name: node1.Name, + } + return machine, []*corev1.Node{node1, node2} + }, + want: result{ + err: false, + name: "node1", + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Setup test + machine, nodes := tt.objects() + objs := []client.Object{machine} + for _, node := range nodes { + objs = append(objs, node) + } + + kclient, err := newFakeClient(objs...) 
+ if err != nil { + t.Errorf("failed to create fake client for testing: %v", err) + } + + // Execute + got, err := GetNodeForMachine(context.TODO(), kclient, *machine) + + // Validate results + if (err != nil) != tt.want.err { + t.Errorf("unexpected result in Investigation.getNodeForMachine(): wanted error = %t, returned err = %v", tt.want.err, err) + } + if got.Name != tt.want.name { + t.Errorf("incorrect node returned: expected %q, got %q", tt.want.name, got.Name) + } + }) + } +} + +func newFailingMachineHealthCheck() *machinev1beta1.MachineHealthCheck { + mhc := &machinev1beta1.MachineHealthCheck{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-mhc", + Namespace: MachineNamespace, + }, + Spec: machinev1beta1.MachineHealthCheckSpec{ + Selector: metav1.LabelSelector{ + MatchLabels: map[string]string{"mhc-name": "test-mhc"}, + }, + }, + Status: machinev1beta1.MachineHealthCheckStatus{ + Conditions: []machinev1beta1.Condition{ + { + Type: machinev1beta1.RemediationAllowedCondition, + Status: corev1.ConditionFalse, + }, + }, + }, + } + return mhc +} + +func newWorkerMachine(name string) *machinev1beta1.Machine { + m := &machinev1beta1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: MachineNamespace, + Labels: map[string]string{"mhc-name": "test-mhc", RoleLabelKey: WorkerRoleLabelValue}, + }, + } + return m +} + +func newWorkerNode(name string) *corev1.Node { + n := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{"node-role.kubernetes.io/worker": ""}, + }, + } + return n +} + +func newFakeClient(objs ...client.Object) (client.Client, error) { + s := scheme.Scheme + err := machinev1beta1.AddToScheme(s) + if err != nil { + return nil, err + } + + client := fake.NewClientBuilder().WithScheme(s).WithObjects(objs...).Build() + return client, nil +} diff --git a/pkg/investigations/utils/node/node.go b/pkg/investigations/utils/node/node.go new file mode 100644 index 00000000..f38f70f4 --- /dev/null +++ 
b/pkg/investigations/utils/node/node.go @@ -0,0 +1,57 @@ +/* +node defines investigation utility logic related to node objects +*/ +package node + +import ( + "context" + "strings" + + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + RoleLabelPrefix = "node-role.kubernetes.io" + WorkerRoleSuffix = "worker" +) + +// FindNoScheduleTaint searches the node's taints to find one with effect: NoSchedule, if present. +// +// If none is present, an empty taint and 'false' are returned +func FindNoScheduleTaint(node corev1.Node) (corev1.Taint, bool) { + for _, taint := range node.Spec.Taints { + if taint.Effect == corev1.TaintEffectNoSchedule { + return taint, true + } + } + return corev1.Taint{}, false +} + +// GetAll retrieves all nodes present in the cluster +func GetAll(ctx context.Context, kclient client.Client) ([]corev1.Node, error) { + nodes := corev1.NodeList{} + err := kclient.List(ctx, &nodes) + return nodes.Items, err +} + +// FindReadyCondition searches a node's .Status for the NodeReady condition, and returns it alongside a boolean value which +// indicates whether the condition was found or not +func FindReadyCondition(node corev1.Node) (corev1.NodeCondition, bool) { + for _, condition := range node.Status.Conditions { + if condition.Type == corev1.NodeReady { + return condition, true + } + } + return corev1.NodeCondition{}, false +} + +// GetRole returns the role label of the provided node, along with a boolean indicating whether one was found +func GetRole(node corev1.Node) (string, bool) { + for label := range node.Labels { + if strings.Contains(label, RoleLabelPrefix) { + return label, true + } + } + return "", false +} diff --git a/pkg/investigations/utils/node/node_test.go b/pkg/investigations/utils/node/node_test.go new file mode 100644 index 00000000..7868c90e --- /dev/null +++ b/pkg/investigations/utils/node/node_test.go @@ -0,0 +1,204 @@ +package node + +import ( + "fmt" + "reflect" + "testing" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 
"k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func Test_FindNoScheduleTaint(t *testing.T) { + type result struct { + found bool + key string + } + + tests := []struct { + name string + node func() *corev1.Node + want result + }{ + { + name: "false when taint missing", + node: func() *corev1.Node { + node := newWorkerNode("taintless") + node.Spec.Taints = []corev1.Taint{} + return node + }, + want: result{ + found: false, + }, + }, + { + name: "return taint when it exists", + node: func() *corev1.Node { + node := newWorkerNode("tainted") + node.Spec.Taints = []corev1.Taint{ + { + Key: "tainted", + Effect: corev1.TaintEffectNoSchedule, + }, + } + return node + }, + want: result{ + key: "tainted", + found: true, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + node := tt.node() + + got, found := FindNoScheduleTaint(*node) + if found != tt.want.found { + t.Errorf("unexpected test result: expected found = %t, got = %t", tt.want.found, found) + } + if tt.want.found { + if got.Effect != corev1.TaintEffectNoSchedule { + t.Errorf("unexpected test result: expected taint effect = %q, got = %q", corev1.TaintEffectNoSchedule, got.Effect) + } + if got.Key != tt.want.key { + t.Errorf("unexpected test result: expected taint with key = %q, got = %q", tt.want.key, got.Key) + } + } + }) + } +} + +func Test_FindNodeReadyCondition(t *testing.T) { + readyCondition := corev1.NodeCondition{ + Type: corev1.NodeReady, + LastTransitionTime: metav1.Time{ + Time: time.Now(), + }, + } + + type result struct { + found bool + condition corev1.NodeCondition + } + + tests := []struct { + name string + node func() *corev1.Node + want result + }{ + { + name: "return false when missing condition", + node: func() *corev1.Node { + node := newWorkerNode("not-ready") + node.Status.Conditions = []corev1.NodeCondition{} + return node + }, + want: result{ + found: false, + }, + }, + { + name: "return true when Ready", + node: func() *corev1.Node { + node := 
newWorkerNode("ready") + node.Status.Conditions = []corev1.NodeCondition{readyCondition} + return node + }, + want: result{ + found: true, + condition: readyCondition, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + node := tt.node() + got, found := FindReadyCondition(*node) + + if found != tt.want.found { + t.Errorf("unexpected test result: expected found = %t, got = %t", tt.want.found, found) + } + if !reflect.DeepEqual(got, tt.want.condition) { + t.Errorf("unexpected test result: expected condition = %#v, got = %#v", tt.want.condition, got) + } + }) + } +} + +func Test_GetNodeRole(t *testing.T) { + var ( + workerLabel = fmt.Sprintf("%s/%s", RoleLabelPrefix, WorkerRoleSuffix) + infraLabel = fmt.Sprintf("%s/infra", RoleLabelPrefix) + ) + + type result struct { + label string + found bool + } + tests := []struct { + name string + node func() *corev1.Node + want result + }{ + { + name: "not found when no role label", + node: func() *corev1.Node { + node := newWorkerNode("labelless") + node.Labels = map[string]string{} + return node + }, + want: result{ + found: false, + }, + }, + { + name: "label returned when found on worker", + node: func() *corev1.Node { + node := newWorkerNode("labelled") + node.Labels = map[string]string{workerLabel: ""} + return node + }, + want: result{ + found: true, + label: workerLabel, + }, + }, + { + name: "label returned when found on non-worker", + node: func() *corev1.Node { + node := newWorkerNode("labelled") + node.Labels = map[string]string{infraLabel: ""} + return node + }, + want: result{ + found: true, + label: infraLabel, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + node := tt.node() + got, found := GetRole(*node) + if found != tt.want.found { + t.Errorf("unexpected test result: expected found = %t, got = %t", tt.want.found, found) + } + if got != tt.want.label { + t.Errorf("unexpected test result: expected label = %q, got = %q", tt.want.label, got) + } + 
}) + } +} + +func newWorkerNode(name string) *corev1.Node { + n := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{fmt.Sprintf("%s/%s", RoleLabelPrefix, WorkerRoleSuffix): ""}, + }, + } + return n +} diff --git a/pkg/k8s/scheme.go b/pkg/k8s/scheme.go index 6c3ef57f..4263d409 100644 --- a/pkg/k8s/scheme.go +++ b/pkg/k8s/scheme.go @@ -4,6 +4,7 @@ import ( "fmt" configv1 "github.com/openshift/api/config/v1" + machinev1beta1 "github.com/openshift/api/machine/v1beta1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" ) @@ -20,5 +21,9 @@ func initScheme() (*runtime.Scheme, error) { return nil, fmt.Errorf("unable to add config.openshift.io/v1 scheme: %w", err) } + if err := machinev1beta1.AddToScheme(scheme); err != nil { + return nil, fmt.Errorf("unable to add machine.openshift.io/v1beta1 scheme: %w", err) + } + return scheme, nil } diff --git a/test/generate_incident.sh b/test/generate_incident.sh index 53da375d..85bab87c 100755 --- a/test/generate_incident.sh +++ b/test/generate_incident.sh @@ -8,6 +8,7 @@ declare -A alert_mapping=( ["ClusterProvisioningDelay"]="ClusterProvisioningDelay -" ["ClusterMonitoringErrorBudgetBurnSRE"]="ClusterMonitoringErrorBudgetBurnSRE Critical (1)" ["InsightsOperatorDown"]="InsightsOperatorDown" + ["MachineHealthCheckUnterminatedShortCircuitSRE"]="MachineHealthCheckUnterminatedShortCircuitSRE CRITICAL (1)" ) # Function to print help message