OSD-28525 - Initial implementation for MachineHealthCheckUnterminatedShortCircuitSRE investigation #395

Merged
merged 1 commit on May 7, 2025

go.mod (4 changes: 1 addition & 3 deletions)
@@ -22,12 +22,12 @@ require (
 github.com/prometheus/client_golang v1.22.0
 github.com/prometheus/common v0.63.0
 github.com/spf13/cobra v1.8.1
-github.com/stretchr/testify v1.10.0
 go.uber.org/mock v0.4.0
 go.uber.org/zap v1.27.0
 gopkg.in/yaml.v2 v2.4.0
 k8s.io/api v0.31.3
 k8s.io/apimachinery v0.31.3
+k8s.io/client-go v0.31.3
 sigs.k8s.io/controller-runtime v0.19.0
 )

@@ -123,7 +123,6 @@ require (
 github.com/peterbourgon/diskv v2.0.1+incompatible // indirect
 github.com/pkg/browser v0.0.0-20210911075715-681adbf594b8 // indirect
 github.com/pkg/errors v0.9.1 // indirect
-github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
 github.com/prometheus/client_model v0.6.1 // indirect
 github.com/prometheus/procfs v0.15.1 // indirect
 github.com/sagikazarmark/locafero v0.7.0 // indirect
@@ -155,7 +154,6 @@ require (
 gopkg.in/ini.v1 v1.67.0 // indirect
 gopkg.in/yaml.v3 v3.0.1 // indirect
 k8s.io/cli-runtime v0.30.3 // indirect
-k8s.io/client-go v0.31.3 // indirect
 k8s.io/klog/v2 v2.130.1 // indirect
 k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect
 k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 // indirect

New file (329 additions):
@@ -0,0 +1,329 @@
/*
Package machinehealthcheckunterminatedshortcircuitsre contains the investigation logic for the MachineHealthCheckUnterminatedShortCircuitSRE alert
*/
package machinehealthcheckunterminatedshortcircuitsre

import (
"context"
"errors"
"fmt"
"slices"
"strings"
"time"

machinev1beta1 "github.com/openshift/api/machine/v1beta1"

"github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
machineutil "github.com/openshift/configuration-anomaly-detection/pkg/investigations/utils/machine"
nodeutil "github.com/openshift/configuration-anomaly-detection/pkg/investigations/utils/node"
k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s"
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
"github.com/openshift/configuration-anomaly-detection/pkg/notewriter"

corev1 "k8s.io/api/core/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
)

const (
alertname = "MachineHealthCheckUnterminatedShortCircuitSRE"
// remediationName must match the name of this investigation's directory, so it can be looked up via the backplane-api
remediationName = "machineHealthCheckUnterminatedShortCircuitSRE"
)

// Investigation investigates MachineHealthCheckUnterminatedShortCircuitSRE alerts
type Investigation struct {
// kclient provides access to on-cluster resources
kclient client.Client
// notes holds the messages that will be shared with Primary upon completion
notes *notewriter.NoteWriter
// recommendations holds the set of actions CAD recommends primary to take
recommendations investigationRecommendations
}
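
// The investigationRecommendations type, its recommendation* action kinds, and the
// addRecommendation/summarize helpers used throughout this file are defined in a
// sibling file of this package that is not part of this diff. A minimal sketch of the
// shape implied by their usage here (hypothetical; the real definitions may differ):
//
//	type recommendedActionType string
//
//	type recommendation struct {
//		name  string // the affected machine or node
//		notes string // why the action is recommended
//	}
//
//	// investigationRecommendations groups recommendations by the action to take;
//	// summarize() renders the collected recommendations into the note shared with Primary
//	type investigationRecommendations map[recommendedActionType][]recommendation
//
//	func (r investigationRecommendations) addRecommendation(action recommendedActionType, name, notes string) {
//		r[action] = append(r[action], recommendation{name: name, notes: notes})
//	}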

func (i *Investigation) setup(r *investigation.Resources) error {
// Setup investigation
k, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, remediationName)
if err != nil {
return fmt.Errorf("failed to initialize kubernetes client: %w", err)
}
i.kclient = k
i.notes = notewriter.New(r.Name, logging.RawLogger)
i.recommendations = investigationRecommendations{}

return nil
}

// Run investigates the MachineHealthCheckUnterminatedShortCircuitSRE alert
//
// The investigation seeks to provide exactly one recommended action per affected machine/node pair.
// The machine object is evaluated first, as it represents a lower-level object that could affect the health of the node.
// If the investigation determines that the breakage is occurring at the machine-level, the corresponding node is *not* investigated.
// After investigating all affected machines, potentially affected nodes are investigated.
func (i *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
ctx := context.Background()
result := investigation.InvestigationResult{}

// Setup & teardown
err := i.setup(r)
if err != nil {
return result, fmt.Errorf("failed to setup investigation: %w", err)
}
defer func(r *investigation.Resources) {
err := k8sclient.Cleanup(r.Cluster.ID(), r.OcmClient, remediationName)
if err != nil {
logging.Errorf("failed to cleanup investigation: %w", err)
}
}(r)

targetMachines, err := i.getMachinesFromFailingMHC(ctx)
if err != nil {
i.notes.AppendWarning("failed to retrieve one or more target machines: %v", err)
}
if len(targetMachines) == 0 {
i.notes.AppendWarning("no machines found for short-circuited machinehealthcheck objects")
return result, r.PdClient.EscalateIncidentWithNote(i.notes.String())
}

problemMachines, err := i.InvestigateMachines(ctx, targetMachines)
if err != nil {
i.notes.AppendWarning("failed to investigate machines: %v", err)
}

// Trim out the machines that we've already investigated and know have problems
//
// The purpose of this is to avoid re-investigating nodes whose machines were already investigated. Any node-level issue on a failing machine
// is most likely related to the machine itself, and providing duplicate/conflicting advice will only prove confusing to the responder
targetMachines = slices.DeleteFunc(targetMachines, func(targetMachine machinev1beta1.Machine) bool {
for _, problemMachine := range problemMachines {
if problemMachine.Name == targetMachine.Name {
return true
}
}
return false
})

// If one or more machines managed by the machinehealthcheck have not yet been identified as a problem, check on the machine's
// node to determine if there are node-level problems that need remediating
if len(targetMachines) > 0 {
targetNodes, err := machineutil.GetNodesForMachines(ctx, i.kclient, targetMachines)
if err != nil {
i.notes.AppendWarning("failed to retrieve one or more target nodes: %v", err)
}
if len(targetNodes) > 0 {
i.InvestigateNodes(targetNodes)
}
}

// Summarize recommendations from investigation in PD notes, if any found
if len(i.recommendations) > 0 {
i.notes.AppendWarning(i.recommendations.summarize())
} else {
i.notes.AppendSuccess("no recommended actions to take against cluster")
}

return result, r.PdClient.EscalateIncidentWithNote(i.notes.String())
}

func (i *Investigation) getMachinesFromFailingMHC(ctx context.Context) ([]machinev1beta1.Machine, error) {
healthchecks := machinev1beta1.MachineHealthCheckList{}
err := i.kclient.List(ctx, &healthchecks, &client.ListOptions{Namespace: machineutil.MachineNamespace})
if err != nil {
return []machinev1beta1.Machine{}, fmt.Errorf("failed to retrieve machinehealthchecks from cluster: %w", err)
}

targets := []machinev1beta1.Machine{}
for _, healthcheck := range healthchecks.Items {
if !machineutil.HealthcheckRemediationAllowed(healthcheck) {
machines, err := machineutil.GetMachinesForMHC(ctx, i.kclient, healthcheck)
if err != nil {
i.notes.AppendWarning("failed to retrieve machines from machinehealthcheck %q: %v", healthcheck.Name, err)
continue
}
targets = append(targets, machines...)
}
}

return targets, nil
}
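
// machineutil.HealthcheckRemediationAllowed is defined in pkg/investigations/utils/machine
// and is not part of this diff. A plausible sketch, assuming it keys off the
// MachineHealthCheck's .Status.RemediationsAllowed field (hypothetical; the real helper
// may consider more than this):
//
//	func HealthcheckRemediationAllowed(healthcheck machinev1beta1.MachineHealthCheck) bool {
//		return healthcheck.Status.RemediationsAllowed > 0
//	}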

// InvestigateMachines evaluates the state of the given machines, recording a categorized recommendation for each machine
// found to be deleting or Failed, and returns the list of machines it flagged
func (i *Investigation) InvestigateMachines(ctx context.Context, targets []machinev1beta1.Machine) ([]machinev1beta1.Machine, error) {
investigatedMachines := []machinev1beta1.Machine{}
var errs error
for _, machine := range targets {
if machine.DeletionTimestamp != nil {
err := i.investigateDeletingMachine(ctx, machine)
investigatedMachines = append(investigatedMachines, machine)
if err != nil {
errs = errors.Join(errs, fmt.Errorf("failed to investigate deleting machine: %w", err))
}
continue
}

if (machine.Status.Phase != nil && *machine.Status.Phase == machinev1beta1.PhaseFailed) || machine.Status.ErrorReason != nil {
err := i.investigateFailingMachine(machine)
investigatedMachines = append(investigatedMachines, machine)
if err != nil {
errs = errors.Join(errs, fmt.Errorf("failed to investigate failing machine: %w", err))
}
}
}

if len(investigatedMachines) == 0 {
i.notes.AppendSuccess("no deleting or Failed machines found")
}
return investigatedMachines, errs
}

// investigateFailingMachine evaluates a machine whose .Status.Phase is 'Failed' and provides a recommendation based on the cause of the failure
func (i *Investigation) investigateFailingMachine(machine machinev1beta1.Machine) error {
role, err := machineutil.GetRole(machine)
if err != nil {
// Failing to determine whether a machine is Red Hat-managed warrants manual investigation
notes := fmt.Sprintf("unable to determine machine role: %v", err)
i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes)
return fmt.Errorf("failed to determine role for machine %q: %w", machine.Name, err)
}

// Safely dereference status fields to avoid panics in cases where machines' statuses haven't been fully populated
var errorMsg string
if machine.Status.ErrorMessage != nil {
errorMsg = *machine.Status.ErrorMessage
}

var errorReason machinev1beta1.MachineStatusError
if machine.Status.ErrorReason != nil {
errorReason = *machine.Status.ErrorReason
}

if role != machineutil.WorkerRoleLabelValue {
// If the machine is Red Hat-owned, always require manual investigation
notes := fmt.Sprintf("Red Hat-owned machine in state %q due to %q", errorReason, errorMsg)
i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes)
return nil
}

switch errorReason {
case machinev1beta1.IPAddressInvalidReason:
notes := fmt.Sprintf("invalid IP address: %q. Deleting machine may allow the cloud provider to assign a valid IP address", errorMsg)
i.recommendations.addRecommendation(recommendationDeleteMachine, machine.Name, notes)

case machinev1beta1.CreateMachineError:
notes := fmt.Sprintf("machine failed to create: %q. Deleting machine may resolve any transient issues with the cloud provider", errorMsg)
i.recommendations.addRecommendation(recommendationDeleteMachine, machine.Name, notes)

case machinev1beta1.InvalidConfigurationMachineError:
notes := fmt.Sprintf("the machine configuration is invalid: %q. Checking splunk audit logs may indicate whether the customer has modified the machine or its machineset", errorMsg)
i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes)

case machinev1beta1.DeleteMachineError:
notes := fmt.Sprintf("the machine's node could not be gracefully terminated automatically: %q", errorMsg)
i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes)

case machinev1beta1.InsufficientResourcesMachineError:
notes := fmt.Sprintf("a servicelog should be sent because there is insufficient quota to provision the machine: %q", errorMsg)
i.recommendations.addRecommendation(recommendationQuotaServiceLog, machine.Name, notes)

default:
notes := "no .Status.ErrorReason found for machine"
i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes)
}
return nil
}

// investigateDeletingMachine evaluates a machine that is being deleted to determine if and why it is blocked, and records a recommendation on how to unblock it
func (i *Investigation) investigateDeletingMachine(ctx context.Context, machine machinev1beta1.Machine) error {
if machine.Status.NodeRef == nil {
notes := "machine stuck deleting with no node"
i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes)
return nil
}
node, err := machineutil.GetNodeForMachine(ctx, i.kclient, machine)
if err != nil {
notes := "machine's node could not be determined"
i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes)
return fmt.Errorf("failed to retrieve node for machine %q: %w", machine.Name, err)
}

stuck, duration := checkForStuckDrain(node)
if stuck {
notes := fmt.Sprintf("node %q found to be stuck draining for %s", node.Name, duration.Truncate(time.Second).String())
i.recommendations.addRecommendation(recommendationInvestigateNode, node.Name, notes)
return nil
}

notes := "unable to determine why machine is stuck deleting"
i.recommendations.addRecommendation(recommendationInvestigateMachine, machine.Name, notes)
return nil
}

// checkForStuckDrain makes a best-effort determination of whether a node is stuck draining and, if so, for how long
func checkForStuckDrain(node corev1.Node) (bool, *time.Duration) {
if len(node.Spec.Taints) == 0 {
return false, nil
}

taint, found := nodeutil.FindNoScheduleTaint(node)
if !found {
return false, nil
}

// TODO - Once CAD can access on-cluster metrics, we can query the `pods_preventing_node_drain` metric from prometheus
// to more accurately gauge if a node is truly stuck deleting, and what pod is causing it
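// Heuristic threshold: assume a NoSchedule taint older than 10 minutes indicates the
// drain is blocked rather than merely slow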
drainDuration := time.Since(taint.TimeAdded.Time)
if drainDuration > 10*time.Minute {
return true, &drainDuration
}

return false, nil
}

// InvestigateNodes examines the provided nodes and records recommended actions, if any are needed
func (i *Investigation) InvestigateNodes(nodes []corev1.Node) {
for _, node := range nodes {
i.InvestigateNode(node)
}
}

// InvestigateNode examines a node and records a recommendation for further investigation if one is required
func (i *Investigation) InvestigateNode(node corev1.Node) {
roleLabel, found := nodeutil.GetRole(node)
if !found {
notes := fmt.Sprintf("no role label containing %q found for node", nodeutil.RoleLabelPrefix)
i.recommendations.addRecommendation(recommendationInvestigateNode, node.Name, notes)
return
} else if !strings.Contains(roleLabel, nodeutil.WorkerRoleSuffix) {
notes := "non-worker node affected"
i.recommendations.addRecommendation(recommendationInvestigateNode, node.Name, notes)
return
}

ready, found := nodeutil.FindReadyCondition(node)
if !found {
notes := "found no 'Ready' .Status.Condition for the node"
i.recommendations.addRecommendation(recommendationInvestigateNode, node.Name, notes)
return
}

if ready.Status != corev1.ConditionTrue {
lastCheckinElapsed := time.Since(ready.LastHeartbeatTime.Time).Truncate(time.Second)
notes := fmt.Sprintf("node has been %q for %s", ready.Status, lastCheckinElapsed)
i.recommendations.addRecommendation(recommendationInvestigateNode, node.Name, notes)
}
}

func (i *Investigation) Name() string {
return alertname
}

func (i *Investigation) Description() string {
return fmt.Sprintf("Investigates '%s' alerts", alertname)
}

func (i *Investigation) IsExperimental() bool {
return true
}

func (i *Investigation) ShouldInvestigateAlert(alert string) bool {
return strings.Contains(alert, alertname)
}
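
Together, Run, Name, Description, IsExperimental, and ShouldInvestigateAlert make up the
contract CAD uses to register and dispatch investigations. A minimal sketch of the
interface these methods appear to satisfy, inferred only from the signatures above (the
actual definition lives in pkg/investigations/investigation and may differ):

package investigation

// Investigation is the contract each alert investigation implements (inferred sketch)
type Investigation interface {
	Run(r *Resources) (InvestigationResult, error)
	Name() string
	Description() string
	IsExperimental() bool
	ShouldInvestigateAlert(alert string) bool
}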