diff --git a/pkg/investigations/cannotretrieveupdatesre/README.md b/pkg/investigations/cannotretrieveupdatesre/README.md
new file mode 100644
index 00000000..d6fada18
--- /dev/null
+++ b/pkg/investigations/cannotretrieveupdatesre/README.md
@@ -0,0 +1,13 @@
+# cannotretrieveupdatesre Investigation
+
+Investigates the CannotRetrieveUpdatesSRE alert by running network verifier and posting some cluster version errors.
+
+## Investigation Logic
+
+The `CannotRetrieveUpdatesSRE` investigation is designed to diagnose issues where an OpenShift cluster cannot retrieve updates from its configured channel. It performs two main checks:
+1. **Network Verification**: Uses the `networkverifier` package to ensure the cluster can reach required update endpoints.
+2. **ClusterVersion Check**: Examines the `ClusterVersion` resource for conditions indicating update retrieval failures, such as `VersionNotFound`.
+
+## Testing
+
+Refer to the [testing README](./testing/README.md) for instructions on testing this investigation.
diff --git a/pkg/investigations/cannotretrieveupdatesre/cannotRetrieveUpdateSRE.go b/pkg/investigations/cannotretrieveupdatesre/cannotRetrieveUpdateSRE.go
new file mode 100644
index 00000000..c5add6ea
--- /dev/null
+++ b/pkg/investigations/cannotretrieveupdatesre/cannotRetrieveUpdateSRE.go
@@ -0,0 +1,169 @@
+package cannotretrieveupdatesre
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"strings"
+
+	configv1 "github.com/openshift/api/config/v1"
+	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
+	k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s"
+	"github.com/openshift/configuration-anomaly-detection/pkg/logging"
+	"github.com/openshift/configuration-anomaly-detection/pkg/networkverifier"
+	"github.com/openshift/configuration-anomaly-detection/pkg/notewriter"
+	"k8s.io/apimachinery/pkg/fields"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+const (
+	alertname       = "CannotRetrieveUpdatesSRE"
+	remediationName = "cannotretrieveupdatesre"
+)
+
+type Investigation struct {
+	kclient client.Client
+	notes   *notewriter.NoteWriter
+}
+
+// Run executes the investigation for the CannotRetrieveUpdatesSRE alert
+func (i *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
+	result := investigation.InvestigationResult{}
+
+	// Setup
+	err := i.setup(r)
+	if err != nil {
+		return result, fmt.Errorf("failed to setup investigation: %w", err)
+	}
+
+	defer func(r *investigation.Resources) {
+		logging.Infof("Cleaning up investigation resources for cluster %s", r.Cluster.ID())
+		if cleanupErr := k8sclient.Cleanup(r.Cluster.ID(), r.OcmClient, remediationName); cleanupErr != nil {
+			logging.Errorf("Failed to cleanup Kubernetes client: %v", cleanupErr)
+		} else {
+			logging.Infof("Cleanup completed successfully for cluster %s", r.Cluster.ID())
+		}
+	}(r)
+
+	if err := i.checkClusterValidity(r); err != nil {
+		logging.Errorf("Cluster validation failed: %v", err)
+		return result, r.PdClient.EscalateIncidentWithNote(i.notes.String())
+	}
+
+	if err := i.runNetworkVerifier(r, &result); err != nil {
+		logging.Errorf("Network verification failed: %v", err)
+	}
+
+	if err := i.checkClusterVersion(r); err != nil {
+		logging.Errorf("ClusterVersion check failed: %v", err)
+	}
+
+	i.notes.AppendWarning("Alert escalated to on-call primary for review.")
+	logging.Infof("Escalating incident with notes for cluster %s", r.Cluster.ID())
+	err = r.PdClient.EscalateIncidentWithNote(i.notes.String())
+	if err != nil {
+		logging.Errorf("Failed to escalate incident to PagerDuty: %v", err)
+		return result, fmt.Errorf("failed to escalate incident: %w", err)
+	}
+	logging.Infof("Investigation completed and escalated successfully for cluster %s", r.Cluster.ID())
+
+	return result, nil
+}
+
+func (i *Investigation) checkClusterValidity(r *investigation.Resources) error {
+	if r.Cluster == nil || r.Cluster.ID() == "" {
+		errMsg := "invalid cluster configuration: cluster or cluster ID is missing"
+		i.notes.AppendWarning(errMsg)
+		return errors.New(errMsg)
+	}
+	return nil
+}
+
+func (i *Investigation) runNetworkVerifier(r *investigation.Resources, result *investigation.InvestigationResult) error {
+	logging.Infof("Running network verification for cluster %s", r.Cluster.ID())
+	verifierResult, failureReason, err := networkverifier.Run(r.Cluster, r.ClusterDeployment, r.AwsClient)
+	if err != nil {
+		i.notes.AppendWarning("Network verifier encountered an error: %v", err)
+		return fmt.Errorf("network verifier failed: %w", err)
+	}
+
+	logging.Infof("Network verification completed with result: %v", verifierResult)
+	switch verifierResult {
+	case networkverifier.Success:
+		i.notes.AppendSuccess("Network verifier passed")
+	case networkverifier.Failure:
+		logging.Infof("Network verifier reported failure: %s", failureReason)
+		result.ServiceLogPrepared = investigation.InvestigationStep{Performed: true, Labels: nil}
+		i.notes.AppendWarning("NetworkVerifier found unreachable targets. \n \n Verify and send service log if necessary: \n osdctl servicelog post %s -t https://raw.githubusercontent.com/openshift/managed-notifications/master/osd/required_network_egresses_are_blocked.json -p URLS=%s",
+			r.Cluster.ID(), failureReason)
+		return errors.New("network verification failed: " + failureReason)
+	}
+	return nil
+}
+
+func (i *Investigation) checkClusterVersion(r *investigation.Resources) error {
+	logging.Infof("Checking ClusterVersion for cluster %s", r.Cluster.ID())
+	cvList := &configv1.ClusterVersionList{}
+	listOptions := &client.ListOptions{FieldSelector: fields.SelectorFromSet(fields.Set{"metadata.name": "version"})}
+	err := i.kclient.List(context.TODO(), cvList, listOptions)
+	if err != nil {
+		i.notes.AppendWarning("Failed to list ClusterVersion: %v\nThis may indicate cluster access issues", err)
+		return fmt.Errorf("failed to list ClusterVersion: %w", err)
+	}
+	if len(cvList.Items) != 1 {
+		errMsg := fmt.Sprintf("found %d ClusterVersions, expected 1", len(cvList.Items))
+		logging.Warnf("%s", errMsg)
+		i.notes.AppendWarning(errMsg)
+		return errors.New(errMsg)
+	}
+
+	versionCv := cvList.Items[0]
+	logging.Infof("ClusterVersion found: %s", versionCv.Status.Desired.Version)
+	for _, condition := range versionCv.Status.Conditions {
+		logging.Debugf("Checking ClusterVersion condition: Type=%s, Status=%s, Reason=%s, Message=%s",
+			condition.Type, condition.Status, condition.Reason, condition.Message)
+		if condition.Type == "RetrievedUpdates" &&
+			condition.Status == "False" &&
+			condition.Reason == "VersionNotFound" &&
+			strings.Contains(condition.Message, "Unable to retrieve available updates") {
+			i.notes.AppendWarning("ClusterVersion error detected: %s\nThis indicates the current version %s is not found in the specified channel %s",
+				condition.Message, versionCv.Status.Desired.Version, versionCv.Spec.Channel)
+			return errors.New("clusterversion validation failed: VersionNotFound")
+		}
+	}
+	logging.Infof("Cluster version: %s",
+		versionCv.Status.Desired.Version)
+	return nil
+}
+
+// setup initializes the investigation resources
+func (i *Investigation) setup(r *investigation.Resources) error {
+	logging.Infof("Setting up investigation '%s' for cluster %s with remediation name %s",
+		i.Name(), r.Cluster.ID(), r.Name)
+
+	k, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, remediationName)
+	if err != nil {
+		logging.Errorf("Failed to initialize Kubernetes client: %v", err)
+		return fmt.Errorf("failed to initialize kubernetes client: %w", err)
+	}
+	i.kclient = k
+	i.notes = notewriter.New(r.Name, logging.RawLogger)
+
+	logging.Infof("Successfully set up Kubernetes client and notewriter for remediation %s", r.Name)
+	return nil
+}
+
+func (i *Investigation) Name() string {
+	return alertname
+}
+
+func (i *Investigation) Description() string {
+	return fmt.Sprintf("Investigates '%s' alerts by running network verifier and checking ClusterVersion", alertname)
+}
+
+func (i *Investigation) ShouldInvestigateAlert(alert string) bool {
+	return strings.Contains(alert, alertname)
+}
+
+func (i *Investigation) IsExperimental() bool {
+	return true
+}
diff --git a/pkg/investigations/cannotretrieveupdatesre/cannotRetrieveUpdateSRE_test.go b/pkg/investigations/cannotretrieveupdatesre/cannotRetrieveUpdateSRE_test.go
new file mode 100644
index 00000000..fe235b91
--- /dev/null
+++ b/pkg/investigations/cannotretrieveupdatesre/cannotRetrieveUpdateSRE_test.go
@@ -0,0 +1,161 @@
+package cannotretrieveupdatesre
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
+	configv1 "github.com/openshift/api/config/v1"
+	awsmock "github.com/openshift/configuration-anomaly-detection/pkg/aws/mock"
+	investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
+	"github.com/openshift/configuration-anomaly-detection/pkg/logging"
+	pdmock "github.com/openshift/configuration-anomaly-detection/pkg/pagerduty/mock"
+	hivev1 "github.com/openshift/hive/apis/hive/v1"
+	"go.uber.org/mock/gomock"
+	v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+)
+
+var _ = Describe("CannotRetrieveUpdatesSRE Investigation", func() {
+	var (
+		mockCtrl          *gomock.Controller
+		clusterBuilder    *cmv1.ClusterBuilder
+		cluster           *cmv1.Cluster
+		clusterDeployment *hivev1.ClusterDeployment
+		pdClient          *pdmock.MockClient
+		awsCli            *awsmock.MockClient
+		fakeClient        client.Client
+		scheme            *runtime.Scheme
+		inv               *Investigation
+		resources         *investigation.Resources
+	)
+
+	BeforeEach(func() {
+		logging.InitLogger("fatal", "")
+
+		mockCtrl = gomock.NewController(GinkgoT())
+		pdClient = pdmock.NewMockClient(mockCtrl)
+		awsCli = awsmock.NewMockClient(mockCtrl)
+
+		clusterBuilder = cmv1.NewCluster().ID("test-cluster")
+		var err error
+		cluster, err = clusterBuilder.Build()
+		Expect(err).ToNot(HaveOccurred())
+
+		clusterDeployment = &hivev1.ClusterDeployment{
+			Spec: hivev1.ClusterDeploymentSpec{
+				ClusterMetadata: &hivev1.ClusterMetadata{
+					InfraID: "infra_id",
+				},
+			},
+		}
+
+		scheme = runtime.NewScheme()
+		Expect(configv1.AddToScheme(scheme)).To(Succeed())
+		fakeClient = fake.NewClientBuilder().WithScheme(scheme).Build()
+
+		inv = &Investigation{
+			kclient: fakeClient,
+		}
+		resources = &investigation.Resources{
+			Cluster:           cluster,
+			ClusterDeployment: clusterDeployment,
+			PdClient:          pdClient,
+			AwsClient:         awsCli,
+			Name:              remediationName,
+		}
+	})
+
+	AfterEach(func() {
+		mockCtrl.Finish()
+	})
+
+	Describe("Run Method", func() {
+		When("ClusterVersion has VersionNotFound condition", func() {
+			It("Should detect the condition and escalate with appropriate notes", func() {
+				cv := &configv1.ClusterVersion{
+					ObjectMeta: v1.ObjectMeta{Name: "version"},
+					Spec:       configv1.ClusterVersionSpec{Channel: "stable-4.18"},
+					Status: configv1.ClusterVersionStatus{
+						Desired: configv1.Release{Version: "4.18.5"},
+						Conditions: []configv1.ClusterOperatorStatusCondition{
+							{
+								Type:    "RetrievedUpdates",
+								Status:  "False",
+								Reason:  "VersionNotFound",
+								Message: "Unable to retrieve available updates: version 4.18.5 not found",
+							},
+						},
+					},
+				}
+				fakeClient = fake.NewClientBuilder().WithScheme(scheme).WithObjects(cv).Build()
+				inv.kclient = fakeClient
+
+				// Arrange
+				awsCli.EXPECT().GetSecurityGroupID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return("sg-123", nil)
+				awsCli.EXPECT().GetSubnetID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return([]string{"subnet-1"}, nil)
+				pdClient.EXPECT().EscalateIncidentWithNote(gomock.Any()).DoAndReturn(func(note string) error {
+					Expect(note).To(ContainSubstring("Network verifier passed"))
+					Expect(note).To(ContainSubstring("ClusterVersion error detected: Unable to retrieve available updates: version 4.18.5 not found"))
+					Expect(note).To(ContainSubstring("This indicates the current version 4.18.5 is not found in the specified channel stable-4.18"))
+					Expect(note).To(ContainSubstring("Alert escalated to on-call primary for review"))
+					return nil
+				})
+
+				// Act
+				result, err := inv.Run(resources)
+
+				// Assert
+				Expect(err).ToNot(HaveOccurred())
+				Expect(result.ServiceLogPrepared.Performed).To(BeFalse())
+			})
+		})
+
+		When("Network verifier fails", func() {
+			It("Should prepare a service log and escalate", func() {
+				// Arrange
+				awsCli.EXPECT().GetSecurityGroupID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return("sg-123", nil)
+				awsCli.EXPECT().GetSubnetID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return([]string{"subnet-1"}, nil)
+				pdClient.EXPECT().EscalateIncidentWithNote(gomock.Any()).DoAndReturn(func(note string) error {
+					Expect(note).To(ContainSubstring("NetworkVerifier found unreachable targets"))
+					Expect(note).To(ContainSubstring("osdctl servicelog post test-cluster"))
+					Expect(note).To(ContainSubstring("Alert escalated to on-call primary for review"))
+					return nil
+				})
+
+				// Act
+				result, err := inv.Run(resources)
+
+				// Assert
+				Expect(err).ToNot(HaveOccurred())
+				Expect(result.ServiceLogPrepared.Performed).To(BeTrue())
+			})
+		})
+
+		When("Kubernetes client fails to list ClusterVersion", func() {
+			It("Should escalate with a warning note", func() {
+				fakeClient = fake.NewClientBuilder().WithScheme(scheme).WithRuntimeObjects().Build()
+				inv.kclient = fakeClient
+
+				// Arrange
+				awsCli.EXPECT().GetSecurityGroupID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return("sg-123", nil)
+				awsCli.EXPECT().GetSubnetID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return([]string{"subnet-1"}, nil)
+				pdClient.EXPECT().EscalateIncidentWithNote(gomock.Any()).DoAndReturn(func(note string) error {
+					Expect(note).To(ContainSubstring("Network verifier passed"))
+					Expect(note).To(ContainSubstring("Failed to list ClusterVersion"))
+					Expect(note).To(ContainSubstring("This may indicate cluster access issues"))
+					Expect(note).To(ContainSubstring("Alert escalated to on-call primary for review"))
+					return nil
+				})
+
+				// Act
+				result, err := inv.Run(resources)
+
+				// Assert
+				Expect(err).ToNot(HaveOccurred())
+				Expect(result.ServiceLogPrepared.Performed).To(BeFalse())
+			})
+		})
+	})
+})
diff --git a/pkg/investigations/CannotRetrieveUpdatesSRE/metadata.yaml b/pkg/investigations/cannotretrieveupdatesre/metadata.yaml
similarity index 100%
rename from pkg/investigations/CannotRetrieveUpdatesSRE/metadata.yaml
rename to pkg/investigations/cannotretrieveupdatesre/metadata.yaml
diff --git a/pkg/investigations/cannotretrieveupdatesre/testing/README.md b/pkg/investigations/cannotretrieveupdatesre/testing/README.md
new file mode 100644
index 00000000..bf26fffb
--- /dev/null
+++ b/pkg/investigations/cannotretrieveupdatesre/testing/README.md
@@ -0,0 +1,5 @@
+# Testing CannotRetrieveUpdatesSRE Investigation
+
+TODO:
+- Add a test script or test objects to this directory for future maintainers to use
+- Edit this README file and add detailed instructions on how to use the script/objects to recreate the conditions for the investigation. Be sure to include any assumptions or prerequisites about the environment (disable hive syncsetting, etc)
diff --git a/pkg/investigations/registry.go b/pkg/investigations/registry.go
index f69a027c..9a6aa1b9 100644
--- a/pkg/investigations/registry.go
+++ b/pkg/investigations/registry.go
@@ -1,6 +1,7 @@
 package investigations
 
 import (
+	cannotretrieveupdatesre "github.com/openshift/configuration-anomaly-detection/pkg/investigations/cannotretrieveupdatesre"
 	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/ccam"
 	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/chgm"
 	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/clustermonitoringerrorbudgetburn"
@@ -16,6 +17,7 @@ var availableInvestigations = []investigation.Investigation{
 	&clustermonitoringerrorbudgetburn.Investigation{},
 	&cpd.Investigation{},
 	&insightsoperatordown.Investigation{},
+	&cannotretrieveupdatesre.Investigation{},
 }
 
 // GetInvestigation returns the first Investigation that applies to the given alert title.
diff --git a/test/generate_incident.sh b/test/generate_incident.sh
index 53da375d..48a469a8 100755
--- a/test/generate_incident.sh
+++ b/test/generate_incident.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 set -e
 
 # Define the mapping of alert names to titles
@@ -8,6 +8,7 @@ declare -A alert_mapping=(
     ["ClusterProvisioningDelay"]="ClusterProvisioningDelay -"
     ["ClusterMonitoringErrorBudgetBurnSRE"]="ClusterMonitoringErrorBudgetBurnSRE Critical (1)"
     ["InsightsOperatorDown"]="InsightsOperatorDown"
+    ["CannotRetrieveUpdatesSRE"]="CannotRetrieveUpdatesSRE"
 )
 
 # Function to print help message