Skip to content

Commit f246d1f

Browse files
committed
Initial implementation for CannotRetrieveUpdatesSRE
1 parent 53d07dc commit f246d1f

File tree

6 files changed

+340
-1
lines changed

6 files changed

+340
-1
lines changed
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
package CannotRetrieveUpdatesSRE
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"strings"
7+
8+
configv1 "github.com/openshift/api/config/v1"
9+
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
10+
k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s"
11+
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
12+
"github.com/openshift/configuration-anomaly-detection/pkg/networkverifier"
13+
"github.com/openshift/configuration-anomaly-detection/pkg/notewriter"
14+
"k8s.io/apimachinery/pkg/fields"
15+
"sigs.k8s.io/controller-runtime/pkg/client"
16+
)
17+
18+
// Identifiers used throughout this investigation.
const (
	// alertname is returned by Name and matched against incoming alert
	// titles in ShouldInvestigateAlert.
	alertname = "CannotRetrieveUpdatesSRE"
	// remediationName is handed to k8sclient.New and k8sclient.Cleanup.
	remediationName = alertname
)
22+
23+
type Investigation struct {
24+
kclient client.Client
25+
notes *notewriter.NoteWriter
26+
}
27+
28+
// setup initializes the investigation resources
29+
func (i *Investigation) setup(r *investigation.Resources) error {
30+
logging.Infof("Setting up investigation '%s' for cluster %s with remediation name %s",
31+
i.Name(), r.Cluster.ID(), r.Name)
32+
33+
k, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, remediationName)
34+
if err != nil {
35+
logging.Errorf("Failed to initialize Kubernetes client: %v", err)
36+
return fmt.Errorf("failed to initialize kubernetes client: %w", err)
37+
}
38+
i.kclient = k
39+
i.notes = notewriter.New(r.Name, logging.RawLogger)
40+
41+
logging.Infof("Successfully set up Kubernetes client and notewriter for remediation %s", r.Name)
42+
return nil
43+
}
44+
45+
// cleanup handles resource cleanup after investigation
46+
func (i *Investigation) cleanup(r *investigation.Resources) error {
47+
logging.Infof("Cleaning up investigation resources for cluster %s", r.Cluster.ID())
48+
err := k8sclient.Cleanup(r.Cluster.ID(), r.OcmClient, remediationName)
49+
if err != nil {
50+
logging.Errorf("Failed to cleanup Kubernetes client: %v", err)
51+
return fmt.Errorf("failed to cleanup kubernetes client: %w", err)
52+
}
53+
logging.Infof("Cleanup completed successfully for cluster %s", r.Cluster.ID())
54+
return nil
55+
}
56+
57+
// Run executes the investigation for the CannotRetrieveUpdatesSRE alert
58+
func (i *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
59+
result := investigation.InvestigationResult{}
60+
61+
// Setup & teardown
62+
err := i.setup(r)
63+
if err != nil {
64+
return result, fmt.Errorf("failed to setup investigation: %w", err)
65+
}
66+
defer func(r *investigation.Resources) {
67+
if err := i.cleanup(r); err != nil {
68+
logging.Errorf("Failed to cleanup investigation: %v", err)
69+
}
70+
}(r)
71+
72+
if r.Cluster == nil || r.Cluster.ID() == "" {
73+
errMsg := "Invalid cluster configuration: cluster or cluster ID is missing"
74+
logging.Errorf(errMsg)
75+
i.notes.AppendWarning(errMsg)
76+
return result, r.PdClient.EscalateIncidentWithNote(i.notes.String())
77+
}
78+
79+
// Run network verification
80+
logging.Infof("Running network verification for cluster %s", r.Cluster.ID())
81+
verifierResult, failureReason, err := networkverifier.Run(r.Cluster, r.ClusterDeployment, r.AwsClient)
82+
if err != nil {
83+
logging.Errorf("Network verifier failed: %v", err)
84+
i.notes.AppendWarning("Network verifier encountered an error: %v", err)
85+
} else {
86+
logging.Infof("Network verification completed with result: %v", verifierResult)
87+
switch verifierResult {
88+
case networkverifier.Success:
89+
i.notes.AppendSuccess("Network verifier passed")
90+
case networkverifier.Failure:
91+
logging.Infof("Network verifier reported failure: %s", failureReason)
92+
result.ServiceLogPrepared = investigation.InvestigationStep{Performed: true, Labels: nil}
93+
i.notes.AppendWarning("NetworkVerifier found unreachable targets. \n \n Verify and send service log if necessary: \n osdctl servicelog post %s -t https://raw.githubusercontent.com/openshift/managed-notifications/master/osd/required_network_egresses_are_blocked.json -p URLS=%s",
94+
r.Cluster.ID(), failureReason)
95+
}
96+
}
97+
98+
// Check ClusterVersion
99+
logging.Infof("Checking ClusterVersion for cluster %s", r.Cluster.ID())
100+
cvList := &configv1.ClusterVersionList{}
101+
listOptions := &client.ListOptions{FieldSelector: fields.SelectorFromSet(fields.Set{"metadata.name": "version"})}
102+
err = i.kclient.List(context.TODO(), cvList, listOptions)
103+
switch {
104+
case err != nil:
105+
logging.Errorf("Failed to list ClusterVersion: %v", err)
106+
i.notes.AppendWarning("Failed to list ClusterVersion: %v\nThis may indicate cluster access issues", err)
107+
case len(cvList.Items) != 1:
108+
logging.Warnf("Found %d ClusterVersions, expected 1", len(cvList.Items))
109+
i.notes.AppendWarning("Found %d ClusterVersions, expected 1", len(cvList.Items))
110+
default:
111+
versionCv := cvList.Items[0]
112+
logging.Infof("ClusterVersion found: %s", versionCv.Status.Desired.Version)
113+
for _, condition := range versionCv.Status.Conditions {
114+
logging.Debugf("Checking ClusterVersion condition: Type=%s, Status=%s, Reason=%s, Message=%s",
115+
condition.Type, condition.Status, condition.Reason, condition.Message)
116+
if condition.Type == "RetrievedUpdates" &&
117+
condition.Status == "False" &&
118+
condition.Reason == "VersionNotFound" &&
119+
strings.Contains(condition.Message, "Unable to retrieve available updates") {
120+
i.notes.AppendWarning("ClusterVersion error detected: %s\nThis indicates the current version %s is not found in the specified channel %s",
121+
condition.Message, versionCv.Status.Desired.Version, versionCv.Spec.Channel)
122+
}
123+
}
124+
fmt.Printf("Cluster version: %s\n", versionCv.Status.Desired.Version)
125+
}
126+
127+
i.notes.AppendWarning("Alert escalated to on-call primary for review.")
128+
logging.Infof("Escalating incident with notes for cluster %s", r.Cluster.ID())
129+
err = r.PdClient.EscalateIncidentWithNote(i.notes.String())
130+
if err != nil {
131+
logging.Errorf("Failed to escalate incident to PagerDuty: %v", err)
132+
return result, fmt.Errorf("failed to escalate incident: %w", err)
133+
}
134+
logging.Infof("Investigation completed and escalated successfully for cluster %s", r.Cluster.ID())
135+
136+
return result, nil
137+
}
138+
139+
func (i *Investigation) Name() string {
140+
return alertname
141+
}
142+
143+
func (i *Investigation) Description() string {
144+
return fmt.Sprintf("Investigates '%s' alerts by running network verifier and checking ClusterVersion", alertname)
145+
}
146+
147+
func (i *Investigation) ShouldInvestigateAlert(alert string) bool {
148+
return strings.Contains(alert, alertname)
149+
}
150+
151+
func (i *Investigation) IsExperimental() bool {
152+
return true
153+
}
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
package CannotRetrieveUpdatesSRE
2+
3+
import (
	"context"
	"errors"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
	cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
	configv1 "github.com/openshift/api/config/v1"
	awsmock "github.com/openshift/configuration-anomaly-detection/pkg/aws/mock"
	investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
	"github.com/openshift/configuration-anomaly-detection/pkg/logging"
	pdmock "github.com/openshift/configuration-anomaly-detection/pkg/pagerduty/mock"
	hivev1 "github.com/openshift/hive/apis/hive/v1"
	"go.uber.org/mock/gomock"
	v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"
)
19+
20+
var _ = Describe("CannotRetrieveUpdatesSRE Investigation", func() {
21+
var (
22+
mockCtrl *gomock.Controller
23+
clusterBuilder *cmv1.ClusterBuilder
24+
cluster *cmv1.Cluster
25+
clusterDeployment *hivev1.ClusterDeployment
26+
pdClient *pdmock.MockClient
27+
awsCli *awsmock.MockClient
28+
fakeClient client.Client
29+
scheme *runtime.Scheme
30+
inv *Investigation
31+
resources *investigation.Resources
32+
)
33+
34+
BeforeEach(func() {
35+
logging.InitLogger("fatal", "")
36+
37+
mockCtrl = gomock.NewController(GinkgoT())
38+
pdClient = pdmock.NewMockClient(mockCtrl)
39+
awsCli = awsmock.NewMockClient(mockCtrl)
40+
41+
// Setup cluster
42+
clusterBuilder = cmv1.NewCluster().ID("test-cluster")
43+
var err error
44+
cluster, err = clusterBuilder.Build()
45+
Expect(err).ToNot(HaveOccurred())
46+
47+
// Setup cluster deployment
48+
clusterDeployment = &hivev1.ClusterDeployment{
49+
Spec: hivev1.ClusterDeploymentSpec{
50+
ClusterMetadata: &hivev1.ClusterMetadata{
51+
InfraID: "infra_id",
52+
},
53+
},
54+
}
55+
56+
// Setup fake Kubernetes client
57+
scheme = runtime.NewScheme()
58+
Expect(configv1.AddToScheme(scheme)).To(Succeed())
59+
fakeClient = fake.NewClientBuilder().WithScheme(scheme).Build()
60+
61+
inv = &Investigation{
62+
kclient: fakeClient,
63+
}
64+
resources = &investigation.Resources{
65+
Cluster: cluster,
66+
ClusterDeployment: clusterDeployment,
67+
PdClient: pdClient,
68+
AwsClient: awsCli,
69+
Name: remediationName,
70+
}
71+
})
72+
73+
AfterEach(func() {
74+
mockCtrl.Finish()
75+
})
76+
77+
Describe("Run Method", func() {
78+
When("ClusterVersion has VersionNotFound condition", func() {
79+
It("Should detect the condition and escalate with appropriate notes", func() {
80+
// Setup ClusterVersion with VersionNotFound
81+
cv := &configv1.ClusterVersion{
82+
ObjectMeta: v1.ObjectMeta{Name: "version"},
83+
Spec: configv1.ClusterVersionSpec{Channel: "stable-4.18"},
84+
Status: configv1.ClusterVersionStatus{
85+
Desired: configv1.Release{Version: "4.18.5"},
86+
Conditions: []configv1.ClusterOperatorStatusCondition{
87+
{
88+
Type: "RetrievedUpdates",
89+
Status: "False",
90+
Reason: "VersionNotFound",
91+
Message: "Unable to retrieve available updates: version 4.18.5 not found",
92+
},
93+
},
94+
},
95+
}
96+
fakeClient = fake.NewClientBuilder().WithScheme(scheme).WithObjects(cv).Build()
97+
inv.kclient = fakeClient
98+
99+
// Arrange
100+
awsCli.EXPECT().GetSecurityGroupID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return("sg-123", nil)
101+
awsCli.EXPECT().GetSubnetID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return([]string{"subnet-1"}, nil)
102+
pdClient.EXPECT().EscalateIncidentWithNote(gomock.Any()).DoAndReturn(func(note string) error {
103+
Expect(note).To(ContainSubstring("Network verifier passed"))
104+
Expect(note).To(ContainSubstring("ClusterVersion error detected: Unable to retrieve available updates: version 4.18.5 not found"))
105+
Expect(note).To(ContainSubstring("This indicates the current version 4.18.5 is not found in the specified channel stable-4.18"))
106+
Expect(note).To(ContainSubstring("Alert escalated to on-call primary for review"))
107+
return nil
108+
})
109+
110+
// Act
111+
result, err := inv.Run(resources)
112+
113+
// Assert
114+
Expect(err).ToNot(HaveOccurred())
115+
Expect(result.ServiceLogPrepared.Performed).To(BeFalse())
116+
})
117+
})
118+
119+
When("Network verifier fails", func() {
120+
It("Should prepare a service log and escalate", func() {
121+
// Arrange
122+
awsCli.EXPECT().GetSecurityGroupID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return("sg-123", nil)
123+
awsCli.EXPECT().GetSubnetID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return([]string{"subnet-1"}, nil)
124+
pdClient.EXPECT().EscalateIncidentWithNote(gomock.Any()).DoAndReturn(func(note string) error {
125+
Expect(note).To(ContainSubstring("NetworkVerifier found unreachable targets"))
126+
Expect(note).To(ContainSubstring("osdctl servicelog post test-cluster"))
127+
Expect(note).To(ContainSubstring("Alert escalated to on-call primary for review"))
128+
return nil
129+
})
130+
131+
// Act
132+
result, err := inv.Run(resources)
133+
134+
// Assert
135+
Expect(err).ToNot(HaveOccurred())
136+
Expect(result.ServiceLogPrepared.Performed).To(BeTrue())
137+
})
138+
})
139+
140+
When("Kubernetes client fails to list ClusterVersion", func() {
141+
It("Should escalate with a warning note", func() {
142+
// Setup failing Kubernetes client by mocking List to fail
143+
fakeClient = fake.NewClientBuilder().WithScheme(scheme).WithRuntimeObjects().Build()
144+
inv.kclient = fakeClient
145+
146+
// Arrange
147+
awsCli.EXPECT().GetSecurityGroupID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return("sg-123", nil)
148+
awsCli.EXPECT().GetSubnetID(gomock.Eq(clusterDeployment.Spec.ClusterMetadata.InfraID)).Return([]string{"subnet-1"}, nil)
149+
pdClient.EXPECT().EscalateIncidentWithNote(gomock.Any()).DoAndReturn(func(note string) error {
150+
Expect(note).To(ContainSubstring("Network verifier passed"))
151+
Expect(note).To(ContainSubstring("Failed to list ClusterVersion"))
152+
Expect(note).To(ContainSubstring("This may indicate cluster access issues"))
153+
Expect(note).To(ContainSubstring("Alert escalated to on-call primary for review"))
154+
return nil
155+
})
156+
157+
// Act
158+
_, err := inv.Run(resources)
159+
160+
// Assert
161+
Expect(err).ToNot(HaveOccurred())
162+
})
163+
})
164+
})
165+
})
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# CannotRetrieveUpdatesSRE Investigation
2+
3+
Investigates the `CannotRetrieveUpdatesSRE` alert by running the network verifier and adding any relevant `ClusterVersion` condition errors to the incident notes before escalating.
4+
5+
## Investigation Logic
6+
7+
The `CannotRetrieveUpdatesSRE` investigation is designed to diagnose issues where an OpenShift cluster cannot retrieve updates from its configured channel. It performs two main checks:
8+
1. **Network Verification**: Uses the `networkverifier` package to ensure the cluster can reach required update endpoints.
9+
2. **ClusterVersion Check**: Examines the `ClusterVersion` resource for conditions indicating update retrieval failures, such as `VersionNotFound`.
10+
11+
## Testing
12+
13+
Refer to the [testing README](./testing/README.md) for instructions on testing this investigation.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Testing CannotRetrieveUpdatesSRE Investigation
2+
3+
TODO:
4+
- Add a test script or test objects to this directory for future maintainers to use
5+
- Edit this README file and add detailed instructions on how to use the script/objects to recreate the conditions for the investigation. Be sure to include any assumptions or prerequisites about the environment (disabling Hive SyncSets, etc.).

pkg/investigations/registry.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package investigations
22

33
import (
4+
CannotRetrieveUpdatesSRE "github.com/openshift/configuration-anomaly-detection/pkg/investigations/CannotRetrieveUpdatesSRE"
45
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/ccam"
56
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/chgm"
67
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/clustermonitoringerrorbudgetburn"
@@ -16,6 +17,7 @@ var availableInvestigations = []investigation.Investigation{
1617
&clustermonitoringerrorbudgetburn.Investigation{},
1718
&cpd.Investigation{},
1819
&insightsoperatordown.Investigation{},
20+
&CannotRetrieveUpdatesSRE.Investigation{},
1921
}
2022

2123
// GetInvestigation returns the first Investigation that applies to the given alert title.

test/generate_incident.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#!/bin/bash
1+
#!/bin/zsh
22
set -e
33

44
# Define the mapping of alert names to titles
@@ -8,6 +8,7 @@ declare -A alert_mapping=(
88
["ClusterProvisioningDelay"]="ClusterProvisioningDelay -"
99
["ClusterMonitoringErrorBudgetBurnSRE"]="ClusterMonitoringErrorBudgetBurnSRE Critical (1)"
1010
["InsightsOperatorDown"]="InsightsOperatorDown"
11+
["CannotRetrieveUpdatesSRE"]="CannotRetrieveUpdatesSRE"
1112
)
1213

1314
# Function to print help message

0 commit comments

Comments
 (0)