From 151b40efde605dacbaccfade4ece65e76a0c04c5 Mon Sep 17 00:00:00 2001 From: RaphaelBut Date: Fri, 14 Mar 2025 13:10:02 +0000 Subject: [PATCH 1/5] Only require aws client for investigations that need it This comes out of a need to run backplane-api locally, but its unable to get aws access. Also, part of the bigger picture of only initializing required integrations. --- cadctl/cmd/investigate/investigate.go | 22 ++++++++++++------- pkg/investigations/ccam/ccam.go | 4 ++++ pkg/investigations/chgm/chgm.go | 16 +++++++++----- pkg/investigations/chgm/chgm_test.go | 2 +- .../clustermonitoringerrorbudgetburn.go | 4 ++++ pkg/investigations/cpd/cpd.go | 3 +++ .../investigation/investigation.go | 1 + pkg/investigations/registry.go | 4 +++- 8 files changed, 40 insertions(+), 16 deletions(-) diff --git a/cadctl/cmd/investigate/investigate.go b/cadctl/cmd/investigate/investigate.go index 24b6ca98..0d01b78b 100644 --- a/cadctl/cmd/investigate/investigate.go +++ b/cadctl/cmd/investigate/investigate.go @@ -22,6 +22,7 @@ import ( "path/filepath" cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" + "github.com/openshift/configuration-anomaly-detection/pkg/aws" investigations "github.com/openshift/configuration-anomaly-detection/pkg/investigations" "github.com/openshift/configuration-anomaly-detection/pkg/investigations/ccam" investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" @@ -129,18 +130,23 @@ func run(cmd *cobra.Command, _ []string) error { return fmt.Errorf("could not retrieve Cluster Deployment for %s: %w", internalClusterID, err) } - customerAwsClient, err := managedcloud.CreateCustomerAWSClient(cluster, ocmClient) - if err != nil { - ccamResources := &investigation.Resources{Name: "ccam", Cluster: cluster, ClusterDeployment: clusterDeployment, AwsClient: customerAwsClient, OcmClient: ocmClient, PdClient: pdClient, AdditionalResources: map[string]interface{}{"error": err}} - inv := ccam.Investigation{} - result, 
err := inv.Run(ccamResources) - updateMetrics(alertInvestigation.Name(), &result) - return err + var customerAwsClient *aws.SdkClient + if alertInvestigation.RequiresAwsClient() { + customerAwsClient, err = managedcloud.CreateCustomerAWSClient(cluster, ocmClient) // '=' not ':=': assign the outer customerAwsClient instead of shadowing it + if err != nil { + ccamResources := &investigation.Resources{Name: "ccam", Cluster: cluster, ClusterDeployment: clusterDeployment, AwsClient: customerAwsClient, OcmClient: ocmClient, PdClient: pdClient, AdditionalResources: map[string]interface{}{"error": err}} + inv := ccam.Investigation{} + result, err := inv.Run(ccamResources) + updateMetrics(alertInvestigation.Name(), &result) + return err + } + } else { + customerAwsClient = &aws.SdkClient{} } investigationResources := &investigation.Resources{Name: alertInvestigation.Name(), Cluster: cluster, ClusterDeployment: clusterDeployment, AwsClient: customerAwsClient, OcmClient: ocmClient, PdClient: pdClient} - logging.Infof("Starting investigation for %s", alertInvestigation.Name) + logging.Infof("Starting investigation for %s", alertInvestigation.Name()) result, err := alertInvestigation.Run(investigationResources) updateMetrics(alertInvestigation.Name(), &result) return err diff --git a/pkg/investigations/ccam/ccam.go b/pkg/investigations/ccam/ccam.go index d7492c8a..8cab43e4 100644 --- a/pkg/investigations/ccam/ccam.go +++ b/pkg/investigations/ccam/ccam.go @@ -19,6 +19,10 @@ var ccamLimitedSupport = &ocm.LimitedSupportReason{ Details: "Your cluster requires you to take action because Red Hat is not able to access the infrastructure with the provided credentials. Please restore the credentials and permissions provided during install", } +func (c *Investigation) RequiresAwsClient() bool { + return false +} + // Evaluate estimates if the awsError is a cluster credentials are missing error. If it determines that it is, // the cluster is placed into limited support (if the cluster state allows it), otherwise an error is returned.
func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) { diff --git a/pkg/investigations/chgm/chgm.go b/pkg/investigations/chgm/chgm.go index d292e166..c6087798 100644 --- a/pkg/investigations/chgm/chgm.go +++ b/pkg/investigations/chgm/chgm.go @@ -36,10 +36,14 @@ var ( } ) -type Investiation struct{} +type Investigation struct{} + +func (c *Investigation) RequiresAwsClient() bool { + return true +} // Run runs the investigation for a triggered chgm pagerduty event -func (c *Investiation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) { +func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) { result := investigation.InvestigationResult{} notes := notewriter.New("CHGM", logging.RawLogger) @@ -118,19 +122,19 @@ func (c *Investiation) Run(r *investigation.Resources) (investigation.Investigat return result, r.PdClient.EscalateIncidentWithNote(notes.String()) } -func (c *Investiation) Name() string { +func (c *Investigation) Name() string { return "Cluster Has Gone Missing (CHGM)" } -func (c *Investiation) Description() string { +func (c *Investigation) Description() string { return "Detects reason for clusters that have gone missing" } -func (c *Investiation) ShouldInvestigateAlert(alert string) bool { +func (c *Investigation) ShouldInvestigateAlert(alert string) bool { return strings.Contains(alert, "has gone missing") } -func (c *Investiation) IsExperimental() bool { +func (c *Investigation) IsExperimental() bool { return false } diff --git a/pkg/investigations/chgm/chgm_test.go b/pkg/investigations/chgm/chgm_test.go index 76234fee..86b51907 100644 --- a/pkg/investigations/chgm/chgm_test.go +++ b/pkg/investigations/chgm/chgm_test.go @@ -92,7 +92,7 @@ var _ = Describe("chgm", func() { mockCtrl.Finish() }) - inv := Investiation{} + inv := Investigation{} Describe("Triggered", func() { When("Triggered finds instances stopped by the customer", func() { diff 
--git a/pkg/investigations/clustermonitoringerrorbudgetburn/clustermonitoringerrorbudgetburn.go b/pkg/investigations/clustermonitoringerrorbudgetburn/clustermonitoringerrorbudgetburn.go index 233d6708..3310ecb5 100644 --- a/pkg/investigations/clustermonitoringerrorbudgetburn/clustermonitoringerrorbudgetburn.go +++ b/pkg/investigations/clustermonitoringerrorbudgetburn/clustermonitoringerrorbudgetburn.go @@ -27,6 +27,10 @@ var uwmMisconfiguredSL = ocm.ServiceLog{ type Investigation struct{} +func (c *Investigation) RequiresAwsClient() bool { + return false +} + func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) { // Initialize k8s client // This would be better suited to be passend in with the investigation resources diff --git a/pkg/investigations/cpd/cpd.go b/pkg/investigations/cpd/cpd.go index 1b320b2a..1ae573d7 100644 --- a/pkg/investigations/cpd/cpd.go +++ b/pkg/investigations/cpd/cpd.go @@ -14,6 +14,9 @@ import ( type Investigation struct{} +func (c *Investigation) RequiresAwsClient() bool { + return true +} // https://raw.githubusercontent.com/openshift/managed-notifications/master/osd/aws/InstallFailed_NoRouteToInternet.json var byovpcRoutingSL = &ocm.ServiceLog{Severity: "Major", Summary: "Installation blocked: Missing route to internet", Description: "Your cluster's installation is blocked because of the missing route to internet in the route table(s) associated with the supplied subnet(s) for cluster installation. 
Please review and validate the routes by following documentation and re-install the cluster: https://docs.openshift.com/container-platform/latest/installing/installing_aws/installing-aws-vpc.html#installation-custom-aws-vpc-requirements_installing-aws-vpc.", InternalOnly: false, ServiceName: "SREManualAction"} diff --git a/pkg/investigations/investigation/investigation.go b/pkg/investigations/investigation/investigation.go index 0579480d..7a7da32f 100644 --- a/pkg/investigations/investigation/investigation.go +++ b/pkg/investigations/investigation/investigation.go @@ -28,6 +28,7 @@ type Investigation interface { Description() string IsExperimental() bool ShouldInvestigateAlert(string) bool + RequiresAwsClient() bool } // Resources holds all resources/tools required for alert investigations diff --git a/pkg/investigations/registry.go b/pkg/investigations/registry.go index 1fca735c..247c3c04 100644 --- a/pkg/investigations/registry.go +++ b/pkg/investigations/registry.go @@ -6,12 +6,13 @@ import ( "github.com/openshift/configuration-anomaly-detection/pkg/investigations/clustermonitoringerrorbudgetburn" "github.com/openshift/configuration-anomaly-detection/pkg/investigations/cpd" "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" + "github.com/openshift/configuration-anomaly-detection/pkg/logging" ) // availableInvestigations holds all Investigation implementations. 
var availableInvestigations = []investigation.Investigation{ &ccam.Investigation{}, - &chgm.Investiation{}, + &chgm.Investigation{}, &clustermonitoringerrorbudgetburn.Investigation{}, &cpd.Investigation{}, } @@ -26,5 +27,6 @@ func GetInvestigation(title string, experimental bool) investigation.Investigati return inv } } + logging.Debugf("No investigation found for: %s", title) return nil } From 11acd8faf54c092aa97be97582ac61b12c9c9304 Mon Sep 17 00:00:00 2001 From: RaphaelBut Date: Fri, 14 Mar 2025 13:17:23 +0000 Subject: [PATCH 2/5] Improve dev environment setup and add utility to run backplane locally --- .gitignore | 3 ++- README.md | 36 ++++++++++++++++++++++++++---------- test/backplane.sh | 17 +++++++++++++++++ test/set_stage_env.sh | 3 ++- 4 files changed, 47 insertions(+), 12 deletions(-) create mode 100755 test/backplane.sh diff --git a/.gitignore b/.gitignore index 8185a012..dd8aa446 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ dist .envrc .idea .vscode -cad_testing \ No newline at end of file +cad_testing +backplane-api diff --git a/README.md b/README.md index b22e7c1f..fa676493 100644 --- a/README.md +++ b/README.md @@ -90,23 +90,39 @@ They are initialized for you and passed to the investigation via investigation.R ## Testing locally ### Pre-requirements -- an existing cluster -- an existing PagerDuty incident for the cluster and alert type that is being tested +- An existing stage cluster +- A PagerDuty incident -To quickly create an incident for a cluster_id, you can run `./test/generate_incident.sh `. -Example usage:`./test/generate_incident.sh ClusterHasGoneMissing 2b94brrrrrrrrrrrrrrrrrrhkaj`. +```bash +# (Optional) Export your PagerDuty token to automatically retrieve the incident id +export pd_token= +# Generates incident and creates payload file with incident ID +./test/generate_incident.sh +``` -### Running cadctl for an incident ID -1) Export the required ENV variables, see [required ENV variables](#required-env-variables).
-2) Create a payload file containing the incident ID +If you are not using pd_token, create the payload file with the incidentID manually ```bash export INCIDENT_ID= echo '{"__pd_metadata":{"incident":{"id":"'${INCIDENT_ID}'"}}}' > ./payload ``` + +### Running cadctl + +1) Run backplane-api locally in a second terminal (requires being logged into ocm) + + ``` + ./test/backplane.sh + ``` + + > If there is an issue with this step, comment out the `BACKPLANE_URL` env in `set_stage_env.sh`. You will then run against stage backplane, meaning backplane won't be able to see any local changes to metadata files; expect errors like `file not found`. +2) Export the required ENV variables, see [required ENV variables](#required-env-variables). + ``` + source test/set_stage_env.sh + ``` 3) Run `cadctl` using the payload file - ```bash - ./bin/cadctl investigate --payload-path payload - ``` + ```bash + ./bin/cadctl investigate --payload-path payload + ``` ### Logging levels diff --git a/test/backplane.sh b/test/backplane.sh new file mode 100755 index 00000000..47ffa521 --- /dev/null +++ b/test/backplane.sh @@ -0,0 +1,17 @@ +#!/bin/bash +set -euo pipefail + +# clone +git -C backplane-api pull || git clone --depth 1 --branch master git@gitlab.cee.redhat.com:service/backplane-api.git +# build +cd backplane-api +make build +# setup, this does not look too good :D +sudo make dev-certs +sudo chmod 644 localhost.key +# setup ocm config +cp $HOME/.config/ocm/ocm.json configs/ocm.json +# run, in background? second terminal ?
+RUN_ARGS=--cloud-config=./configs/cloud-config.yml make run-local-with-testremediation GIT_REPO="../" + + diff --git a/test/set_stage_env.sh b/test/set_stage_env.sh index b4c8cb63..70e4ec20 100755 --- a/test/set_stage_env.sh +++ b/test/set_stage_env.sh @@ -10,6 +10,7 @@ for v in $(vault kv get -format=json osd-sre/configuration-anomaly-detection/pd unset VAULT_ADDR VAULT_TOKEN export CAD_EXPERIMENTAL_ENABLED=true -export BACKPLANE_PROXY=http://squid.corp.redhat.com:3128 +# export BACKPLANE_PROXY=http://squid.corp.redhat.com:3128 +export BACKPLANE_URL=https://localhost:8001 set +euo pipefail From b707bacebf3dd0defff905bc23601abc221a76a2 Mon Sep 17 00:00:00 2001 From: RaphaelBut Date: Fri, 14 Mar 2025 15:37:50 +0000 Subject: [PATCH 3/5] rebase --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index fa676493..7daad2f4 100644 --- a/README.md +++ b/README.md @@ -124,6 +124,8 @@ If you are not using pd_token, create the payload file with the incidentID manua ./bin/cadctl investigate --payload-path payload ``` + > If you are testing a new investigation using k8sclient, you need to run backplane locally and the metadata file needs to be temporarily committed to main. + ### Logging levels CAD allows for different logging levels (debug, info, warn, error, fatal, panic).
The log level is determind through a hierarchy, where the cli flag `log-level` From c586d759d82a605c3ecb25d1a2a274ea2e460128 Mon Sep 17 00:00:00 2001 From: RaphaelBut Date: Fri, 14 Mar 2025 15:58:09 +0000 Subject: [PATCH 4/5] format --- pkg/investigations/chgm/chgm.go | 2 +- pkg/investigations/cpd/cpd.go | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/investigations/chgm/chgm.go b/pkg/investigations/chgm/chgm.go index c6087798..0af732ea 100644 --- a/pkg/investigations/chgm/chgm.go +++ b/pkg/investigations/chgm/chgm.go @@ -38,10 +38,10 @@ var ( type Investigation struct{} - func (c *Investigation) RequiresAwsClient() bool { return true } + // Run runs the investigation for a triggered chgm pagerduty event func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) { result := investigation.InvestigationResult{} diff --git a/pkg/investigations/cpd/cpd.go b/pkg/investigations/cpd/cpd.go index 1ae573d7..7076ca9a 100644 --- a/pkg/investigations/cpd/cpd.go +++ b/pkg/investigations/cpd/cpd.go @@ -17,6 +17,7 @@ type Investigation struct{} func (c *Investigation) RequiresAwsClient() bool { return true } + // https://raw.githubusercontent.com/openshift/managed-notifications/master/osd/aws/InstallFailed_NoRouteToInternet.json var byovpcRoutingSL = &ocm.ServiceLog{Severity: "Major", Summary: "Installation blocked: Missing route to internet", Description: "Your cluster's installation is blocked because of the missing route to internet in the route table(s) associated with the supplied subnet(s) for cluster installation. 
Please review and validate the routes by following documentation and re-install the cluster: https://docs.openshift.com/container-platform/latest/installing/installing_aws/installing-aws-vpc.html#installation-custom-aws-vpc-requirements_installing-aws-vpc.", InternalOnly: false, ServiceName: "SREManualAction"} From 019cc6026a0369bbe38fb015c58abb2eb9797968 Mon Sep 17 00:00:00 2001 From: RaphaelBut Date: Fri, 14 Mar 2025 15:58:56 +0000 Subject: [PATCH 5/5] lint: len() for nil slices is defined as zero --- pkg/aws/aws.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/aws/aws.go b/pkg/aws/aws.go index 1afb0cc8..22e3fdd5 100644 --- a/pkg/aws/aws.go +++ b/pkg/aws/aws.go @@ -625,7 +625,7 @@ func eventContainsInstances(instances []ec2v2types.Instance, event cloudtrailv2t func getTime(rawReason string) (time.Time, error) { subMatches := stopInstanceDateRegex.FindStringSubmatch(rawReason) - if subMatches == nil || len(subMatches) < 2 { + if len(subMatches) < 2 { return time.Time{}, fmt.Errorf("did not find matches: raw data %s", rawReason) } if len(subMatches) != 2 {