Skip to content

Commit

Permalink
Merge pull request #511 from mjlshen/OSD-20879
Browse files Browse the repository at this point in the history
Add osdctl command to remediate OCPBUGS-23174
  • Loading branch information
openshift-merge-bot[bot] authored Feb 12, 2024
2 parents 845f1e8 + ca2f6b5 commit 8b9662c
Show file tree
Hide file tree
Showing 6 changed files with 329 additions and 10 deletions.
181 changes: 181 additions & 0 deletions cmd/cluster/cleanup_leaked_ec2.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
package cluster

import (
"context"
"fmt"
"log"

"github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/service/ec2"
"github.com/aws/aws-sdk-go-v2/service/ec2/types"
cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
"github.com/openshift/osdctl/pkg/k8s"
"github.com/openshift/osdctl/pkg/osdCloud"
"github.com/openshift/osdctl/pkg/utils"
"github.com/spf13/cobra"
"k8s.io/apimachinery/pkg/runtime"
capav1beta2 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// cleanup carries the flag values and the resolved clients/cluster records
// used by the cleanup-leaked-ec2 command.
type cleanup struct {
// awsClient is the EC2 client used to describe and terminate instances.
awsClient cleanupAWSClient
// client is the controller-runtime client used to list AWSMachine objects.
client client.Client
// cluster is the OCM record for the cluster identified by ClusterId.
cluster *cmv1.Cluster
// mgmtCluster is the OCM record returned by utils.GetManagementCluster for cluster.
mgmtCluster *cmv1.Cluster

// ClusterId is the internal or external OCM cluster ID.
// It is bound to the --cluster-id flag, which the command marks as required.
ClusterId string
// Yes skips the confirmation prompt before terminating instances (bound to --yes).
Yes bool
}

// cleanupAWSClient is the subset of EC2 operations this command calls.
// New assigns the client built by ec2.NewFromConfig to it; keeping the
// interface this small lets tests substitute a mock.
type cleanupAWSClient interface {
// DescribeInstances is used to find the EC2 instances tagged as owned by the cluster.
DescribeInstances(ctx context.Context, params *ec2.DescribeInstancesInput, optFns ...func(options *ec2.Options)) (*ec2.DescribeInstancesOutput, error)
// TerminateInstances is used to terminate the instances identified as leaked.
TerminateInstances(ctx context.Context, params *ec2.TerminateInstancesInput, optFns ...func(options *ec2.Options)) (*ec2.TerminateInstancesOutput, error)
}

// newCmdCleanupLeakedEC2 builds the "cleanup-leaked-ec2" cobra command.
//
// The command requires --cluster-id and accepts an optional --yes flag that
// skips the confirmation prompt before instances are terminated.
func newCmdCleanupLeakedEC2() *cobra.Command {
	c := &cleanup{}

	cleanupCmd := &cobra.Command{
		Use:   "cleanup-leaked-ec2",
		Short: "Remediate impact of https://issues.redhat.com/browse/OCPBUGS-23174",
		Example: `
# Run against a given ROSA HCP cluster
osdctl cluster cleanup-leaked-ec2 --cluster-id ${CLUSTER_ID}
# Assess all "error" state ROSA HCP clusters for impact
for cluster in $(ocm list cluster -p search="hypershift.enabled='true' and state='error'" --columns='id' --no-headers);
do
osdctl cluster cleanup-leaked-ec2 --cluster-id ${cluster}
done
`,
		RunE: func(cmd *cobra.Command, args []string) error {
			// Prefer the command's own context so cancellation set up by the
			// root command propagates; fall back when none was attached.
			ctx := cmd.Context()
			if ctx == nil {
				ctx = context.Background()
			}
			return c.Run(ctx)
		},
	}

	cleanupCmd.Flags().StringVarP(&c.ClusterId, "cluster-id", "C", "", "OCM internal/external cluster id to check for impact of OCPBUGS-23174.")
	cleanupCmd.Flags().BoolVarP(&c.Yes, "yes", "y", false, "(optional) Skip confirmation prompt when terminating instances")

	// MarkFlagRequired only fails when the named flag does not exist, and
	// "cluster-id" is registered just above, so the error is deliberately ignored.
	_ = cleanupCmd.MarkFlagRequired("cluster-id")

	return cleanupCmd
}

// New resolves everything the remediation needs and stores it on c:
// the OCM cluster record, an EC2 client, the management cluster record,
// and a Kubernetes client for the management cluster whose scheme knows
// the CAPA v1beta2 types.
//
// It returns an error if any lookup fails or if the cluster is not a
// Hypershift (ROSA HCP) cluster.
func (c *cleanup) New(ctx context.Context) error {
	log.Printf("searching OCM for cluster: %s", c.ClusterId)
	conn, err := utils.CreateConnection()
	if err != nil {
		return err
	}
	defer conn.Close()

	cluster, err := utils.GetClusterAnyStatus(conn, c.ClusterId)
	if err != nil {
		return fmt.Errorf("failed to get OCM cluster info for %s: %w", c.ClusterId, err)
	}
	c.cluster = cluster
	log.Printf("cluster %s found from OCM: %s", c.ClusterId, cluster.ID())

	if !cluster.Hypershift().Enabled() {
		return fmt.Errorf("this command is only meant for ROSA HCP clusters")
	}

	log.Printf("getting AWS credentials from backplane-api")
	cfg, err := osdCloud.CreateAWSV2Config(conn, c.cluster)
	if err != nil {
		return fmt.Errorf("failed to get credentials automatically from backplane-api: %w", err)
	}
	// Log only the message; the context value does not belong in the log line.
	log.Println("retrieved AWS credentials from backplane-api")
	c.awsClient = ec2.NewFromConfig(cfg)

	mgmtCluster, err := utils.GetManagementCluster(c.cluster.ID())
	if err != nil {
		return err
	}
	c.mgmtCluster = mgmtCluster

	// AWSMachine objects can only be listed if their types are registered in the scheme.
	scheme := runtime.NewScheme()
	if err := capav1beta2.AddToScheme(scheme); err != nil {
		return err
	}

	// Named kubeClient so the local does not shadow the imported client package.
	kubeClient, err := k8s.New(c.mgmtCluster.ID(), client.Options{Scheme: scheme})
	if err != nil {
		return err
	}
	c.client = kubeClient

	return nil
}

// Run is the command entry point: it prepares the clients and cluster
// records through New and, if that succeeds, performs the remediation.
// The first error encountered is returned unchanged.
func (c *cleanup) Run(ctx context.Context) error {
	err := c.New(ctx)
	if err == nil {
		err = c.RemediateOCPBUGS23174(ctx)
	}
	return err
}

// RemediateOCPBUGS23174 compares the AWSMachine objects labelled for the
// cluster with the EC2 instances tagged as owned by the cluster, and
// terminates every EC2 instance that has no matching AWSMachine.
//
// Termination happens only when c.Yes is set or the user confirms the
// prompt. It returns an error if listing AWSMachines, describing instances,
// or terminating instances fails; declining the prompt is not an error.
func (c *cleanup) RemediateOCPBUGS23174(ctx context.Context) error {
	awsmachines := &capav1beta2.AWSMachineList{}
	if err := c.client.List(ctx, awsmachines, client.MatchingLabels{
		"cluster.x-k8s.io/cluster-name": c.cluster.ID(),
	}); err != nil {
		return err
	}

	expectedInstances := make(map[string]bool, len(awsmachines.Items))
	for i := range awsmachines.Items {
		// Spec.InstanceID is a pointer; an AWSMachine without one has no
		// instance to account for, so skip it rather than dereference nil.
		if id := awsmachines.Items[i].Spec.InstanceID; id != nil {
			expectedInstances[*id] = true
		}
	}
	log.Printf("expected instances: %v", expectedInstances)

	input := &ec2.DescribeInstancesInput{
		Filters: []types.Filter{
			{
				Name:   aws.String("tag:red-hat-managed"),
				Values: []string{"true"},
			},
			{
				Name:   aws.String(fmt.Sprintf("tag:sigs.k8s.io/cluster-api-provider-aws/cluster/%s", c.cluster.ID())),
				Values: []string{"owned"},
			},
		},
	}

	var leakedInstances []string
	// DescribeInstances is paginated: keep requesting pages until no
	// NextToken is returned so leaked instances on later pages are not missed.
	for {
		resp, err := c.awsClient.DescribeInstances(ctx, input)
		if err != nil {
			return fmt.Errorf("failed to find EC2 instances associated with %s: %w", c.cluster.ID(), err)
		}
		if resp == nil {
			break
		}

		for _, reservation := range resp.Reservations {
			for _, instance := range reservation.Instances {
				if instance.InstanceId == nil {
					continue
				}
				if !expectedInstances[*instance.InstanceId] {
					leakedInstances = append(leakedInstances, *instance.InstanceId)
				}
			}
		}

		if resp.NextToken == nil || *resp.NextToken == "" {
			break
		}
		input.NextToken = resp.NextToken
	}

	if len(leakedInstances) == 0 {
		log.Println("found 0 leaked instances")
		return nil
	}

	log.Printf("terminating %d leaked instances: %v", len(leakedInstances), leakedInstances)
	if !c.Yes && !utils.ConfirmPrompt() {
		// Declining must not be reported as "0 leaked instances".
		log.Printf("termination not confirmed - leaving %d leaked instances untouched", len(leakedInstances))
		return nil
	}

	if _, err := c.awsClient.TerminateInstances(ctx, &ec2.TerminateInstancesInput{
		InstanceIds: leakedInstances,
	}); err != nil {
		return fmt.Errorf("failed to automatically cleanup EC2 instances: %w", err)
	}

	log.Printf("success - the cluster should be uninstalled soon")
	return nil
}
115 changes: 115 additions & 0 deletions cmd/cluster/cleanup_leaked_ec2_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
package cluster

import (
"context"
"testing"

"github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/service/ec2"
"github.com/aws/aws-sdk-go-v2/service/ec2/types"
cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
capav1beta2 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

// mockCleanupAWSClient is a test double for cleanupAWSClient that returns
// canned responses and never returns an error.
type mockCleanupAWSClient struct {
// describeInstancesResp is returned from every DescribeInstances call.
describeInstancesResp *ec2.DescribeInstancesOutput
// terminateInstancesResp is returned from every TerminateInstances call.
terminateInstancesResp *ec2.TerminateInstancesOutput
}

// DescribeInstances ignores its arguments and hands back the canned
// describeInstancesResp together with a nil error.
func (m mockCleanupAWSClient) DescribeInstances(_ context.Context, _ *ec2.DescribeInstancesInput, _ ...func(*ec2.Options)) (*ec2.DescribeInstancesOutput, error) {
	canned := m.describeInstancesResp
	return canned, nil
}

// TerminateInstances ignores its arguments and hands back the canned
// terminateInstancesResp together with a nil error.
func (m mockCleanupAWSClient) TerminateInstances(_ context.Context, _ *ec2.TerminateInstancesInput, _ ...func(*ec2.Options)) (*ec2.TerminateInstancesOutput, error) {
	canned := m.terminateInstancesResp
	return canned, nil
}

// newTestCluster builds a *cmv1.Cluster from cb and fails the test
// immediately if the build returns an error, so test cases can construct
// clusters inline without handling the error themselves.
func newTestCluster(t *testing.T, cb *cmv1.ClusterBuilder) *cmv1.Cluster {
	// Attribute a failure to the calling test case rather than this helper.
	t.Helper()

	cluster, err := cb.Build()
	if err != nil {
		t.Fatalf("failed to build cluster: %s", err)
	}

	return cluster
}

// Test_cleanup_RemediateOCPBUGS23174 runs RemediateOCPBUGS23174 against a
// mocked EC2 client and a fake Kubernetes client, covering the case where
// every EC2 instance has a matching AWSMachine and the case where an
// instance has none.
func Test_cleanup_RemediateOCPBUGS23174(t *testing.T) {
	scheme := runtime.NewScheme()
	if err := capav1beta2.AddToScheme(scheme); err != nil {
		t.Fatal(err)
	}

	tests := []struct {
		name      string
		c         *cleanup
		expectErr bool
	}{
		{
			name: "awsmachines match EC2 instances",
			c: &cleanup{
				awsClient: mockCleanupAWSClient{
					describeInstancesResp: &ec2.DescribeInstancesOutput{
						Reservations: []types.Reservation{
							{
								Instances: []types.Instance{
									{
										InstanceId: aws.String("i-0123456789"),
									},
								},
							},
						},
					},
				},
				client: fake.NewClientBuilder().WithScheme(scheme).WithObjects(&capav1beta2.AWSMachine{
					ObjectMeta: metav1.ObjectMeta{
						Labels: map[string]string{
							"cluster.x-k8s.io/cluster-name": "0123456789",
						},
					},
					Spec: capav1beta2.AWSMachineSpec{InstanceID: aws.String("i-0123456789")},
				}).Build(),
				cluster: newTestCluster(t, cmv1.NewCluster().ID("0123456789")),
			},
		},
		{
			name: "leaked EC2 instances",
			c: &cleanup{
				awsClient: mockCleanupAWSClient{
					describeInstancesResp: &ec2.DescribeInstancesOutput{
						Reservations: []types.Reservation{
							{
								Instances: []types.Instance{
									{
										InstanceId: aws.String("i-0123456789"),
									},
								},
							},
						},
					},
				},
				client:  fake.NewClientBuilder().WithScheme(scheme).Build(),
				cluster: newTestCluster(t, cmv1.NewCluster().ID("0123456789")),
				// Yes avoids blocking on the interactive confirmation prompt.
				Yes: true,
			},
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			err := test.c.RemediateOCPBUGS23174(context.Background())

			// Only a mismatch between the outcome and the expectation is a
			// failure; an expected error that did occur must pass.
			switch {
			case err != nil && !test.expectErr:
				t.Errorf("expected no err, got %v", err)
			case err == nil && test.expectErr:
				t.Errorf("expected err, got nil")
			}
		})
	}
}
1 change: 1 addition & 0 deletions cmd/cluster/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ func NewCmdCluster(streams genericclioptions.IOStreams, client *k8s.LazyClient,
clusterCmd.AddCommand(NewCmdHypershiftInfo(streams))
clusterCmd.AddCommand(newCmdOrgId())
clusterCmd.AddCommand(newCmdDynatraceURL())
clusterCmd.AddCommand(newCmdCleanupLeakedEC2())
return clusterCmd
}

Expand Down
11 changes: 8 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ require (
github.com/shopspring/decimal v1.3.1
github.com/sirupsen/logrus v1.9.3
github.com/spf13/cobra v1.8.0
github.com/spf13/pflag v1.0.5
github.com/spf13/pflag v1.0.6-0.20210604193023-d5e0c0615ace
github.com/spf13/viper v1.18.2
go.uber.org/mock v0.3.0
golang.org/x/sync v0.5.0
Expand All @@ -58,6 +58,7 @@ require (
k8s.io/client-go v0.28.4
k8s.io/kubectl v0.28.4
k8s.io/utils v0.0.0-20230726121419-3b25d923346b
sigs.k8s.io/cluster-api-provider-aws/v2 v2.3.1
sigs.k8s.io/controller-runtime v0.16.3
sigs.k8s.io/yaml v1.4.0
)
Expand All @@ -83,6 +84,8 @@ require (
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.21.7 // indirect
github.com/aymerick/douceur v0.2.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/blang/semver v3.5.1+incompatible // indirect
github.com/blang/semver/v4 v4.0.0 // indirect
github.com/cenkalti/backoff/v4 v4.2.1 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/chai2010/gettext-go v1.0.2 // indirect
Expand Down Expand Up @@ -118,7 +121,7 @@ require (
github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect
github.com/googleapis/gax-go/v2 v2.12.0 // indirect
github.com/gorilla/css v1.0.0 // indirect
github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7 // indirect
github.com/gregjones/httpcache v0.0.0-20190212212710-3befbb6ad0cc // indirect
github.com/hashicorp/go-version v1.6.0 // indirect
github.com/hashicorp/hcl v1.0.0 // indirect
github.com/imdario/mergo v0.3.15 // indirect
Expand All @@ -133,7 +136,7 @@ require (
github.com/mailru/easyjson v0.7.7 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/mattn/go-runewidth v0.0.9 // indirect
github.com/mattn/go-runewidth v0.0.14 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
github.com/mgutz/ansi v0.0.0-20200706080929-d51e80ef957d // indirect
github.com/microcosm-cc/bluemonday v1.0.23 // indirect
Expand All @@ -159,6 +162,7 @@ require (
github.com/prometheus/client_model v0.5.0 // indirect
github.com/prometheus/common v0.44.0 // indirect
github.com/prometheus/procfs v0.11.1 // indirect
github.com/rivo/uniseg v0.4.2 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/sagikazarmark/locafero v0.4.0 // indirect
github.com/sagikazarmark/slog-shim v0.1.0 // indirect
Expand Down Expand Up @@ -196,6 +200,7 @@ require (
k8s.io/component-base v0.28.4 // indirect
k8s.io/klog/v2 v2.100.1 // indirect
k8s.io/kube-openapi v0.0.0-20231113174909-778a5567bc1e // indirect
sigs.k8s.io/cluster-api v1.5.3 // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/kustomize/api v0.15.0 // indirect
sigs.k8s.io/kustomize/kyaml v0.15.0 // indirect
Expand Down
Loading

0 comments on commit 8b9662c

Please sign in to comment.