Skip to content

Commit

Permalink
Addressing Leonid's comments
Browse files Browse the repository at this point in the history
Signed-off-by: Lazar Cvetković <[email protected]>
  • Loading branch information
cvetkovic committed Oct 28, 2024
1 parent 5381a8d commit deb52e7
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 51 deletions.
3 changes: 3 additions & 0 deletions pkg/driver/failure/knative_delete_control_plane.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,7 @@ kubectl delete pod $(kubectl get pods -n kube-system -o name | cut -c 5- | grep
# kube-scheduler
kubectl delete pod $(kubectl get pods -n kube-system -o name | cut -c 5- | grep kube-scheduler | tail -n 1) -n kube-system &

# istiod
kubectl delete pod $(kubectl get pods -n istio-system -o name | grep istiod | cut -c 5- | tail -n 1) -n istio-system &

# TODO: automatically detect the current leader of each component instead of killing an arbitrarily chosen instance
3 changes: 0 additions & 3 deletions pkg/driver/failure/knative_delete_data_plane.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,3 @@ kubectl delete pod $(kubectl get pods -n istio-system -o name | grep cluster-loc

# istio-ingressgateway
kubectl delete pod $(kubectl get pods -n istio-system -o name | grep istio-ingressgateway | cut -c 5- | tail -n 1) -n istio-system &

# istiod
kubectl delete pod $(kubectl get pods -n istio-system -o name | grep istiod | cut -c 5- | tail -n 1) -n istio-system &
90 changes: 42 additions & 48 deletions pkg/driver/failure/triggers.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,98 +9,92 @@ import (
"time"
)

const NodeSeparator = " "
const (
// NodeSeparator is the delimiter placed between entries of a node-list
// string; the list is split on it to obtain the individual nodes.
NodeSeparator = " "

// ControlPlaneFailure, DataPlaneFailure and WorkerNodeFailure are the
// component identifiers matched by the failure triggers' switch statements
// to choose which command to run.
ControlPlaneFailure = "control_plane"
DataPlaneFailure = "data_plane"
WorkerNodeFailure = "worker_node"
)

func ScheduleFailure(platform string, config *config.FailureConfiguration) {
if config != nil && config.FailAt != 0 && config.FailComponent != "" {
time.Sleep(time.Duration(config.FailAt) * time.Second)

switch platform {
case "Knative", "Knative-RPS":
triggerKnativeFailure(config.FailNode, config.FailComponent, config.FailAt)
triggerKnativeFailure(config.FailNode, config.FailComponent)
case "Dirigent", "Dirigent-RPS":
triggerDirigentFailure(config.FailNode, config.FailComponent, config.FailAt)
triggerDirigentFailure(config.FailNode, config.FailComponent)
default:
logrus.Errorf("No specified failure handler for given type of system.")
}
}
}

func triggerKnativeFailure(nodes string, component string, t int) {
time.Sleep(time.Duration(t) * time.Second)
// invokeRemotely runs command over SSH on every host named in nodes and
// blocks until all of the remote invocations have returned.
//
// nodes is a NodeSeparator-delimited list of SSH destinations. Entries that
// are empty or whitespace-only (the result of an empty list or of repeated
// separators) are skipped, so ssh is never started without a destination; if
// no usable entry remains the function returns immediately. Hosts are
// contacted concurrently, one goroutine per host, and each invocation is
// delegated to invokeLocally.
func invokeRemotely(command []string, nodes string) {
	wg := &sync.WaitGroup{}

	for _, node := range strings.Split(nodes, NodeSeparator) {
		node = strings.TrimSpace(node)
		if node == "" {
			// strings.Split yields "" for an empty list and between
			// adjacent separators; there is no host to connect to.
			continue
		}

		wg.Add(1)

		go func(command []string, node string) {
			defer wg.Done()

			// Build a fresh slice per host so the goroutines never share a
			// backing array with each other or with the caller's command.
			finalCommand := make([]string, 0, len(command)+4)
			finalCommand = append(finalCommand, "ssh", "-o", "StrictHostKeyChecking=no", node)
			finalCommand = append(finalCommand, command...)
			invokeLocally(finalCommand)
		}(command, node)
	}

	wg.Wait()
}

func triggerKnativeFailure(nodes string, component string) {
var command []string
switch component {
case "control_plane":
case ControlPlaneFailure:
command = []string{"bash", "./pkg/driver/failure/knative_delete_control_plane.sh"}
case "data_plane":
case DataPlaneFailure:
command = []string{"bash", "./pkg/driver/failure/knative_delete_data_plane.sh"}
case "worker_node":
case WorkerNodeFailure:
command = []string{"sudo", "systemctl", "restart", "kubelet"}
default:
logrus.Fatal("Invalid component to fail.")
}

if component != "worker_node" {
invokeCommand(command, t)
invokeLocally(command)
} else {
splitNodes := strings.Split(nodes, NodeSeparator)
wg := &sync.WaitGroup{}

for _, node := range splitNodes {
wg.Add(1)

go func(command []string, node string, t int) {
defer wg.Done()

finalCommand := append([]string{"ssh", "-o", "StrictHostKeyChecking=no", node}, command...)
invokeCommand(finalCommand, t)
}(command, node, t)
}

wg.Wait()
invokeRemotely(command, nodes)
}
}

func triggerDirigentFailure(nodes string, component string, t int) {
time.Sleep(time.Duration(t) * time.Second)

func triggerDirigentFailure(nodes string, component string) {
var command []string
switch component {
case "control_plane":
case ControlPlaneFailure:
command = []string{"sudo", "systemctl", "restart", "control_plane"}
case "data_plane":
case DataPlaneFailure:
command = []string{"sudo", "systemctl", "restart", "data_plane"}
case "worker_node":
case WorkerNodeFailure:
command = []string{"sudo", "systemctl", "restart", "worker_node"}
default:
logrus.Fatal("Invalid component to fail.")
}

if nodes == "" {
invokeCommand(command, t)
invokeLocally(command)
} else {
splitNodes := strings.Split(nodes, " ")
wg := &sync.WaitGroup{}

for _, node := range splitNodes {
wg.Add(1)

go func(command []string, node string, t int) {
defer wg.Done()

finalCommand := append([]string{"ssh", "-o", "StrictHostKeyChecking=no", node}, command...)
invokeCommand(finalCommand, t)
}(command, node, t)
}

wg.Wait()
invokeRemotely(command, nodes)
}
}

func invokeCommand(command []string, t int) {
func invokeLocally(command []string) {
cmd := exec.Command(command[0], command[1:]...)
output, err := cmd.CombinedOutput()
if err != nil {
logrus.Errorf("Error triggering %s failure at t = %d - %v", command, t, err)
logrus.Errorf("Error triggering %s failure - %v", command, err)
return
}

Expand Down

0 comments on commit deb52e7

Please sign in to comment.