From 8b7b8f65685ee306ae0e611ddb1d9ef4d7276880 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lazar=20Cvetkovi=C4=87?= Date: Thu, 21 Mar 2024 16:53:35 +0100 Subject: [PATCH 1/8] Rewrite YAMLs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lazar Cvetković --- scripts/setup/create_multinode.sh | 16 ++++++++-------- scripts/setup/rewrite_yaml_files.sh | 12 ------------ 2 files changed, 8 insertions(+), 20 deletions(-) diff --git a/scripts/setup/create_multinode.sh b/scripts/setup/create_multinode.sh index 70324384f..1d7fde802 100755 --- a/scripts/setup/create_multinode.sh +++ b/scripts/setup/create_multinode.sh @@ -72,7 +72,7 @@ common_init() { server_exec $1 'sudo chronyc tracking' clone_loader $1 - server_exec $1 '~/loader/scripts/setup/stabilize.sh' + #server_exec $1 '~/loader/scripts/setup/stabilize.sh' } for node in "$@" @@ -137,15 +137,15 @@ function setup_workers() { # Stretch the capacity of the worker node to 240 (k8s default: 110) # Empirically, this gives us a max. #pods being 240-40=200 - echo "Stretching node capacity for $node." - server_exec $node "echo \"maxPods: ${PODS_PER_NODE}\" > >(sudo tee -a /var/lib/kubelet/config.yaml >/dev/null)" - server_exec $node "echo \"containerLogMaxSize: 512Mi\" > >(sudo tee -a /var/lib/kubelet/config.yaml >/dev/null)" - server_exec $node 'sudo systemctl restart kubelet' - server_exec $node 'sleep 10' + #echo "Stretching node capacity for $node." + #server_exec $node "echo \"maxPods: ${PODS_PER_NODE}\" > >(sudo tee -a /var/lib/kubelet/config.yaml >/dev/null)" + #server_exec $node "echo \"containerLogMaxSize: 512Mi\" > >(sudo tee -a /var/lib/kubelet/config.yaml >/dev/null)" + #server_exec $node 'sudo systemctl restart kubelet' + #server_exec $node 'sleep 10' # Rejoin has to be performed although errors will be thrown. Otherwise, restarting the kubelet will cause the node unreachable for some reason - server_exec $node "sudo ${LOGIN_TOKEN} > /dev/null 2>&1" - echo "Worker node $node joined the cluster (again :P)." + #server_exec $node "sudo ${LOGIN_TOKEN} > /dev/null 2>&1" + #echo "Worker node $node joined the cluster (again :P)." } for node in "$@" diff --git a/scripts/setup/rewrite_yaml_files.sh b/scripts/setup/rewrite_yaml_files.sh index 5b0b884c3..22d32178f 100755 --- a/scripts/setup/rewrite_yaml_files.sh +++ b/scripts/setup/rewrite_yaml_files.sh @@ -46,18 +46,6 @@ sed -e '$d' > net-istio-yq.yaml # serving-core.yaml cat serving-core.yaml | - yq ' - ( - select - ( - .spec.template.metadata.labels.app == "activator" - or .spec.template.metadata.labels.app == "autoscaler" - or .spec.template.metadata.labels.app == "controller" - or .spec.template.metadata.labels.app == "domain-mapping" - or .spec.template.metadata.labels.app == "domainmapping-webhook" - or .spec.template.metadata.labels.app == "webhook" - ) | .spec.template.spec - ) += {"nodeSelector": {"loader-nodetype": "master"}}' | yq ' ( del From a459835a0b0580dd91df763e2dc9438691298bbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lazar=20Cvetkovi=C4=87?= Date: Fri, 22 Mar 2024 00:48:13 +0100 Subject: [PATCH 2/8] Setup config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lazar Cvetković --- scripts/create_dirigent_config.py | 23 +++++++++++++++++++ scripts/setup/setup.cfg | 2 +- .../trigger_failures/delete_control_plane.sh | 8 +++++++ scripts/trigger_failures/delete_data_plane.sh | 8 +++++++ 4 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 scripts/create_dirigent_config.py create mode 100644 scripts/trigger_failures/delete_control_plane.sh create mode 100644 scripts/trigger_failures/delete_data_plane.sh diff --git a/scripts/create_dirigent_config.py b/scripts/create_dirigent_config.py new file mode 100644 index 000000000..1a9c17098 --- /dev/null +++ b/scripts/create_dirigent_config.py @@ -0,0 +1,23 @@ +import os +import pandas as pd + +path = '/home/lcvetkovic/Downloads/day6_hour8_samples/samples' + +for files in os.listdir(path): + df = pd.read_csv(f'{path}/{files}/invocations.csv') + dirigent = pd.DataFrame() + + for index, row in df.iterrows(): + function = row['HashFunction'] + + dirigent = dirigent.append( { + 'HashFunction': function, + 'Image': 'docker.io/cvetkovic/dirigent_trace_function:latest', + 'Port': 80, + 'Protocol': 'tcp', + 'ScalingUpperBound': 10000, + 'ScalingLowerBound': 0, + 'IterationMultiplier': 155, + }, ignore_index=True) + + dirigent.to_csv(f'{path}/{files}/dirigent.csv', index=False) \ No newline at end of file diff --git a/scripts/setup/setup.cfg b/scripts/setup/setup.cfg index 82729de11..83665e5c6 100755 --- a/scripts/setup/setup.cfg +++ b/scripts/setup/setup.cfg @@ -1,5 +1,5 @@ VHIVE_BRANCH='v1.7' -LOADER_BRANCH='main' +LOADER_BRANCH='remove_master_scheduling' CLUSTER_MODE='container' # choose from {container, firecracker, firecracker_snapshots} PODS_PER_NODE=240 DEPLOY_PROMETHEUS=false diff --git a/scripts/trigger_failures/delete_control_plane.sh b/scripts/trigger_failures/delete_control_plane.sh new file mode 100644 index 000000000..f5ae79d96 --- /dev/null +++ b/scripts/trigger_failures/delete_control_plane.sh @@ -0,0 +1,8 @@ +for s in $(kubectl get pods -n knative-serving -o name | cut -c 5- | grep -v activator); +do + kubectl delete pod $s -n knative-serving & +done +for s in $(kubectl get pods -n kube-system -o name | cut -c 5-); +do + kubectl delete pod $s -n kube-system & +done diff --git a/scripts/trigger_failures/delete_data_plane.sh b/scripts/trigger_failures/delete_data_plane.sh new file mode 100644 index 000000000..a2d15b5e1 --- /dev/null +++ b/scripts/trigger_failures/delete_data_plane.sh @@ -0,0 +1,8 @@ +for s in $(kubectl get pods -n knative-serving -o name | cut -c 5- | grep activator); +do + kubectl delete pod $s -n knative-serving & +done +for s in $(kubectl get pods -n istio-system -o name | cut -c 5- ); +do + kubectl delete pod $s -n istio-system & +done From b487d89aeeedb2f96b8b3037e4c01234bcb238b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lazar=20Cvetkovi=C4=87?= Date: Mon, 19 Feb 2024 14:12:25 +0100 Subject: [PATCH 3/8] K8s high-availability mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lazar Cvetković --- config/kube.json | 18 ++++++---- scripts/setup/create_multinode.sh | 60 ++++++++++++++++++++++++++----- scripts/setup/setup.cfg | 7 ++-- 3 files changed, 68 insertions(+), 17 deletions(-) diff --git a/config/kube.json b/config/kube.json index 15b367ca9..7f1c5ec96 100644 --- a/config/kube.json +++ b/config/kube.json @@ -1,10 +1,14 @@ { "K8sVersion": "1.29.1", - "AlternativeImageRepo": "", - "ApiserverAdvertiseAddress": "", - "PodNetworkCidr": "10.168.0.0/16", - "ApiserverPort": "6443", - "ApiserverToken": "", - "ApiserverTokenHash": "", - "CalicoVersion": "3.27.2" + "CalicoVersion": "3.27.2", + "AlternativeImageRepo": "", + "ApiserverAdvertiseAddress": "", + "PodNetworkCidr": "10.168.0.0/16", + "PodNetworkAddonConfigURL": "https://raw.githubusercontent.com/vhive-serverless/vHive/main/configs/calico/canal.yaml", + "ApiserverPort": "6443", + "ApiserverToken": "", + "ApiserverDiscoveryToken": "", + "ApiserverCertificateKey": "", + "CPHAEndpoint": "10.0.1.254", + "CPHAPort": "8443" } \ No newline at end of file diff --git a/scripts/setup/create_multinode.sh b/scripts/setup/create_multinode.sh index 1d7fde802..bf52c5681 100755 --- a/scripts/setup/create_multinode.sh +++ b/scripts/setup/create_multinode.sh @@ -52,6 +52,11 @@ if [ $PODS_PER_NODE -gt 1022 ]; then exit 1 fi +if [ "$#" -lt $CONTROL_PLANE_REPLICAS ]; then + echo "Not enough nodes to set up the requested number of control plane replicas." + exit 1 +fi + server_exec() { ssh -oStrictHostKeyChecking=no -p 22 "$1" "$2"; } @@ -60,7 +65,7 @@ common_init() { internal_init() { server_exec $1 "git clone --branch=$VHIVE_BRANCH https://github.com/vhive-serverless/vhive" - server_exec $1 "pushd ~/vhive/scripts > /dev/null && ./install_go.sh && source /etc/profile && go build -o setup_tool && ./setup_tool setup_node ${OPERATION_MODE} && popd > /dev/null" + server_exec $1 "pushd ~/vhive/scripts > /dev/null && ./install_go.sh && source /etc/profile && go build -o setup_tool && ./setup_tool setup_node $2 ${OPERATION_MODE} && popd > /dev/null" server_exec $1 'tmux new -s containerd -d' server_exec $1 'tmux send -t containerd "sudo containerd 2>&1 | tee ~/containerd_log.txt" ENTER' @@ -75,9 +80,19 @@ common_init() { #server_exec $1 '~/loader/scripts/setup/stabilize.sh' } + NODE_COUNTER=1 for node in "$@" do - internal_init "$node" & + # Set up API Server load balancer arguments + HA_SETTING="REGULAR" + if [ "$NODE_COUNTER" -eq 1 ]; then + HA_SETTING="MASTER" + elif [ "$NODE_COUNTER" -le $CONTROL_PLANE_REPLICAS ]; then + HA_SETTING="BACKUP" + fi + + internal_init "$node" $HA_SETTING & + let NODE_COUNTER++ done wait @@ -92,7 +107,11 @@ function setup_master() { server_exec $MASTER_NODE '~/loader/scripts/setup/rewrite_yaml_files.sh' +<<<<<<< HEAD MN_CLUSTER="pushd ~/vhive/scripts > /dev/null && ./setup_tool create_multinode_cluster ${OPERATION_MODE} && popd > /dev/null" +======= + MN_CLUSTER="pushd ~/vhive/scripts > /dev/null && ./setup_tool create_multinode_cluster ${OPERATION_MODE} ${CONTROL_PLANE_REPLICAS} && popd > /dev/null" +>>>>>>> 78e9b94 (K8s high-availability mode) server_exec "$MASTER_NODE" "tmux send -t master \"$MN_CLUSTER\" ENTER" # Get the join token from k8s. @@ -100,11 +119,19 @@ function setup_master() { sleep 1 done + MASTER_LOGIN_TOKEN=$(server_exec "$MASTER_NODE" \ + 'awk '\''/^ApiserverAdvertiseAddress:/ {ip=$2} \ + /^ApiserverPort:/ {port=$2} \ + /^ApiserverToken:/ {token=$2} \ + /^ApiserverDiscoveryToken:/ {discovery_token=$2} \ + /^ApiserverCertificateKey:/ {certificate_key=$2} \ + END {print "sudo kubeadm join " ip ":" port " --token " token " --discovery-token-ca-cert-hash " discovery_token " --control-plane --certificate-key " certificate_key}'\'' ~/vhive/scripts/masterKey.yaml') + LOGIN_TOKEN=$(server_exec "$MASTER_NODE" \ 'awk '\''/^ApiserverAdvertiseAddress:/ {ip=$2} \ /^ApiserverPort:/ {port=$2} \ /^ApiserverToken:/ {token=$2} \ - /^ApiserverTokenHash:/ {token_hash=$2} \ + /^ApiserverDiscoveryToken:/ {token_hash=$2} \ END {print "sudo kubeadm join " ip ":" port " --token " token " --discovery-token-ca-cert-hash " token_hash}'\'' ~/vhive/scripts/masterKey.yaml') } @@ -132,8 +159,13 @@ function setup_workers() { setup_vhive_firecracker_daemon $node fi - server_exec $node "sudo ${LOGIN_TOKEN}" - echo "Worker node $node has joined the cluster." + if [ $2 -eq "MASTER" ]; then + server_exec $node "sudo ${MASTER_LOGIN_TOKEN}" + echo "Backup master node $node has joined the cluster." + else + server_exec $node "sudo ${LOGIN_TOKEN}" + echo "Worker node $node has joined the cluster." + fi # Stretch the capacity of the worker node to 240 (k8s default: 110) # Empirically, this gives us a max. #pods being 240-40=200 @@ -144,13 +176,25 @@ function setup_workers() { #server_exec $node 'sleep 10' # Rejoin has to be performed although errors will be thrown. Otherwise, restarting the kubelet will cause the node unreachable for some reason - #server_exec $node "sudo ${LOGIN_TOKEN} > /dev/null 2>&1" - #echo "Worker node $node joined the cluster (again :P)." + #if [ $2 -eq "MASTER" ]; then + # server_exec $node "sudo ${MASTER_LOGIN_TOKEN} > /dev/null 2>&1" + # echo "Backup master node $node joined the cluster (again :P)." + #else + # server_exec $node "sudo ${LOGIN_TOKEN} > /dev/null 2>&1" + # echo "Worker node $node joined the cluster (again :P)." + #fi } + NODE_COUNTER=1 for node in "$@" do - internal_setup "$node" & + # Set up API Server load balancer arguments + HA_SETTING="" + if [ "$NODE_COUNTER" -le $CONTROL_PLANE_REPLICAS ]; then + HA_SETTING="MASTER" + fi + + internal_setup "$node" $HA_SETTING & done wait diff --git a/scripts/setup/setup.cfg b/scripts/setup/setup.cfg index 83665e5c6..1a7983be3 100755 --- a/scripts/setup/setup.cfg +++ b/scripts/setup/setup.cfg @@ -1,5 +1,8 @@ -VHIVE_BRANCH='v1.7' -LOADER_BRANCH='remove_master_scheduling' +VHIVE_BRANCH='k8s_ha_mode' +LOADER_BRANCH='ha_k8s' CLUSTER_MODE='container' # choose from {container, firecracker, firecracker_snapshots} PODS_PER_NODE=240 DEPLOY_PROMETHEUS=false + +# High-availability K8s control plane (default: 1, recommendation: 3 or 5) +CONTROL_PLANE_REPLICAS=3 From bd77ff344c7eba787c702023fdb2090f82217a15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lazar=20Cvetkovi=C4=87?= Date: Mon, 19 Feb 2024 17:50:16 +0100 Subject: [PATCH 4/8] Change of API Server port MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lazar Cvetković --- config/kube.json | 2 +- scripts/setup/create_multinode.sh | 14 ++++++++++---- scripts/setup/setup.cfg | 6 +++--- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/config/kube.json b/config/kube.json index 7f1c5ec96..dbdeb5156 100644 --- a/config/kube.json +++ b/config/kube.json @@ -10,5 +10,5 @@ "ApiserverDiscoveryToken": "", "ApiserverCertificateKey": "", "CPHAEndpoint": "10.0.1.254", - "CPHAPort": "8443" + "CPHAPort": "6443" } \ No newline at end of file diff --git a/scripts/setup/create_multinode.sh b/scripts/setup/create_multinode.sh index bf52c5681..ac563af74 100755 --- a/scripts/setup/create_multinode.sh +++ b/scripts/setup/create_multinode.sh @@ -57,6 +57,11 @@ if [ "$#" -lt $CONTROL_PLANE_REPLICAS ]; then exit 1 fi +if [ "$CONTROL_PLANE_REPLICAS" != 1 ] && [ "$CONTROL_PLANE_REPLICAS" != 3 ] && [ "$CONTROL_PLANE_REPLICAS" != 5 ]; then + echo "Number of control plane replicas can only be 1, 3, or 5." + exit 1 +fi + server_exec() { ssh -oStrictHostKeyChecking=no -p 22 "$1" "$2"; } @@ -159,7 +164,7 @@ function setup_workers() { setup_vhive_firecracker_daemon $node fi - if [ $2 -eq "MASTER" ]; then + if [ "$2" = "MASTER" ]; then server_exec $node "sudo ${MASTER_LOGIN_TOKEN}" echo "Backup master node $node has joined the cluster." else @@ -188,13 +193,14 @@ function setup_workers() { NODE_COUNTER=1 for node in "$@" do - # Set up API Server load balancer arguments + # Set up API Server load balancer arguments - Less than because 1 CP is the "main" master node already HA_SETTING="" - if [ "$NODE_COUNTER" -le $CONTROL_PLANE_REPLICAS ]; then + if [ "$NODE_COUNTER" -lt $CONTROL_PLANE_REPLICAS ]; then HA_SETTING="MASTER" fi internal_setup "$node" $HA_SETTING & + let NODE_COUNTER++ done wait @@ -286,7 +292,7 @@ function copy_k8s_certificates() { namespace_info=$(server_exec $MASTER_NODE "kubectl get namespaces") done - echo "Master node $MASTER_NODE finalised." + echo "Master node $MASTER_NODE finalized." # Copy API server certificates from master to each worker node copy_k8s_certificates "$@" diff --git a/scripts/setup/setup.cfg b/scripts/setup/setup.cfg index 1a7983be3..dd5fe56fa 100755 --- a/scripts/setup/setup.cfg +++ b/scripts/setup/setup.cfg @@ -1,8 +1,8 @@ -VHIVE_BRANCH='k8s_ha_mode' -LOADER_BRANCH='ha_k8s' +VHIVE_BRANCH='main' +LOADER_BRANCH='main' CLUSTER_MODE='container' # choose from {container, firecracker, firecracker_snapshots} PODS_PER_NODE=240 DEPLOY_PROMETHEUS=false -# High-availability K8s control plane (default: 1, recommendation: 3 or 5) +# K8s control plane replicas (default: 1, for high-availability use 3 or 5) CONTROL_PLANE_REPLICAS=3 From 51daea0a6a52d9eaa35c5d75b5211108d48830ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lazar=20Cvetkovi=C4=87?= Date: Wed, 21 Feb 2024 11:09:39 +0100 Subject: [PATCH 5/8] Documentation for HA mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lazar Cvetković --- docs/loader.md | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/docs/loader.md b/docs/loader.md index 5ad1ca9f3..efe9d3689 100644 --- a/docs/loader.md +++ b/docs/loader.md @@ -12,9 +12,10 @@ can choose the APT cluster `d430` node. ### vHive cluster First, configure `script/setup/setup.cfg`. You can specify there which vHive branch to use, loader branch, operation -mode (sandbox type), and maximum number of pods per node. All these configurations are mandatory. We currently support +mode (sandbox type), maximum number of pods per node, whether to deploy Prometheus metrics collector, and the number of +Kubernetes control plane replicas (for high-availability). All these configurations are mandatory. We currently support the following modes: containerd (`container`), Firecracker (`firecracker`), and Firecracker with -snapshots (`firecracker_snapshots`).Loader will be cloned on every node specified as argument of the cluster create +snapshots (`firecracker_snapshots`). Loader will be cloned on every node specified as argument of the cluster create script. The same holds for Kubernetes API server certificate. * To create a multi-node cluster, specify the node addresses as the arguments and run the following command: @@ -124,9 +125,10 @@ For to configure the workload for load generator, please refer to `docs/configur There are a couple of constants that should not be exposed to the users. They can be examined and changed in `pkg/common/constants.go`. -Sample sizes appropriate for performance evaluation vary depending on the platform. -As a starting point for fine-tuning, we suggest at most 5 functions per core with SMT disabled. -For example, 80 functions for a 16-core node. With larger sample sizes, trace replaying may lead to failures in function invocations. +Sample sizes appropriate for performance evaluation vary depending on the platform. +As a starting point for fine-tuning, we suggest at most 5 functions per core with SMT disabled. +For example, 80 functions for a 16-core node. With larger sample sizes, trace replaying may lead to failures in function +invocations. ## Build the image for a synthetic function @@ -147,7 +149,7 @@ For testing cold start performance: $ make ``` -Pushing the images will require a write access to Github packages connected to this repository. Please refer to +Pushing the images will require a write access to Github packages connected to this repository. Please refer to [this guide](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry#authenticating-with-a-personal-access-token-classic) for authentication instructions. @@ -184,15 +186,23 @@ For instructions on how to use the loader with OpenWhisk go to `openwhisk_setup/ Currently supported vendors: AWS Setup Instructions: + 1. Sign up for a Serverless account [here](https://app.serverless.com/) -2. Install Serverless framework via command line `npm install -g serverless` to allow our Go code to interact with the Serverless CLI framework e.g. `sls deploy` -3. Follow their [setup guide](https://www.serverless.com/framework/docs/getting-started) to link the respective cloud providers +2. Install Serverless framework via command line `npm install -g serverless` to allow our Go code to interact with the + Serverless CLI framework e.g. `sls deploy` +3. Follow their [setup guide](https://www.serverless.com/framework/docs/getting-started) to link the respective cloud + providers To run the default Loader: + 1. Change the `Platform` value in `cmd/config.json` to one of those specified in `docs/configuration.md` -2. ~~Build the Go binary in Linux: `go build -v -o ./server/trace-func-go/aws/trace_func ./server/trace-func-go/aws/trace_func.go` (For Window users, please use WSL to build the binary)~~ +2. ~~Build the Go binary in + Linux: `go build -v -o ./server/trace-func-go/aws/trace_func ./server/trace-func-go/aws/trace_func.go` (For Window + users, please use WSL to build the binary)~~ 3. Start the experiments: `go run cmd/loader.go --config cmd/config.json --verbosity trace` Note: -- Current deployment is via zip file where the Go binary has to be built prior to running the code. For now, the Go binary has been packaged in this repo. Otherwise, refer to Step 2 above to build your custom Go binary. + +- Current deployment is via zip file where the Go binary has to be built prior to running the code. For now, the Go + binary has been packaged in this repo. Otherwise, refer to Step 2 above to build your custom Go binary. - Refer to [Single Execution](#single-execution) section for more details on the experiment configurations. \ No newline at end of file From 0969fac9e4b8b5e20aa4ce0eb18fe6921d601f1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lazar=20Cvetkovi=C4=87?= Date: Wed, 21 Feb 2024 18:45:07 +0100 Subject: [PATCH 6/8] Fixed issues with node labelling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lazar Cvetković --- scripts/setup/create_multinode.sh | 39 +++++++++++++++++++------------ 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/scripts/setup/create_multinode.sh b/scripts/setup/create_multinode.sh index ac563af74..ddf587532 100755 --- a/scripts/setup/create_multinode.sh +++ b/scripts/setup/create_multinode.sh @@ -138,6 +138,9 @@ function setup_master() { /^ApiserverToken:/ {token=$2} \ /^ApiserverDiscoveryToken:/ {token_hash=$2} \ END {print "sudo kubeadm join " ip ":" port " --token " token " --discovery-token-ca-cert-hash " token_hash}'\'' ~/vhive/scripts/masterKey.yaml') + + server_exec $MASTER_NODE "kubectl taint nodes \$(hostname) node-role.kubernetes.io/control-plane-" + server_exec $MASTER_NODE "kubectl label nodes \$(hostname) loader-nodetype=master" } function setup_vhive_firecracker_daemon() { @@ -166,9 +169,19 @@ function setup_workers() { if [ "$2" = "MASTER" ]; then server_exec $node "sudo ${MASTER_LOGIN_TOKEN}" + server_exec $node "kubectl taint nodes \$(hostname) node-role.kubernetes.io/control-plane-" + server_exec $node "kubectl label nodes \$(hostname) loader-nodetype=master" echo "Backup master node $node has joined the cluster." else server_exec $node "sudo ${LOGIN_TOKEN}" + + if [ "$3" = "LOADER" ]; then + # First node after the control plane nodes + server_exec $node "kubectl label nodes \$(hostname) loader-nodetype=monitoring" < /dev/null + else + server_exec $node "kubectl label nodes \$(hostname) loader-nodetype=worker" < /dev/null + fi + echo "Worker node $node has joined the cluster." fi @@ -194,12 +207,16 @@ function setup_workers() { for node in "$@" do # Set up API Server load balancer arguments - Less than because 1 CP is the "main" master node already - HA_SETTING="" + HA_SETTING="OTHER" + LOADER_NODE="OTHER" + if [ "$NODE_COUNTER" -lt $CONTROL_PLANE_REPLICAS ]; then HA_SETTING="MASTER" + elif [ "$NODE_COUNTER" -eq $CONTROL_PLANE_REPLICAS ]; then + LOADER_NODE="LOADER" fi - internal_setup "$node" $HA_SETTING & + internal_setup "$node" "$HA_SETTING" "$LOADER_NODE" & let NODE_COUNTER++ done @@ -274,15 +291,17 @@ function copy_k8s_certificates() { shift # make argument list only contain worker nodes (drops master node) setup_master + + # Copy API server certificates from master to each worker node + copy_k8s_certificates "$@" + + # Join cluster setup_workers "$@" if [ $PODS_PER_NODE -gt 240 ]; then extend_CIDR "$@" fi - # Untaint master to schedule knative control plane there - server_exec $MASTER_NODE "kubectl taint nodes \$(hostname) node-role.kubernetes.io/control-plane-" - # Notify the master that all nodes have joined the cluster server_exec $MASTER_NODE 'tmux send -t master "y" ENTER' @@ -292,18 +311,8 @@ function copy_k8s_certificates() { namespace_info=$(server_exec $MASTER_NODE "kubectl get namespaces") done - echo "Master node $MASTER_NODE finalized." - - # Copy API server certificates from master to each worker node - copy_k8s_certificates "$@" - server_exec $MASTER_NODE 'cd loader; bash scripts/setup/patch_init_scale.sh' - source $DIR/label.sh - - # Force placement of metrics collectors and instrumentation on the loader node and control plane on master - label_nodes $MASTER_NODE $1 # loader node is second on the list, becoming first after arg shift - # patch knative to accept nodeselector server_exec $MASTER_NODE "cd loader; kubectl patch configmap config-features -n knative-serving -p '{\"data\": {\"kubernetes.podspec-nodeselector\": \"enabled\"}}'" From 4bae0c3b0d972eb7be5e167d6eb0814e9b825418 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lazar=20Cvetkovi=C4=87?= Date: Fri, 22 Mar 2024 11:12:03 +0100 Subject: [PATCH 7/8] Fixing bugs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lazar Cvetković --- scripts/setup/create_multinode.sh | 4 ---- scripts/setup/setup.cfg | 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/scripts/setup/create_multinode.sh b/scripts/setup/create_multinode.sh index ddf587532..4c1ff4e85 100755 --- a/scripts/setup/create_multinode.sh +++ b/scripts/setup/create_multinode.sh @@ -112,11 +112,7 @@ function setup_master() { server_exec $MASTER_NODE '~/loader/scripts/setup/rewrite_yaml_files.sh' -<<<<<<< HEAD - MN_CLUSTER="pushd ~/vhive/scripts > /dev/null && ./setup_tool create_multinode_cluster ${OPERATION_MODE} && popd > /dev/null" -======= MN_CLUSTER="pushd ~/vhive/scripts > /dev/null && ./setup_tool create_multinode_cluster ${OPERATION_MODE} ${CONTROL_PLANE_REPLICAS} && popd > /dev/null" ->>>>>>> 78e9b94 (K8s high-availability mode) server_exec "$MASTER_NODE" "tmux send -t master \"$MN_CLUSTER\" ENTER" # Get the join token from k8s. diff --git a/scripts/setup/setup.cfg b/scripts/setup/setup.cfg index dd5fe56fa..afbfafbf2 100755 --- a/scripts/setup/setup.cfg +++ b/scripts/setup/setup.cfg @@ -1,5 +1,5 @@ -VHIVE_BRANCH='main' -LOADER_BRANCH='main' +VHIVE_BRANCH='k8s_ha_mode' +LOADER_BRANCH='ha_k8s' CLUSTER_MODE='container' # choose from {container, firecracker, firecracker_snapshots} PODS_PER_NODE=240 DEPLOY_PROMETHEUS=false From 40546b63cade9113a8c27e5632f39b03aa38333c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lazar=20Cvetkovi=C4=87?= Date: Sun, 18 Aug 2024 21:02:30 +0200 Subject: [PATCH 8/8] Control plane replica set to 1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lazar Cvetković --- scripts/setup/setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/setup/setup.cfg b/scripts/setup/setup.cfg index afbfafbf2..3f2321b43 100755 --- a/scripts/setup/setup.cfg +++ b/scripts/setup/setup.cfg @@ -5,4 +5,4 @@ PODS_PER_NODE=240 DEPLOY_PROMETHEUS=false # K8s control plane replicas (default: 1, for high-availability use 3 or 5) -CONTROL_PLANE_REPLICAS=3 +CONTROL_PLANE_REPLICAS=1