Setup scripts for Kubernetes control plane high-availability mode #379

Open · wants to merge 8 commits into base: main
18 changes: 11 additions & 7 deletions config/kube.json
@@ -1,10 +1,14 @@
{
"K8sVersion": "1.29.1",
"AlternativeImageRepo": "",
"ApiserverAdvertiseAddress": "",
"PodNetworkCidr": "10.168.0.0/16",
"ApiserverPort": "6443",
"ApiserverToken": "",
"ApiserverTokenHash": "",
"CalicoVersion": "3.27.2"
"CalicoVersion": "3.27.2",
"AlternativeImageRepo": "",
"ApiserverAdvertiseAddress": "",
"PodNetworkCidr": "10.168.0.0/16",
"PodNetworkAddonConfigURL": "https://raw.githubusercontent.com/vhive-serverless/vHive/main/configs/calico/canal.yaml",
"ApiserverPort": "6443",
"ApiserverToken": "",
"ApiserverDiscoveryToken": "",
"ApiserverCertificateKey": "",
"CPHAEndpoint": "10.0.1.254",
"CPHAPort": "6443"
}
30 changes: 20 additions & 10 deletions docs/loader.md
@@ -12,9 +12,10 @@ can choose the APT cluster `d430` node.
### vHive cluster

First, configure `script/setup/setup.cfg`. You can specify there which vHive branch to use, loader branch, operation
mode (sandbox type), maximum number of pods per node, whether to deploy Prometheus metrics collector, and the number of
Kubernetes control plane replicas (for high-availability). All these configurations are mandatory. We currently support
the following modes: containerd (`container`), Firecracker (`firecracker`), and Firecracker with
snapshots (`firecracker_snapshots`). Loader will be cloned on every node specified as argument of the cluster create
script. The same holds for the Kubernetes API server certificate.
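
For reference, a minimal `setup.cfg` for a high-availability run might look roughly like the sketch below; the values are illustrative (the defaults shipped with this PR appear in the `scripts/setup/setup.cfg` diff further down):

```
VHIVE_BRANCH='main'
LOADER_BRANCH='main'
CLUSTER_MODE='container'      # choose from {container, firecracker, firecracker_snapshots}
PODS_PER_NODE=240
DEPLOY_PROMETHEUS=false
CONTROL_PLANE_REPLICAS=3      # 1 for a single control plane; 3 or 5 for high-availability
```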

* To create a multi-node cluster, specify the node addresses as the arguments and run the following command:
@@ -124,9 +125,10 @@ For to configure the workload for load generator, please refer to `docs/configur
There are a couple of constants that should not be exposed to the users. They can be examined and changed
in `pkg/common/constants.go`.

Sample sizes appropriate for performance evaluation vary depending on the platform.
As a starting point for fine-tuning, we suggest at most 5 functions per core with SMT disabled.
For example, 80 functions for a 16-core node. With larger sample sizes, trace replaying may lead to failures in function
invocations.

## Build the image for a synthetic function

@@ -147,7 +149,7 @@ For testing cold start performance:
$ make <trace-firecracker|trace-container|empty-firecracker|empty-container>
```

Pushing the images requires write access to the GitHub packages connected to this repository. Please refer to
[this guide](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry#authenticating-with-a-personal-access-token-classic)
for authentication instructions.
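
As a rough sketch of what that guide describes (the token and username below are placeholders), authenticating Docker against the GitHub Container Registry with a classic personal access token looks like:

```
$ export CR_PAT=<your-personal-access-token>
$ echo $CR_PAT | docker login ghcr.io -u <github-username> --password-stdin
```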

@@ -184,15 +186,23 @@ For instructions on how to use the loader with OpenWhisk go to `openwhisk_setup/
Currently supported vendors: AWS

Setup Instructions:

1. Sign up for a Serverless account [here](https://app.serverless.com/)
2. Install Serverless framework via command line `npm install -g serverless` to allow our Go code to interact with the
Serverless CLI framework e.g. `sls deploy`
3. Follow their [setup guide](https://www.serverless.com/framework/docs/getting-started) to link the respective cloud
providers

To run the default Loader:

1. Change the `Platform` value in `cmd/config.json` to one of those specified in `docs/configuration.md`
2. ~~Build the Go binary in
Linux: `go build -v -o ./server/trace-func-go/aws/trace_func ./server/trace-func-go/aws/trace_func.go` (For Windows
users, please use WSL to build the binary)~~
3. Start the experiments: `go run cmd/loader.go --config cmd/config.json --verbosity trace`

Note:

- Current deployment is via zip file where the Go binary has to be built prior to running the code. For now, the Go
binary has been packaged in this repo. Otherwise, refer to Step 2 above to build your custom Go binary.
- Refer to [Single Execution](#single-execution) section for more details on the experiment configurations.
23 changes: 23 additions & 0 deletions scripts/create_dirigent_config.py
@@ -0,0 +1,23 @@
import os

import pandas as pd

# Root directory with per-sample folders, each containing an invocations.csv
path = '/home/lcvetkovic/Downloads/day6_hour8_samples/samples'

for sample in os.listdir(path):
    df = pd.read_csv(f'{path}/{sample}/invocations.csv')

    # One Dirigent deployment entry per function in the trace sample
    rows = []
    for _, row in df.iterrows():
        rows.append({
            'HashFunction': row['HashFunction'],
            'Image': 'docker.io/cvetkovic/dirigent_trace_function:latest',
            'Port': 80,
            'Protocol': 'tcp',
            'ScalingUpperBound': 10000,
            'ScalingLowerBound': 0,
            'IterationMultiplier': 155,
        })

    # Build the frame in one go (DataFrame.append is deprecated/removed in recent pandas)
    dirigent = pd.DataFrame(rows)
    dirigent.to_csv(f'{path}/{sample}/dirigent.csv', index=False)
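
A brief usage note (a sketch, assuming `path` is edited to point at your own trace samples and that `pandas` is installed): every sample directory is expected to contain an `invocations.csv` with a `HashFunction` column, and the script writes a `dirigent.csv` next to it.

```
$ python3 scripts/create_dirigent_config.py
$ ls <samples>/<sample_dir>/
dirigent.csv  invocations.csv  ...
```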
111 changes: 83 additions & 28 deletions scripts/setup/create_multinode.sh
@@ -52,6 +52,16 @@ if [ $PODS_PER_NODE -gt 1022 ]; then
exit 1
fi

if [ "$#" -lt $CONTROL_PLANE_REPLICAS ]; then
echo "Not enough nodes to set up the requested number of control plane replicas."
exit 1
fi

if [ "$CONTROL_PLANE_REPLICAS" != 1 ] && [ "$CONTROL_PLANE_REPLICAS" != 3 ] && [ "$CONTROL_PLANE_REPLICAS" != 5 ]; then
echo "Number of control plane replicas can only be 1, 3, or 5."
exit 1
fi

server_exec() {
ssh -oStrictHostKeyChecking=no -p 22 "$1" "$2";
}
@@ -60,7 +70,7 @@ common_init() {
internal_init() {
server_exec $1 "git clone --branch=$VHIVE_BRANCH https://github.com/vhive-serverless/vhive"

server_exec $1 "pushd ~/vhive/scripts > /dev/null && ./install_go.sh && source /etc/profile && go build -o setup_tool && ./setup_tool setup_node ${OPERATION_MODE} && popd > /dev/null"
server_exec $1 "pushd ~/vhive/scripts > /dev/null && ./install_go.sh && source /etc/profile && go build -o setup_tool && ./setup_tool setup_node $2 ${OPERATION_MODE} && popd > /dev/null"

server_exec $1 'tmux new -s containerd -d'
server_exec $1 'tmux send -t containerd "sudo containerd 2>&1 | tee ~/containerd_log.txt" ENTER'
@@ -72,12 +82,22 @@ common_init() {
server_exec $1 'sudo chronyc tracking'

clone_loader $1
server_exec $1 '~/loader/scripts/setup/stabilize.sh'
#server_exec $1 '~/loader/scripts/setup/stabilize.sh'
}

NODE_COUNTER=1
for node in "$@"
do
internal_init "$node" &
# Set up API Server load balancer arguments
HA_SETTING="REGULAR"
if [ "$NODE_COUNTER" -eq 1 ]; then
HA_SETTING="MASTER"
elif [ "$NODE_COUNTER" -le $CONTROL_PLANE_REPLICAS ]; then
HA_SETTING="BACKUP"
fi

internal_init "$node" $HA_SETTING &
let NODE_COUNTER++
done

wait
@@ -92,20 +112,31 @@ function setup_master() {

server_exec $MASTER_NODE '~/loader/scripts/setup/rewrite_yaml_files.sh'

MN_CLUSTER="pushd ~/vhive/scripts > /dev/null && ./setup_tool create_multinode_cluster ${OPERATION_MODE} && popd > /dev/null"
MN_CLUSTER="pushd ~/vhive/scripts > /dev/null && ./setup_tool create_multinode_cluster ${OPERATION_MODE} ${CONTROL_PLANE_REPLICAS} && popd > /dev/null"
server_exec "$MASTER_NODE" "tmux send -t master \"$MN_CLUSTER\" ENTER"

# Get the join token from k8s.
while ! server_exec "$MASTER_NODE" "[ -e ~/vhive/scripts/masterKey.yaml ]"; do
sleep 1
done

MASTER_LOGIN_TOKEN=$(server_exec "$MASTER_NODE" \
'awk '\''/^ApiserverAdvertiseAddress:/ {ip=$2} \
/^ApiserverPort:/ {port=$2} \
/^ApiserverToken:/ {token=$2} \
/^ApiserverDiscoveryToken:/ {discovery_token=$2} \
/^ApiserverCertificateKey:/ {certificate_key=$2} \
END {print "sudo kubeadm join " ip ":" port " --token " token " --discovery-token-ca-cert-hash " discovery_token " --control-plane --certificate-key " certificate_key}'\'' ~/vhive/scripts/masterKey.yaml')

LOGIN_TOKEN=$(server_exec "$MASTER_NODE" \
'awk '\''/^ApiserverAdvertiseAddress:/ {ip=$2} \
/^ApiserverPort:/ {port=$2} \
/^ApiserverToken:/ {token=$2} \
/^ApiserverTokenHash:/ {token_hash=$2} \
/^ApiserverDiscoveryToken:/ {token_hash=$2} \
END {print "sudo kubeadm join " ip ":" port " --token " token " --discovery-token-ca-cert-hash " token_hash}'\'' ~/vhive/scripts/masterKey.yaml')

server_exec $MASTER_NODE "kubectl taint nodes \$(hostname) node-role.kubernetes.io/control-plane-"
server_exec $MASTER_NODE "kubectl label nodes \$(hostname) loader-nodetype=master"
}

function setup_vhive_firecracker_daemon() {
@@ -132,25 +163,57 @@ function setup_workers() {
setup_vhive_firecracker_daemon $node
fi

server_exec $node "sudo ${LOGIN_TOKEN}"
echo "Worker node $node has joined the cluster."
if [ "$2" = "MASTER" ]; then
server_exec $node "sudo ${MASTER_LOGIN_TOKEN}"
server_exec $node "kubectl taint nodes \$(hostname) node-role.kubernetes.io/control-plane-"
server_exec $node "kubectl label nodes \$(hostname) loader-nodetype=master"
echo "Backup master node $node has joined the cluster."
else
server_exec $node "sudo ${LOGIN_TOKEN}"

if [ "$3" = "LOADER" ]; then
# First node after the control plane nodes
server_exec $node "kubectl label nodes \$(hostname) loader-nodetype=monitoring" < /dev/null
else
server_exec $node "kubectl label nodes \$(hostname) loader-nodetype=worker" < /dev/null
fi

echo "Worker node $node has joined the cluster."
fi

# Stretch the capacity of the worker node to 240 (k8s default: 110)
# Empirically, this gives us a maximum of 240-40=200 pods
echo "Stretching node capacity for $node."
server_exec $node "echo \"maxPods: ${PODS_PER_NODE}\" > >(sudo tee -a /var/lib/kubelet/config.yaml >/dev/null)"
server_exec $node "echo \"containerLogMaxSize: 512Mi\" > >(sudo tee -a /var/lib/kubelet/config.yaml >/dev/null)"
server_exec $node 'sudo systemctl restart kubelet'
server_exec $node 'sleep 10'
#echo "Stretching node capacity for $node."
#server_exec $node "echo \"maxPods: ${PODS_PER_NODE}\" > >(sudo tee -a /var/lib/kubelet/config.yaml >/dev/null)"
#server_exec $node "echo \"containerLogMaxSize: 512Mi\" > >(sudo tee -a /var/lib/kubelet/config.yaml >/dev/null)"
#server_exec $node 'sudo systemctl restart kubelet'
#server_exec $node 'sleep 10'

# Rejoin has to be performed even though errors will be thrown; otherwise, restarting the kubelet makes the node unreachable for some reason
server_exec $node "sudo ${LOGIN_TOKEN} > /dev/null 2>&1"
echo "Worker node $node joined the cluster (again :P)."
#if [ $2 -eq "MASTER" ]; then
# server_exec $node "sudo ${MASTER_LOGIN_TOKEN} > /dev/null 2>&1"
# echo "Backup master node $node joined the cluster (again :P)."
#else
# server_exec $node "sudo ${LOGIN_TOKEN} > /dev/null 2>&1"
# echo "Worker node $node joined the cluster (again :P)."
#fi
}

NODE_COUNTER=1
for node in "$@"
do
internal_setup "$node" &
# Assign node roles. Strictly less than, because the primary control plane (the "main" master node) already counts as one replica
HA_SETTING="OTHER"
LOADER_NODE="OTHER"

if [ "$NODE_COUNTER" -lt $CONTROL_PLANE_REPLICAS ]; then
HA_SETTING="MASTER"
elif [ "$NODE_COUNTER" -eq $CONTROL_PLANE_REPLICAS ]; then
LOADER_NODE="LOADER"
fi

internal_setup "$node" "$HA_SETTING" "$LOADER_NODE" &
let NODE_COUNTER++
done

wait
@@ -224,15 +287,17 @@ function copy_k8s_certificates() {
shift # make argument list only contain worker nodes (drops master node)

setup_master

# Copy API server certificates from master to each worker node
copy_k8s_certificates "$@"

# Join cluster
setup_workers "$@"

if [ $PODS_PER_NODE -gt 240 ]; then
extend_CIDR "$@"
fi

# Untaint master to schedule knative control plane there
server_exec $MASTER_NODE "kubectl taint nodes \$(hostname) node-role.kubernetes.io/control-plane-"

# Notify the master that all nodes have joined the cluster
server_exec $MASTER_NODE 'tmux send -t master "y" ENTER'

@@ -242,18 +307,8 @@ function copy_k8s_certificates() {
namespace_info=$(server_exec $MASTER_NODE "kubectl get namespaces")
done

echo "Master node $MASTER_NODE finalised."

# Copy API server certificates from master to each worker node
copy_k8s_certificates "$@"

server_exec $MASTER_NODE 'cd loader; bash scripts/setup/patch_init_scale.sh'

source $DIR/label.sh

# Force placement of metrics collectors and instrumentation on the loader node and control plane on master
label_nodes $MASTER_NODE $1 # loader node is second on the list, becoming first after arg shift

# patch knative to accept nodeselector
server_exec $MASTER_NODE "cd loader; kubectl patch configmap config-features -n knative-serving -p '{\"data\": {\"kubernetes.podspec-nodeselector\": \"enabled\"}}'"

12 changes: 0 additions & 12 deletions scripts/setup/rewrite_yaml_files.sh
@@ -46,18 +46,6 @@ sed -e '$d' > net-istio-yq.yaml
# serving-core.yaml

cat serving-core.yaml |
yq '
(
select
(
.spec.template.metadata.labels.app == "activator"
or .spec.template.metadata.labels.app == "autoscaler"
or .spec.template.metadata.labels.app == "controller"
or .spec.template.metadata.labels.app == "domain-mapping"
or .spec.template.metadata.labels.app == "domainmapping-webhook"
or .spec.template.metadata.labels.app == "webhook"
) | .spec.template.spec
) += {"nodeSelector": {"loader-nodetype": "master"}}' |
yq '
(
del
7 changes: 5 additions & 2 deletions scripts/setup/setup.cfg
@@ -1,5 +1,8 @@
VHIVE_BRANCH='v1.7'
LOADER_BRANCH='main'
VHIVE_BRANCH='k8s_ha_mode'
LOADER_BRANCH='ha_k8s'
CLUSTER_MODE='container' # choose from {container, firecracker, firecracker_snapshots}
PODS_PER_NODE=240
DEPLOY_PROMETHEUS=false

# K8s control plane replicas (default: 1, for high-availability use 3 or 5)
CONTROL_PLANE_REPLICAS=1
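
As a sketch of how `CONTROL_PLANE_REPLICAS` maps onto the node list given to `create_multinode.sh` (host names are placeholders, and the invocation follows the usual loader cluster-create command): with 3 replicas, the first node becomes the primary control plane, the next two join as backup control planes, the node after that is labelled as the loader/monitoring node, and any remaining nodes become plain workers.

```
# setup.cfg: CONTROL_PLANE_REPLICAS=3
$ bash ./scripts/setup/create_multinode.sh node-1 node-2 node-3 node-4 node-5
# node-1 -> primary control plane
# node-2, node-3 -> backup control planes
# node-4 -> loader / monitoring node
# node-5 -> worker
```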
8 changes: 8 additions & 0 deletions scripts/trigger_failures/delete_control_plane.sh
@@ -0,0 +1,8 @@
# Kill the Knative control plane (every knative-serving pod except the activator) and all kube-system pods;
# cut -c 5- strips the leading "pod/" from the resource names
for s in $(kubectl get pods -n knative-serving -o name | cut -c 5- | grep -v activator);
do
    kubectl delete pod $s -n knative-serving &
done
for s in $(kubectl get pods -n kube-system -o name | cut -c 5-);
do
    kubectl delete pod $s -n kube-system &
done
8 changes: 8 additions & 0 deletions scripts/trigger_failures/delete_data_plane.sh
@@ -0,0 +1,8 @@
# Kill the data plane: the Knative activator pods and everything in istio-system
for s in $(kubectl get pods -n knative-serving -o name | cut -c 5- | grep activator);
do
    kubectl delete pod $s -n knative-serving &
done
for s in $(kubectl get pods -n istio-system -o name | cut -c 5- );
do
    kubectl delete pod $s -n istio-system &
done