Skip to content

Commit

Permalink
Some minor fixes and refactor (#17)
Browse files Browse the repository at this point in the history
* fix typo for instance type option in usage function

* Add cloud provider option/argument and set it default for aws

This commit adds the cloud provider option for the user to set
different cloud provider and have `aws` as default. It also refactor
the script to put aws logic to that specific cloud provider block.

* Add UserKnownHostsFile=/dev/null to ssh options

It should avoid retry when same IP is part of known host file

```
Add correct host key in /home/prkumar/.ssh/known_hosts to get rid of this message.
Offending ED25519 key in /home/prkumar/.ssh/known_hosts:147
Password authentication is disabled to avoid man-in-the-middle attacks.
Keyboard-interactive authentication is disabled to avoid man-in-the-middle attacks.
UpdateHostkeys is disabled because the host key is not trusted.
```

* Use -q, quiet option for ssh command to avoid showing warning on terminal

* Patch machine config for update the ssh key

Till now machine config daemon was failing with error following

```
E1228 06:10:55.013342    7353 writer.go:200] Marking Degraded due to: failed to parse Ignition for validation: parsing Ignition config spec v3.2 failed with error: config is not valid
Report: error at line 9 col 603: json: cannot unmarshal string into Go struct field PasswdUser.passwd.users.sshAuthorizedKeys of type []types.SSHAuthorizedKey
```

Also even we update the `/etc/machine-config/currentconfig` then as soon
as the machine config operator start it replace the old ssh key. This PR
make sure that ssh key is updated as per machine config operator.

* Use GEN_PRIVATE_KEY for generated ssh key which is used for ssh

As part of 0e92e0cb1113eda6f0243ff990c36426d704c1e4 we now update the
ssh public key using machine config operator and because of that till
the cluster mco is patched properly generated key is not going to work
so better to use both for all ssh communication.
  • Loading branch information
praveenkumar authored Jan 13, 2023
1 parent c0fc289 commit eb78756
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 34 deletions.
6 changes: 3 additions & 3 deletions common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,11 @@ wait_instance_readiness(){
}


get_instance_id() {
get_instance_id_aws() {
$JQ -r '.Instances[0].InstanceId' $1
}

get_instance_public_ip(){
get_instance_public_ip_aws(){
INSTANCE_IP=""
while [ -z $INSTANCE_IP ]
do
Expand All @@ -67,7 +67,7 @@ get_instance_public_ip(){
echo "$INSTANCE_IP"
}

get_instance_private_ip(){
get_instance_private_ip_aws(){
$JQ -r '.Instances[0].PrivateIpAddress' $1
}

91 changes: 64 additions & 27 deletions crc-cloud.sh
Original file line number Diff line number Diff line change
Expand Up @@ -121,30 +121,30 @@ swap_ssh_key() {
pr_info "swapping default key with the one just created"
$SSHKEYGEN -f $WORKDIR/id_rsa -q -N ''
stop_if_failed $? "failed to generate the key pair"
$SCP -o StrictHostKeychecking=no -P $SSH_PORT -i $PRIVATE_KEY $WORKDIR/id_rsa.pub core@$EIP:.
$SCP -q -o StrictHostKeychecking=no -o UserKnownHostsFile=/dev/null -P $SSH_PORT -i $PRIVATE_KEY $WORKDIR/id_rsa.pub core@$EIP:.
stop_if_failed $? "failed to upload the public key on the machine @ $EIP"
$SSH -o StrictHostKeychecking=no -p $SSH_PORT -i $PRIVATE_KEY core@$EIP "cat /home/core/id_rsa.pub > /home/core/.ssh/authorized_keys"
$SSH -q -o StrictHostKeychecking=no -o UserKnownHostsFile=/dev/null -p $SSH_PORT -i $PRIVATE_KEY core@$EIP "cat /home/core/id_rsa.pub > /home/core/.ssh/authorized_keys"
stop_if_failed $? "failed to swap the key on the authorized_keys @ $EIP"
#after swapping on VM private key is replaced by the new one
PRIVATE_KEY=$WORKDIR/id_rsa
GEN_PRIVATE_KEY=$WORKDIR/id_rsa
}

inject_and_run_cluster_setup() {
pr_info "injecting the setup script on the machine & running it [next logs will be remote]"
$SCP -o StrictHostKeychecking=no -P $SSH_PORT -i $PRIVATE_KEY $WORKDIR/cluster_setup.sh core@$EIP:/var/home/core/
$SSH -o StrictHostKeychecking=no -p $SSH_PORT -i $PRIVATE_KEY core@$EIP "chmod +x /var/home/core/cluster_setup.sh"
$SSH -o StrictHostKeychecking=no -p $SSH_PORT -i $PRIVATE_KEY core@$EIP "sudo /var/home/core/cluster_setup.sh"
$SCP -o StrictHostKeychecking=no -o UserKnownHostsFile=/dev/null -P $SSH_PORT -i $PRIVATE_KEY -i $GEN_PRIVATE_KEY $WORKDIR/cluster_setup.sh core@$EIP:/var/home/core/
$SSH -q -o StrictHostKeychecking=no -o UserKnownHostsFile=/dev/null -p $SSH_PORT -i $PRIVATE_KEY -i $GEN_PRIVATE_KEY core@$EIP "chmod +x /var/home/core/cluster_setup.sh"
$SSH -q -o StrictHostKeychecking=no -o UserKnownHostsFile=/dev/null -p $SSH_PORT -i $PRIVATE_KEY -i $GEN_PRIVATE_KEY core@$EIP "sudo /var/home/core/cluster_setup.sh"
}

tail_cluster_setup() {
pr_info "reading from VM logs from remote instance"
pr_info "waiting the log to be created, hang on...."
$SSH -o StrictHostKeychecking=no -p $SSH_PORT -i $PRIVATE_KEY core@$EIP "while [ ! -f /tmp/$RANDOM_SUFFIX.log ]; do sleep 1; done"
$SSH -q -o StrictHostKeychecking=no -o UserKnownHostsFile=/dev/null -p $SSH_PORT -i $PRIVATE_KEY -i $GEN_PRIVATE_KEY core@$EIP "while [ ! -f /tmp/$RANDOM_SUFFIX.log ]; do sleep 1; done"
pr_info "log detected, printing remote output on $EIP:"
PREVIOUS_LINE=""
while :
do
LINE=$($SSH -o StrictHostKeychecking=no -p $SSH_PORT -i $PRIVATE_KEY core@$EIP "sudo tail -n 1 /tmp/$RANDOM_SUFFIX.log")
LINE=$($SSH -q -o StrictHostKeychecking=no -o UserKnownHostsFile=/dev/null -p $SSH_PORT -i $PRIVATE_KEY -i $GEN_PRIVATE_KEY core@$EIP "sudo tail -n 1 /tmp/$RANDOM_SUFFIX.log")
stop_if_failed $? "impossible to get the logs from $EIP"
#xargs used to remove leading space
if [[ $LINE =~ "[ERR]" ]]
Expand All @@ -169,7 +169,7 @@ tail_cluster_setup() {

get_remote_log() {
pr_info "getting remote setup log"
$SCP -o StrictHostKeychecking=no -P $SSH_PORT -i $PRIVATE_KEY core@$EIP:/tmp/$RANDOM_SUFFIX.log $WORKDIR/remote.log
$SCP -o StrictHostKeychecking=no -o UserKnownHostsFile=/dev/null -P $SSH_PORT -i $PRIVATE_KEY -i $GEN_PRIVATE_KEY core@$EIP:/tmp/$RANDOM_SUFFIX.log $WORKDIR/remote.log
stop_if_failed $? "impossible to get the logs from $EIP"
}

Expand All @@ -184,11 +184,19 @@ set_cluster_infos() {
create () {
SECONDS=0
prepare_workdir
create_ec2_resources
case $CLOUD_PROVIDER in
"aws")
create_ec2_resources
INSTANCE_ID=`get_instance_id_aws $WORKDIR/$INSTANCE_DESCRIPTION`
IIP=`get_instance_private_ip_aws $WORKDIR/$INSTANCE_DESCRIPTION`
EIP=`get_instance_public_ip_aws $INSTANCE_ID`
;;
*)
echo "Unknown cloud provider"
usuage
exit 1
esac

INSTANCE_ID=`get_instance_id $WORKDIR/$INSTANCE_DESCRIPTION`
IIP=`get_instance_private_ip $WORKDIR/$INSTANCE_DESCRIPTION`
EIP=`get_instance_public_ip $INSTANCE_ID`

wait_instance_readiness $EIP
swap_ssh_key
Expand All @@ -205,7 +213,15 @@ create () {
teardown() {
WORKDIR="$WORKDIR_PATH/$TEARDOWN_RUN_ID"
[ ! -d $WORKDIR ] && stop_if_failed 1 "$WORKDIR not found, please provide a correct path"
destroy_ec2_resources
case $CLOUD_PROVIDER in
"aws")
destroy_ec2_resources
;;
*)
echo "Unknown cloud provider"
exit 1
;;
esac
}

set_workdir_dependent_variables() {
Expand All @@ -223,21 +239,23 @@ usage() {
usage="
Cluster Creation :
$(basename "$0") -C -p pull secret path [-d developer user password] [-k kubeadmin user password] [-r redhat user password] [-a AMI ID] [-t Instance type]
$(basename "$0") -C -p pull secret path [-i cloud provider] [-d developer user password] [-k kubeadmin user password] [-r redhat user password] [-a AMI ID] [-t Instance type]
where:
-C Cluster Creation mode
-i Cloud/Infra provider (optional, default: $CLOUD_PROVIDER)
-p pull secret file path (download from https://console.redhat.com/openshift/create/local)
-d developer user password (optional, default: $PASS_DEVELOPER)
-k kubeadmin user password (optional, default: $PASS_KUBEADMIN)
-r redhat user password (optional, default: $PASS_REDHAT)
-a AMI ID (Amazon Machine Image) from which the VM will be Instantiated (optional, default: $AMI_ID)
-i EC2 Instance Type (optional, default; $INSTANCE_TYPE)
-a Image ID (Cloud provider Machine Image) from which the VM will be Instantiated (optional, default: $AMI_ID)
-t Cloud provider Instance Type (optional, default; $INSTANCE_TYPE)
-h show this help text
Cluster Teardown:
$(basename "$0") -T [-v run id]
$(basename "$0") -T [-i cloud provider] [-v run id]
-T Cluster Teardown mode
-i Cloud/Infra provider (optional, default: $CLOUD_PROVIDER)
-v The Id of the run that is gonna be destroyed, corresponds with the numeric name of the folders created in workdir (optional, default: latest)
-h show this help text
"
Expand All @@ -254,8 +272,6 @@ JQ=`which jq 2>/dev/null`
[[ $? != 0 ]] && stop_if_failed 1 "[DEPENDENCY MISSING]: jq, please install it and try again"
MD5SUM=`which md5sum 2>/dev/null`
[[ $? != 0 ]] && stop_if_failed 1 "[DEPENDENCY MISSING]: md5sum, please install it and try again"
AWS=`which aws 2>/dev/null`
[[ $? != 0 ]] && stop_if_failed 1 "[DEPENDENCY MISSING]: aws cli, please install it and try again"
HEAD=`which head 2>/dev/null`
[[ $? != 0 ]] && stop_if_failed 1 "[DEPENDENCY MISSING]: head, please install it and try again"
SSHKEYGEN=`which ssh-keygen 2>/dev/null`
Expand All @@ -273,14 +289,13 @@ BASE64=`which base64 2>/dev/null`
FIGLET=`which figlet 2>/dev/null`
[[ $CONTAINER && ( $? != 0 ) ]] && stop_if_failed 1 "[DEPENDENCY MISSING]: figlet (container mode only), please install it and try again"



##CONST
SSH_PORT="22"
RUN_TIMESTAMP=`date +%s`
INSTANCE_DESCRIPTION="instance_description.json"
RANDOM_SUFFIX=`echo $RANDOM | $MD5SUM | $HEAD -c 8`
PRIVATE_KEY="id_ecdsa_crc"
GEN_PRIVATE_KEY=""
CLUSTER_INFOS_FILE="cluster_infos.json"
TEMPLATES="templates"
TEARDOWN_MAX_RETRIES=500
Expand All @@ -295,6 +310,7 @@ AMI_ID="ami-0569ce8a44f2351be"
[ -z $WORKDIR_PATH ] && WORKDIR_PATH="workdir"
[ -z $WORKING_MODE ] && WORKING_MODE=""
[ -z $TEARDOWN_RUN_ID ] && TEARDOWN_RUN_ID="latest"
[ -z $CLOUD_PROVIDER ] && CLOUD_PROVIDER="aws"
[ -z $CREATE_RUN_ID ] && CREATE_RUN_ID=$RUN_TIMESTAMP

##ARGS
Expand All @@ -313,14 +329,23 @@ then
[[ ! -d $WORKDIR_PATH ]] && stop_if_failed 1 "please mount the workdir filesystem, refer to README.md for further instructions"
[[ ! -w $WORKDIR_PATH ]] && \
stop_if_failed 1 "please grant write permissions to the host folder mounted as volume, please refer to README.md for further instructions"
#check AWS credentials
[[ -z $AWS_ACCESS_KEY_ID ]] && stop_if_failed 1 "AWS_ACCESS_KEY_ID must be set, please refer to AWS CLI documentation https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html"
[[ -z $AWS_SECRET_ACCESS_KEY ]] && stop_if_failed 1 "AWS_ACCESS_KEY_ID must be set, please refer to AWS CLI documentation https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html"
[[ -z $AWS_DEFAULT_REGION ]] && stop_if_failed 1 "AWS_ACCESS_KEY_ID must be set, please refer to AWS CLI documentation https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html"
#check the cloud provider env and set env variable accordingly
case $CLOUD_PROVIDER in
"aws")
#check AWS credentials
[[ -z $AWS_ACCESS_KEY_ID ]] && stop_if_failed 1 "AWS_ACCESS_KEY_ID must be set, please refer to AWS CLI documentation https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html"
[[ -z $AWS_SECRET_ACCESS_KEY ]] && stop_if_failed 1 "AWS_ACCESS_KEY_ID must be set, please refer to AWS CLI documentation https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html"
[[ -z $AWS_DEFAULT_REGION ]] && stop_if_failed 1 "AWS_ACCESS_KEY_ID must be set, please refer to AWS CLI documentation https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html"
;;
*)
echo "Unknown provider"
exit 1
;;
esac
[[ ( $WORKING_MODE == "T" ) && ($TEARDOWN_RUN_ID == "latest") ]] && stop_if_failed 1 "TEARDOWN_RUN_ID must be set in container mode. Please set this value with the run id that you want to teardown, refer to README.md for further instructions"
else
set_workdir_dependent_variables
options=':h:CTp:d:k:r:a:t:v:'
options=':h:CTp:d:k:r:a:t:v:i:'
while getopts $options option; do
case "$option" in
h) usage;;
Expand All @@ -333,6 +358,7 @@ else
a) AMI_ID=$OPTARG;;
t) INSTANCE_TYPE=$OPTARG;;
v) TEARDOWN_RUN_ID=$OPTARG;;
i) CLOUD_PROVIDER=$OPTARG;;
:) printf "missing argument for -%s\n" "$OPTARG" >&2; usage;;
\?) printf "illegal option: -%s\n" "$OPTARG" >&2; usage;;
esac
Expand All @@ -341,6 +367,17 @@ else
##VARIABLE SANITY CHECKS

#WORKING MODE CHECK
case $CLOUD_PROVIDER in
"aws")
AWS=`which aws 2>/dev/null`
[[ $? != 0 ]] && stop_if_failed 1 "[DEPENDENCY MISSING]: aws cli, please install it and try again"
;;
*)
echo "unknown cloud provider"
usage
exit 1
esac

[[ (-z $WORKING_MODE ) ]] && echo -e "\nERROR: Working mode must be set\n" && usage
[[ ( $WORKING_MODE != "C" ) && ( $WORKING_MODE != "T" ) ]] && echo \
-e "\nERROR: Working mode Must be either -C (creation) or -T (teardown), not $WORKING_MODE\n" && usage
Expand Down
23 changes: 19 additions & 4 deletions templates/cluster_setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,26 @@ check_cluster_access_with_new_ca() {
done
}

wait_for_resource() {
local resource=$1
local retry=0
local max_retry=20
until `oc get $resource > /dev/null 2>&1`
do
[[$retry == $max_retry]] && stop_if_failed 1 "impossible to get resource ${resource}"
pr_info "waiting for ${resource} to become available try $retry, hang on...."
sleep 5
((retry++))
done
}

#Replaces the default pubkey with the new one just generated to avoid the mysterious service to replace it later on :-\
replace_default_pubkey() {
pr_info "replacing the default public key from /etc/machine-config-daemon/currentconfig"
cat <<< $(jq --arg pubkey "$(cat /home/core/id_rsa.pub)" '.spec.config.passwd.users[0].sshAuthorizedKeys=$pubkey' /etc/machine-config-daemon/currentconfig) > /etc/machine-config-daemon/currentconfig
stop_if_failed $? "failed to replace public key"
pr_info "Updating the public key resource for machine config operator"
local pub_key=$(tr -d '\n\r' < /home/core/id_rsa.pub)
wait_for_resource machineconfig
oc patch machineconfig 99-master-ssh -p "{\"spec\": {\"config\": {\"passwd\": {\"users\": [{\"name\": \"core\", \"sshAuthorizedKeys\": [\"${pub_key}\"]}]}}}}" --type merge
stop_if_failed $? "failed to update public key to machine config operator"
}

setup_dsnmasq(){
Expand Down Expand Up @@ -237,10 +252,10 @@ set_credentials() {
stop_if_failed $? "failed to replace Cluster secret"
}

replace_default_pubkey
setup_dsnmasq

enable_and_start_kubelet
replace_default_pubkey
wait_cluster_become_healthy "etcd|openshift-apiserver"
stop_if_failed $? "failed to recover Cluster after $(expr $CLUSTER_HEALTH_RETRIES \* $CLUSTER_HEALTH_SLEEP) seconds"

Expand Down

0 comments on commit eb78756

Please sign in to comment.