Skip to content

Commit

Permalink
Merge pull request rook#14842 from jhoblitt/bugfix/rgw-multisite-testing
Browse files Browse the repository at this point in the history
test: improve reliability of canary rgw-multisite-testing
  • Loading branch information
BlaineEXE authored Oct 14, 2024
2 parents 09562fd + 92d9f99 commit 9baf983
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 92 deletions.
71 changes: 69 additions & 2 deletions .github/workflows/canary-integration-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1528,8 +1528,75 @@ jobs:
# Calls the replace_ceph_image helper with the test cluster manifest path and
# the workflow's `ceph-image` dispatch input.
- name: set Ceph version in CephCluster manifest
run: tests/scripts/github-action-helper.sh replace_ceph_image "deploy/examples/cluster-test.yaml" "${{ github.event.inputs.ceph-image }}"

# NOTE(review): this step points at the local action under
# ./.github/workflows/rgw-multisite-test, whose action.yml is reported as
# deleted further down this page; it appears to be the removed side of the
# diff (the steps that follow replace it inline) - confirm before relying on it.
- name: run RGW multisite test
uses: ./.github/workflows/rgw-multisite-test
# Runs the repository-local action located at ./.github/workflows/canary-test-config.
- name: setup cluster resources
uses: ./.github/workflows/canary-test-config

# Installs the s3cmd package with apt-get.
# The step shell runs bash without profile/rc files, with -e and pipefail
# (abort on the first failing command or pipeline stage) and -x (trace each
# command to the log).
- name: install additional deps for object testing
shell: bash --noprofile --norc -eo pipefail -x {0}
run: |
sudo apt-get install -y s3cmd
# Picks the extra block device reported by the helper, prepares it with
# use_local_disk, then splits it into two bluestore partitions (one per OSD).
- name: use local disk into two partitions
  shell: bash --noprofile --norc -eo pipefail -x {0}
  run: |
    # Assign first, export second: `export VAR=$(cmd)` returns export's own
    # status (0), which would hide a failing find_extra_block_dev from `-e`.
    BLOCK="/dev/$(tests/scripts/github-action-helper.sh find_extra_block_dev)"
    export BLOCK
    tests/scripts/github-action-helper.sh use_local_disk
    tests/scripts/create-bluestore-partitions.sh --disk "$BLOCK" --osd-count 2
    # Print the resulting partition layout to the job log for debugging.
    sudo lsblk
# Deploys the first Rook cluster via the helper, creates the resources in
# object-multisite-test.yaml, then blocks until the object store is reported
# in the awaited state.
- name: deploy first cluster rook
shell: bash --noprofile --norc -eo pipefail -x {0}
run: |
tests/scripts/github-action-helper.sh deploy_first_rook_cluster
kubectl create -f deploy/examples/object-multisite-test.yaml
# wait for multisite-store to be created
# wait_for arguments: kind, name, namespace, timeout (480).
tests/scripts/github-action-helper.sh wait_for cephobjectstore multisite-store rook-ceph 480
# Rewrites object-multisite-pull-realm-test.yaml so the second cluster pulls
# the realm from the first cluster: the pull endpoint is set to the first
# cluster's RGW service IP and the placeholder base64 keys in the manifest are
# replaced with the values stored in the realm-a-keys secret.
- name: prep second cluster pull realm config
  shell: bash --noprofile --norc -eo pipefail -x {0}
  run: |
    cd deploy/examples/
    IP_ADDR=$(kubectl -n rook-ceph get svc rook-ceph-rgw-multisite-store -o jsonpath="{.spec.clusterIP}")
    # jsonpath prints nothing (exit 0) for a missing field; fail here rather
    # than writing a broken endpoint or empty keys into the manifest.
    : "${IP_ADDR:?rook-ceph-rgw-multisite-store has no clusterIP}"
    yq w -i -d1 object-multisite-pull-realm-test.yaml spec.pull.endpoint "http://${IP_ADDR}:80"
    BASE64_ACCESS_KEY=$(kubectl -n rook-ceph get secrets realm-a-keys -o jsonpath="{.data.access-key}")
    : "${BASE64_ACCESS_KEY:?realm-a-keys has no access-key}"
    BASE64_SECRET_KEY=$(kubectl -n rook-ceph get secrets realm-a-keys -o jsonpath="{.data.secret-key}")
    : "${BASE64_SECRET_KEY:?realm-a-keys has no secret-key}"
    # Use '|' as the sed delimiter: the base64 alphabet contains '/', which
    # would terminate an s/…/…/ expression early, but never '|' or '&'.
    sed -i 's|VzFjNFltMVdWRTFJWWxZelZWQT0=|'"$BASE64_ACCESS_KEY"'|g' object-multisite-pull-realm-test.yaml
    sed -i 's|WVY1MFIxeExkbG84U3pKdlRseEZXVGR3T3k1U1dUSS9KaTFoUVE9PQ==|'"$BASE64_SECRET_KEY"'|g' object-multisite-pull-realm-test.yaml
# Deploys the second Rook cluster via the helper, creates the resources in
# object-multisite-pull-realm-test.yaml, then blocks until the secondary
# object store is reported in the awaited state.
- name: deploy second cluster rook
shell: bash --noprofile --norc -eo pipefail -x {0}
run: |
tests/scripts/github-action-helper.sh deploy_second_rook_cluster
kubectl create -f deploy/examples/object-multisite-pull-realm-test.yaml
# wait for realms to be pulled and zone-b-multisite-store to be created
# wait_for arguments: kind, name, namespace, timeout (480).
tests/scripts/github-action-helper.sh wait_for cephobjectstore zone-b-multisite-store rook-ceph-secondary 480
# Waits for the CephCluster named my-cluster in each of the two namespaces.
# No timeout argument is passed, so wait_for applies its own default.
- name: wait for both ceph clusters to be ready
shell: bash --noprofile --norc -eo pipefail -x {0}
run: |
tests/scripts/github-action-helper.sh wait_for cephcluster my-cluster rook-ceph
tests/scripts/github-action-helper.sh wait_for cephcluster my-cluster rook-ceph-secondary
# Delegates the cross-cluster replication check to the helper's
# test_multisite_object_replication entry point.
- name: write an object to one cluster, read from the other
shell: bash --noprofile --norc -eo pipefail -x {0}
run: |
tests/scripts/github-action-helper.sh test_multisite_object_replication
# if this test fails, it could mean the RGW `period get` or `period update` output has changed
# Flow: (1) check the operator log for the "committing changes" message for
# each object store, (2) restart the operator, (3) wait for the "no changes to
# commit" message for each zone. Order matters: step 3 only means something
# after the restart in step 2.
- name: RGW configuration period should be committed on first reconcile and not be committed on second reconcile
shell: bash --noprofile --norc -eo pipefail -x {0}
run: |
# The names are wrapped in literal double quotes (single-quoted here) so the
# search string contains them verbatim.
ns_name_primary_object_store='"rook-ceph/multisite-store"' # double quotes intended
ns_name_secondary_object_store='"rook-ceph-secondary/zone-b-multisite-store"' # double quotes intended
committed_msg="committing changes to RGW configuration period for CephObjectStore"
tests/scripts/github-action-helper.sh verify_operator_log_message "${committed_msg} ${ns_name_primary_object_store}"
tests/scripts/github-action-helper.sh verify_operator_log_message "${committed_msg} ${ns_name_secondary_object_store}"
# NOTE(review): the "no changes" message below says CephObjectStore but is
# paired with zone names rather than the object-store names used above -
# confirm this matches the operator's actual log text.
ns_name_primary_object_zone='"rook-ceph/zone-a"' # double quotes intended
ns_name_secondary_object_zone='"rook-ceph-secondary/zone-b"' # double quotes intended
tests/scripts/github-action-helper.sh restart_operator
not_committed_msg="there are no changes to commit for RGW configuration period for CephObjectStore"
# Second argument (600) is passed to wait_for_operator_log_message -
# presumably a timeout in seconds; verify against the helper.
tests/scripts/github-action-helper.sh wait_for_operator_log_message "${not_committed_msg} ${ns_name_primary_object_zone}" 600
tests/scripts/github-action-helper.sh wait_for_operator_log_message "${not_committed_msg} ${ns_name_secondary_object_zone}" 600
- name: collect common logs
if: always()
Expand Down
75 changes: 0 additions & 75 deletions .github/workflows/rgw-multisite-test/action.yml

This file was deleted.

35 changes: 20 additions & 15 deletions tests/scripts/github-action-helper.sh
Original file line number Diff line number Diff line change
Expand Up @@ -476,23 +476,28 @@ function deploy_second_rook_cluster() {
deploy_toolbox
}

# NOTE(review): this span is a rendered diff hunk without +/- markers. The
# lines that poll rgw pods/deployments (wait_for_rgw) appear to be the removed
# side and the wait_for lines the added side - confirm against the repository
# before editing either.
function wait_for_rgw() {
for _ in {1..120}; do
if [ "$(kubectl -n "$1" get pod -l app=rook-ceph-rgw --no-headers --field-selector=status.phase=Running | wc -l)" -ge 1 ]; then
echo "rgw pod is found"
break
# wait_for KIND NAME [NAMESPACE] [TIMEOUT] [STATUS]
#
# Polls a single Kubernetes resource until its status.phase equals STATUS.
#
# Arguments:
#   $1 kind    - resource kind passed to `kubectl get` (required)
#   $2 name    - resource name (required)
#   $3 ns      - namespace; defaults to rook-ceph
#   $4 timeout - budget compared against elapsed seconds; defaults to 120
#   $5 status  - phase string to wait for; defaults to Ready
# Outputs: progress lines on stdout; the timeout message on stderr.
# Returns: 0 once the phase matches. On timeout it calls `exit 1`, which
#          terminates the calling shell, not just this function.
function wait_for() {
# ${N?msg}: if the positional parameter is unset, bash prints msg and a
# non-interactive shell exits.
local kind=${1?kind is required}
local name=${2?resource name is required}
local ns=${3:-rook-ceph}
local timeout=${4:-120}
local status=${5:-Ready}

# SECONDS is bash's running count of seconds since shell start; the
# difference from start_time gives elapsed time without calling `date`.
local start_time="${SECONDS}"
local elapsed_time=0
# The budget is checked with the value computed before the previous sleep, so
# the real wall time can overrun the timeout by about one poll interval.
while [[ $elapsed_time -lt $timeout ]]; do
# A kubectl failure here yields an empty string, which simply does not match;
# inside a [[ ]] condition it does not trip `set -e`.
if [[ "$(kubectl -n "$ns" get "$kind" "$name" -o 'jsonpath={..status.phase}')" == "$status" ]]; then
echo "${kind}/${name} in ${ns} is ${status} - elapsed time ${elapsed_time}s"
return 0
fi
echo "waiting for rgw pods"
sleep 5
done
for _ in {1..120}; do
if [ "$(kubectl -n "$1" get deployment -l app=rook-ceph-rgw -o yaml | yq read - 'items[0].status.readyReplicas')" -ge 1 ]; then
echo "rgw is ready"
break
fi
echo "waiting for rgw becomes ready"

elapsed_time=$((SECONDS - start_time))
echo "waiting for ${kind}/${name} in ${ns} to be ${status} - elapsed time ${elapsed_time}s"
# Fixed 5-second pause between polls.
sleep 5
done

echo "timed out waiting for ${kind}/${name} in ${ns} to be ${status} - elapsed time ${elapsed_time}s " >&2
exit 1
}

function verify_operator_log_message() {
Expand Down Expand Up @@ -561,7 +566,7 @@ function write_object_read_from_replica_cluster() {
# a direct sub-shell.
S3CMD_ERROR=0
(
sleep 60
sleep 300
kill -s SIGUSR1 $$
) 2>/dev/null &
trap "{ S3CMD_ERROR=1; break; }" SIGUSR1
Expand Down

0 comments on commit 9baf983

Please sign in to comment.