diff --git a/Makefile b/Makefile index da9787d76..9cb33d64b 100644 --- a/Makefile +++ b/Makefile @@ -16,6 +16,10 @@ test-with-ceph: mkdir -p tests/logs ANSIBLE_CONFIG=$(TEST_CONFIG) ansible-playbook -v -i $(TEST_INVENTORY) -e @$(TEST_VARS) -e @$(TEST_SECRETS) tests/playbooks/test_with_ceph.yaml 2>&1 | tee $(TEST_OUTFILE) +test-swift-migration: TEST_OUTFILE := tests/logs/test_swift_migration_out_$(shell date +%FT%T%Z).log +test-swift-migration: + mkdir -p tests/logs + ANSIBLE_CONFIG=$(TEST_CONFIG) ansible-playbook -v -i $(TEST_INVENTORY) -e @$(TEST_VARS) -e @$(TEST_SECRETS) tests/playbooks/test_swift_migration.yaml 2>&1 | tee $(TEST_OUTFILE) ### DOCS ### diff --git a/docs_user/assemblies/ceph_migration.adoc b/docs_user/assemblies/ceph_migration.adoc index b0ae0c2ad..17df37387 100644 --- a/docs_user/assemblies/ceph_migration.adoc +++ b/docs_user/assemblies/ceph_migration.adoc @@ -12,6 +12,7 @@ ifdef::context[:parent-context: {context}] include::../modules/ceph-rbd_migration.adoc[leveloffset=+1] include::../modules/ceph-rgw_migration.adoc[leveloffset=+1] include::../modules/ceph-mds_migration.adoc[leveloffset=+1] +include::../modules/ceph-monitoring_migration.adoc[leveloffset=+1] ifdef::parent-context[:context: {parent-context}] ifndef::parent-context[:!context:] diff --git a/docs_user/assemblies/swift_migration.adoc b/docs_user/assemblies/swift_migration.adoc new file mode 100644 index 000000000..c2bd28876 --- /dev/null +++ b/docs_user/assemblies/swift_migration.adoc @@ -0,0 +1,15 @@ +ifdef::context[:parent-context: {context}] + +[id="swift-migration_{context}"] + += Object Storage data migration + +:context: swift-migration + +:toc: left +:toclevels: 3 + +include::../modules/openstack-swift_migration.adoc[leveloffset=+1] + +ifdef::parent-context[:context: {parent-context}] +ifndef::parent-context[:!context:] diff --git a/docs_user/main.adoc b/docs_user/main.adoc index 0dd5b91cc..d4aa19256 100644 --- a/docs_user/main.adoc +++ b/docs_user/main.adoc @@ -9,3 +9,5 @@ include::assemblies/openstack_adoption.adoc[leveloffset=+1] include::assemblies/ceph_migration.adoc[leveloffset=+1] + +include::assemblies/swift_migration.adoc[leveloffset=+1] diff --git a/docs_user/modules/ceph-mds_migration.adoc b/docs_user/modules/ceph-mds_migration.adoc index f6028f01b..9d9fad1f8 100644 --- a/docs_user/modules/ceph-mds_migration.adoc +++ b/docs_user/modules/ceph-mds_migration.adoc @@ -122,7 +122,7 @@ Check the current OSD blocklist and clean up the client list: [ceph: root@controller-0 /]# ceph osd blocklist ls .. .. -for item in $(ceph osd blocklist ls | awk ‘{print $0}’); do +for item in $(ceph osd blocklist ls | awk '{print $0}'); do ceph osd blocklist rm $item; done ---- diff --git a/docs_user/modules/ceph-monitoring_migration.adoc b/docs_user/modules/ceph-monitoring_migration.adoc new file mode 100644 index 000000000..d1457b4d9 --- /dev/null +++ b/docs_user/modules/ceph-monitoring_migration.adoc @@ -0,0 +1,420 @@ +[id="migrating-ceph-monitoring_{context}"] + +//:context: migrating-ceph-monitoring +//kgilliga: This module might be converted to an assembly. + += Migrating Ceph Monitoring Stack + +In the context of data plane adoption, where the OpenStack services are +redeployed in OpenShift, a TripleO-deployed Ceph cluster will undergo a +migration in a process we are calling “externalizing” the Ceph cluster. 
+There are, broadly, two deployment topologies that include an "internal" Ceph
+cluster today: one where OpenStack includes dedicated Storage nodes to host the
+OSDs, and the other Hyperconverged Infrastructure (HCI), where Compute nodes
+double up as Storage nodes. In either scenario, some Ceph processes are
+deployed on OpenStack Controller nodes: Ceph monitors, rgw, rbd, mds,
+the Ceph dashboard and nfs-ganesha.
+The Ceph Dashboard module adds web-based monitoring and administration to the
+Ceph Manager.
+With director-deployed Ceph, this component is enabled as part of the overcloud
+deploy and it is composed of:
+
+- Ceph mgr module
+- Grafana
+- Prometheus
+- Alertmanager
+- Node exporter
+
+The Ceph Dashboard containers are included via the `tripleo-container-image-prepare`
+parameters, and high availability relies on `HAProxy` and `Pacemaker`
+deployed on the OpenStack front end.
+For an external Ceph cluster, high availability is not supported; the work
+is tracked in the https://bugzilla.redhat.com/show_bug.cgi?id=1902212[associated RHCS bugzilla].
+The goal of this procedure is to migrate and relocate the Ceph Monitoring
+components in order to free the controller nodes.
+
+
+== Requirements
+
+For this procedure, we assume that we are beginning with an OpenStack deployment
+based on Wallaby and a Ceph Reef cluster managed by TripleO.
+We assume that:
+
+* Ceph has been upgraded to Reef and is managed by cephadm/orchestrator
+* Both the Ceph public and cluster networks are propagated, via TripleO, to the
+  target nodes
+
+== Gather the current status of the Monitoring stack
+
+Before starting the relocation of the monitoring stack components, verify that
+the hosts have no `monitoring` label (or `grafana`, `prometheus`, `alertmanager`
+in the case of a per-daemon placement evaluation) associated.
+The entire relocation process is driven by cephadm and relies on **labels** being
+assigned to the target nodes, where the daemons are scheduled. Make sure to
+review the https://access.redhat.com/articles/1548993[cardinality matrix]
+before assigning labels, and choose carefully the nodes on which the monitoring
+stack components should be scheduled.
+
+
+[source,bash]
+----
+[tripleo-admin@controller-0 ~]$ sudo cephadm shell -- ceph orch host ls
+
+HOST                        ADDR           LABELS          STATUS
+cephstorage-0.redhat.local  192.168.24.11  osd mds
+cephstorage-1.redhat.local  192.168.24.12  osd mds
+cephstorage-2.redhat.local  192.168.24.47  osd mds
+controller-0.redhat.local   192.168.24.35  _admin mon mgr
+controller-1.redhat.local   192.168.24.53  mon _admin mgr
+controller-2.redhat.local   192.168.24.10  mon _admin mgr
+6 hosts in cluster
+----
+
+In addition, double-check that the cluster is healthy and that both `ceph orch ls` and
+`ceph orch ps` return the expected number of deployed daemons.
+
+== Review and update the container image registry
+
+If the Ceph externalization procedure is executed **after** the OpenStack control
+plane has been migrated, it is important to consider updating the container
+images referenced in the Ceph cluster config. The current container images
+point to the undercloud registry, which might no longer be available. As the
+undercloud won't be available in the future, replace the undercloud-provided
+images with images from an alternative registry.
+In case the desired option is to rely on the https://github.com/ceph/ceph/blob/reef/src/cephadm/cephadm.py#L48[default images]
+shipped by cephadm, remove the following config options from the Ceph cluster.
+
+
+[source,bash]
+----
+$ ceph config dump
+...
+...
+ifeval::["{build}" == "upstream"] +mgr advanced mgr/cephadm/container_image_alertmanager undercloud-0.ctlplane.redhat.local:8787/ceph/alertmanager:v0.25.0 +mgr advanced mgr/cephadm/container_image_base undercloud-0.ctlplane.redhat.local:8787/ceph/ceph:v18 +mgr advanced mgr/cephadm/container_image_grafana undercloud-0.ctlplane.redhat.local:8787/ceph/ceph-grafana:9.4.7 +mgr advanced mgr/cephadm/container_image_node_exporter undercloud-0.ctlplane.redhat.local:8787/ceph/node-exporter:v1.5.0 +mgr advanced mgr/cephadm/container_image_prometheus undercloud-0.ctlplane.redhat.local:8787/ceph/prometheus:v2.43.0 +endif::[] +ifeval::["{build}" == "downstream"] +mgr advanced mgr/cephadm/container_image_alertmanager undercloud-0.ctlplane.redhat.local:8787/rh-osbs/openshift-ose-prometheus-alertmanager:v4.10 +mgr advanced mgr/cephadm/container_image_base undercloud-0.ctlplane.redhat.local:8787/rh-osbs/rhceph +mgr advanced mgr/cephadm/container_image_grafana undercloud-0.ctlplane.redhat.local:8787/rh-osbs/grafana:latest +mgr advanced mgr/cephadm/container_image_node_exporter undercloud-0.ctlplane.redhat.local:8787/rh-osbs/openshift-ose-prometheus-node-exporter:v4.10 +mgr advanced mgr/cephadm/container_image_prometheus undercloud-0.ctlplane.redhat.local:8787/rh-osbs/openshift-ose-prometheus:v4.10 +endif::[] +---- + +Remove the undercloud Container Images + + +[source,bash] +---- +# remove the base image +cephadm shell -- ceph config rm mgr mgr/cephadm/container_image_base +# remove the undercloud images associated to the monitoring +# stack components +for i in prometheus grafana alertmanager node_exporter; do + cephadm shell -- ceph config rm mgr mgr/cephadm/container_image_$i +done +---- + +=== Note + +In the example above, in addition to the monitoring stack related +container images, we update the config entry related to the +container_image_base. This has an impact on all the Ceph daemons that rely on +the undercloud images. +New daemons will be deployed using the new/default Ceph image. + + +== Migrate Monitoring Stack to the target nodes + +The migration procedure relies on nodes re-labeling: this kind of action, +combined with an update in the existing spec, results in the daemons' +relocation on the target nodes. + +Before start this process, a few considerations are required: + +- there’s no need to migrate node exporters: these daemons are deployed across +the nodes that are part of the Ceph cluster (placement is ‘*’), and we’re +going to lose metrics as long as the controller nodes are not part of the ceph +cluster anymore + +- each monitoring stack component is bound to specific ports that TripleO is +supposed to open beforehand; make sure to double check the firewall rules are +in place and the ports are opened for a given monitoring stack service + + +== Extend the monitoring label to the target nodes + +Depending on the target nodes and the number of deployed/active daemons, it is +possible to either relocate the existing containers to the target nodes, or +select a subset of nodes that are supposed to host the monitoring stack +daemons. As we mentioned in the previous section, HA is not supported, hence +reducing the placement with `count: 1` is a reasonable solution and allows to +successfully migrate the existing daemons in an HCI (or HW limited) scenario +without impacting other services. +However, it is still possible to put in place a dedicated HA solution and +realize a component that is consistent with the TripleO model to reach HA. 
+Building and deploying such an HA model is out of scope for this procedure.
+
+
+=== Scenario 1: migrate the existing daemons to the target nodes
+
+
+Assuming we have three CephStorage (or ComputeHCI) nodes, this scenario extends the
+`monitoring` label to all the CephStorage (or ComputeHCI) nodes that are part
+of the cluster. This means that we keep the `count: 3` placement for the target
+nodes. This scenario is not recommended, as we already know that any form of HA
+is not supported for an external Ceph cluster.
+
+[source,bash]
+----
+for item in $(sudo cephadm shell -- ceph orch host ls --format json | jq -r '.[].hostname'); do
+    sudo cephadm shell -- ceph orch host label add $item monitoring;
+done
+----
+
+Verify that all the (three) hosts have the monitoring label:
+
+[source,bash]
+----
+[tripleo-admin@controller-0 ~]$ sudo cephadm shell -- ceph orch host ls
+
+HOST                        ADDR           LABELS
+cephstorage-0.redhat.local  192.168.24.11  osd monitoring
+cephstorage-1.redhat.local  192.168.24.12  osd monitoring
+cephstorage-2.redhat.local  192.168.24.47  osd monitoring
+controller-0.redhat.local   192.168.24.35  _admin mon mgr monitoring
+controller-1.redhat.local   192.168.24.53  mon _admin mgr monitoring
+controller-2.redhat.local   192.168.24.10  mon _admin mgr monitoring
+----
+
+Remove the labels from the controller nodes:
+
+[source,bash]
+----
+$ for i in 0 1 2; do ceph orch host label rm "controller-$i.redhat.local" monitoring; done
+
+Removed label monitoring from host controller-0.redhat.local
+Removed label monitoring from host controller-1.redhat.local
+Removed label monitoring from host controller-2.redhat.local
+----
+
+
+=== Scenario 2: reduce `count` to 1 and migrate the existing daemons to the target nodes
+
+Instead of adding a single `monitoring` label to all the target nodes, it is
+possible to relocate one instance of each monitoring stack daemon to a
+particular node.
+For example, assuming we have three target nodes, we can target each of them to +host a particular daemon instance: + + +[source,bash] +---- +[tripleo-admin@controller-0 ~]$ sudo cephadm shell -- ceph orch host ls | grep -i cephstorage + +HOST ADDR LABELS +cephstorage-0.redhat.local 192.168.24.11 osd ---> grafana +cephstorage-1.redhat.local 192.168.24.12 osd ---> prometheus +cephstorage-2.redhat.local 192.168.24.47 osd ---> alertmanager +---- + +As per the example above, add the appropriate labels to the target nodes: + +[source,bash] +---- +declare -A target_nodes + +target_nodes[grafana]=cephstorage-0 +target_nodes[prometheus]=cephstorage-1 +target_nodes[alertmanager]=cephstorage-2 + +for label in "${!target_nodes[@]}"; do + ceph orch host label add ${target_nodes[$label]} $label +done +---- + +Verify the labels are properly applied to the target nodes: + +[source,bash] +---- +[tripleo-admin@controller-0 ~]$ sudo cephadm shell -- ceph orch host ls | grep -i cephstorage + +HOST ADDR LABELS STATUS +cephstorage-0.redhat.local 192.168.24.11 osd grafana +cephstorage-1.redhat.local 192.168.24.12 osd prometheus +cephstorage-2.redhat.local 192.168.24.47 osd alertmanager +---- + +== Dump the current monitoring stack spec + + +[source,bash] +---- +function export_spec { + local component="$1" + local target_dir="$2" + sudo cephadm shell -- ceph orch ls --export "$component" > "$target_dir/$component" +} + +SPEC_DIR=${SPEC_DIR:-"$PWD/ceph_specs"} +for m in grafana prometheus alertmanager; do + export_spec "$m" "$SPEC_DIR" +done +---- + +For each daemon, edit the current spec and replace the placement/hosts section +with the placement/label section, for example, in case Scenario 1 is the +adopted approach: + +[source,yaml] +---- +service_type: grafana +service_name: grafana +placement: + label: monitoring +networks: +- 172.17.3.0/24 +spec: + port: 3100 +---- + +Otherwise, if **Scenario 2** represents the desired solution, we expect to see +an output like the following: + +[source,yaml] +---- +service_type: grafana +service_name: grafana +placement: + label: grafana +networks: +- 172.17.3.0/24 +spec: + port: 3100 +---- + +The same procedure applies to prometheus and alertmanager specs. + +== Apply the new monitoring spec to relocate the monitoring stack daemons: + +[source,bash] +---- +SPEC_DIR=${SPEC_DIR:-"$PWD/ceph_specs"} +function migrate_daemon { + local component="$1" + local target_dir="$2" + sudo cephadm shell -m "$target_dir" -- ceph orch apply -i /mnt/ceph_specs/$component +} +for m in grafana prometheus alertmanager; do + migrate_daemon "$m" "$SPEC_DIR" +done +---- + +The command above results in the Ceph monitoring stack daemons migration. +Verify the daemons have been deployed on the expected nodes: + +[source,bash] +---- +[ceph: root@controller-0 /]# ceph orch ps | grep -iE "(prome|alert|grafa)" +alertmanager.cephstorage-2 cephstorage-2.redhat.local 172.17.3.144:9093,9094 +grafana.cephstorage-0 cephstorage-0.redhat.local 172.17.3.83:3100 +prometheus.cephstorage-1 cephstorage-1.redhat.local 172.17.3.53:9092 +---- + +=== Notes + +With the procedure described above we lose High Availability: the monitoring +stack daemons have no VIP and haproxy anymore; Node exporters are still +running on all the nodes: instead of using labels we keep the current approach +as we want to not reduce the monitoring space covered. 
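+
+As a final sanity check for this step (a minimal sketch; the daemon and host
+names used here are the example values shown earlier in this section), you can
+confirm that node exporters are still reported on every host and that no
+Grafana, Prometheus or Alertmanager daemon is left on the controller nodes:
+
+[source,bash]
+----
+# node-exporter should still be listed for every host in the cluster
+sudo cephadm shell -- ceph orch ps --daemon_type node-exporter
+
+# no monitoring stack daemon should remain on the controllers
+sudo cephadm shell -- ceph orch ps | grep -iE "(prome|alert|grafa)" | grep -i controller || echo "controllers are clean"
+----
+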
+== Update the Ceph Dashboard mgr config
+
+An important aspect to address at this point is making sure that the Ceph
+config is aligned with the relocation that was just performed. Run
+the `ceph config dump` command and review the current config.
+In particular, focus on the following config entries:
+
+[source,bash]
+----
+mgr  advanced  mgr/dashboard/ALERTMANAGER_API_HOST  http://172.17.3.83:9093
+mgr  advanced  mgr/dashboard/GRAFANA_API_URL        https://172.17.3.144:3100
+mgr  advanced  mgr/dashboard/PROMETHEUS_API_HOST    http://172.17.3.83:9092
+mgr  advanced  mgr/dashboard/controller-0.ycokob/server_addr  172.17.3.33
+mgr  advanced  mgr/dashboard/controller-1.lmzpuc/server_addr  172.17.3.147
+mgr  advanced  mgr/dashboard/controller-2.xpdgfl/server_addr  172.17.3.138
+----
+
+Verify that the `grafana`, `alertmanager` and `prometheus` `API_HOST/URL` entries point to
+the IP addresses (on the storage network) of the nodes where each daemon has been
+relocated. This should be handled automatically by cephadm and shouldn't
+require any manual action.
+
+[source,bash]
+----
+[ceph: root@controller-0 /]# ceph orch ps | grep -iE "(prome|alert|grafa)"
+alertmanager.cephstorage-0  cephstorage-0.redhat.local  172.17.3.83:9093,9094
+alertmanager.cephstorage-1  cephstorage-1.redhat.local  172.17.3.53:9093,9094
+alertmanager.cephstorage-2  cephstorage-2.redhat.local  172.17.3.144:9093,9094
+grafana.cephstorage-0       cephstorage-0.redhat.local  172.17.3.83:3100
+grafana.cephstorage-1       cephstorage-1.redhat.local  172.17.3.53:3100
+grafana.cephstorage-2       cephstorage-2.redhat.local  172.17.3.144:3100
+prometheus.cephstorage-0    cephstorage-0.redhat.local  172.17.3.83:9092
+prometheus.cephstorage-1    cephstorage-1.redhat.local  172.17.3.53:9092
+prometheus.cephstorage-2    cephstorage-2.redhat.local  172.17.3.144:9092
+----
+
+
+[source,bash]
+----
+[ceph: root@controller-0 /]# ceph config dump
+...
+...
+mgr  advanced  mgr/dashboard/ALERTMANAGER_API_HOST   http://172.17.3.83:9093
+mgr  advanced  mgr/dashboard/PROMETHEUS_API_HOST     http://172.17.3.83:9092
+mgr  advanced  mgr/dashboard/GRAFANA_API_URL         https://172.17.3.144:3100
+----
+
+
+=== Note
+
+The **Ceph dashboard** (mgr module plugin) has not been impacted at all by this
+relocation. The service is provided by the Ceph mgr daemon, hence we might
+experience an impact when the active mgr is migrated or is force-failed.
+However, having three mgr replicas defined allows requests to be redirected to a
+different instance (it is still an A/P model), hence the impact should be
+limited.
When the RBD migration is over, the following Ceph config keys must +be regenerated to point to the right mgr container: + +[source,bash] +---- +mgr advanced mgr/dashboard/controller-0.ycokob/server_addr 172.17.3.33 +mgr advanced mgr/dashboard/controller-1.lmzpuc/server_addr 172.17.3.147 +mgr advanced mgr/dashboard/controller-2.xpdgfl/server_addr 172.17.3.138 +---- + + +[source,bash] +---- +$ sudo cephadm shell +$ ceph orch ps | awk '/mgr./ {print $1}' +---- + +and for each retrieved mgr, update the entry in the Ceph config: + +[source,bash] +---- +$ ceph config set mgr mgr/dashboard/<>/server_addr/ +---- + +== Useful resources + +* https://docs.ceph.com/en/reef/monitoring[ceph - monitoring] +* https://docs.ceph.com/en/reef/mgr/dashboard[ceph-mgr - dashboard] +* https://docs.ceph.com/en/reef/mgr/dashboard/#ssl-tls-support[ceph-dashboard - tls] diff --git a/docs_user/modules/openstack-barbican_adoption.adoc b/docs_user/modules/openstack-barbican_adoption.adoc index 56736f173..6eaf38852 100644 --- a/docs_user/modules/openstack-barbican_adoption.adoc +++ b/docs_user/modules/openstack-barbican_adoption.adoc @@ -49,6 +49,7 @@ spec: route: {} template: databaseInstance: openstack + databaseAccount: barbican databaseUser: barbican rabbitMqClusterName: rabbitmq secret: osp-secret diff --git a/docs_user/modules/openstack-cinder_adoption.adoc b/docs_user/modules/openstack-cinder_adoption.adoc index 45c0e23a1..739860317 100644 --- a/docs_user/modules/openstack-cinder_adoption.adoc +++ b/docs_user/modules/openstack-cinder_adoption.adoc @@ -691,6 +691,7 @@ spec: route: {} template: databaseInstance: openstack + databaseAccount: cinder secret: osp-secret cinderAPI: override: diff --git a/docs_user/modules/openstack-edpm_adoption.adoc b/docs_user/modules/openstack-edpm_adoption.adoc index c217d5812..772153126 100644 --- a/docs_user/modules/openstack-edpm_adoption.adoc +++ b/docs_user/modules/openstack-edpm_adoption.adoc @@ -260,7 +260,6 @@ spec: - configure-os - ssh-known-hosts - run-os - - reboot-os - install-certs - libvirt - nova-compute-extraconfig @@ -623,15 +622,15 @@ oc exec -it nova-cell1-conductor-0 -- nova-manage db online_data_migrations * Verify if Nova services can stop the existing test VM instance: + ---- -${BASH_ALIASES[openstack]} server list | grep -qF '| test | ACTIVE |' && openstack server stop test -${BASH_ALIASES[openstack]} server list | grep -qF '| test | SHUTOFF |' -${BASH_ALIASES[openstack]} server --os-compute-api-version 2.48 show --diagnostics test | grep "it is in power state shutdown" || echo PASS +${BASH_ALIASES[openstack]} server list | grep -qF '| test | ACTIVE |' && ${BASH_ALIASES[openstack]} server stop test || echo PASS +${BASH_ALIASES[openstack]} server list | grep -qF '| test | SHUTOFF |' || echo FAIL +${BASH_ALIASES[openstack]} server --os-compute-api-version 2.48 show --diagnostics test 2>&1 || echo PASS ---- * Verify if Nova services can start the existing test VM instance: + ---- -${BASH_ALIASES[openstack]} server list | grep -qF '| test | SHUTOFF |' && openstack server start test -${BASH_ALIASES[openstack]} server list | grep -F '| test | ACTIVE |' -${BASH_ALIASES[openstack]} server --os-compute-api-version 2.48 show --diagnostics test --fit-width -f json | jq -r '.state' | grep running +${BASH_ALIASES[openstack]} server list | grep -qF '| test | SHUTOFF |' && ${BASH_ALIASES[openstack]} server start test || echo PASS +${BASH_ALIASES[openstack]} server list | grep -F '| test | ACTIVE |' && \ + ${BASH_ALIASES[openstack]} server --os-compute-api-version 2.48 show 
--diagnostics test --fit-width -f json | jq -r '.state' | grep running || echo FAIL ---- diff --git a/docs_user/modules/openstack-glance_adoption.adoc b/docs_user/modules/openstack-glance_adoption.adoc index ac9bc8af6..b92b7e52c 100644 --- a/docs_user/modules/openstack-glance_adoption.adoc +++ b/docs_user/modules/openstack-glance_adoption.adoc @@ -45,6 +45,7 @@ spec: route: {} template: databaseInstance: openstack + databaseAccount: glance storageClass: "local-storage" storageRequest: 10G customServiceConfig: | diff --git a/docs_user/modules/openstack-heat_adoption.adoc b/docs_user/modules/openstack-heat_adoption.adoc index bf71dfaef..92eebc02a 100644 --- a/docs_user/modules/openstack-heat_adoption.adoc +++ b/docs_user/modules/openstack-heat_adoption.adoc @@ -71,6 +71,7 @@ spec: route: {} template: databaseInstance: openstack + databaseAccount: heat secret: osp-secret memcachedInstance: memcached passwordSelectors: diff --git a/docs_user/modules/openstack-manila_adoption.adoc b/docs_user/modules/openstack-manila_adoption.adoc index c45d6a0f9..1041cdd4d 100644 --- a/docs_user/modules/openstack-manila_adoption.adoc +++ b/docs_user/modules/openstack-manila_adoption.adoc @@ -313,6 +313,7 @@ spec: route: {} template: databaseInstance: openstack + databaseAccount: manila secret: osp-secret manilaAPI: replicas: 3 diff --git a/docs_user/modules/openstack-neutron_adoption.adoc b/docs_user/modules/openstack-neutron_adoption.adoc index 84dfe9fc8..ead1359da 100644 --- a/docs_user/modules/openstack-neutron_adoption.adoc +++ b/docs_user/modules/openstack-neutron_adoption.adoc @@ -50,6 +50,7 @@ spec: spec: type: LoadBalancer databaseInstance: openstack + databaseAccount: neutron secret: osp-secret networkAttachments: - internalapi diff --git a/docs_user/modules/openstack-placement_adoption.adoc b/docs_user/modules/openstack-placement_adoption.adoc index 5c777b9c3..d22a2672b 100644 --- a/docs_user/modules/openstack-placement_adoption.adoc +++ b/docs_user/modules/openstack-placement_adoption.adoc @@ -32,6 +32,7 @@ spec: route: {} template: databaseInstance: openstack + databaseAccount: placement secret: osp-secret override: service: diff --git a/docs_user/modules/openstack-planning.adoc b/docs_user/modules/openstack-planning.adoc index eee119e5c..aaf87e297 100644 --- a/docs_user/modules/openstack-planning.adoc +++ b/docs_user/modules/openstack-planning.adoc @@ -320,9 +320,6 @@ At the end of this process, you should have the following information: === IPAM planning -// TODO: explain which IP addresses will change during adoption, and which will -// stay the same. - The new deployment model puts additional burden on the size of IP allocation pools available for OpenStack services. This is because each service deployed on OpenShift worker nodes will now require an IP address from the IPAM pool (in @@ -376,7 +373,10 @@ one of the following scenarios to handle IPAM allocation in the new environment. The first listed scenario is more general and implies using new IP ranges, -while the second scenario implies reusing the existing ranges. +while the second scenario implies reusing the existing ranges. The end state of +the former scenario is using the new subnet ranges for control plane services, +but keeping the old ranges, with their node IP address allocations intact, for +EDP nodes. ==== Scenario 1: Use new subnet ranges @@ -386,16 +386,147 @@ addresses for the new control plane services. 
The general idea here is to define new IP ranges for control plane services that belong to a different subnet that was not used in the existing cluster. -Then, configure IP routing between the old and new subnets to allow old and new -service deployments to communicate. +Then, configure link local IP routing between the old and new subnets to allow +old and new service deployments to communicate. This involves using TripleO +mechanism on pre-adopted cluster to configure additional link local routes +there. This will allow EDP deployment to reach out to adopted nodes using their +old subnet addresses. The new subnet should be sized appropriately to accommodate the new control plane services, but otherwise doesn't have any specific requirements as to the -existing deployment allocation pools already consumed. +existing deployment allocation pools already consumed. Actually, the +requirements as to the size of the new subnet are lower than in the second +scenario, as the old subnet ranges are kept for the adopted nodes, which means +they don't consume any IP addresses from the new range. + +In this scenario, you will configure `NetworkAttachmentDefinition` CRs to use a +different subnet from what will be configured in `NetConfig` CR for the same +networks. The former range will be used for podified control plane services, +while the latter will be used to manage IPAM for EDP nodes. + +During the process, you will need to make sure that adopted node IP addresses +don't change during the adoption process. This is achieved by listing the +addresses in `fixedIP` fields in `OpenstackDataplaneNodeSet` per-node section. + +--- + +Before proceeding, configure host routes on the adopted nodes for the podified +control plane subnets. + +To achieve this, you will need to re-run `tripleo deploy` with additional +`routes` entries added to `network_config`. (This change should be applied +for every adopted node configuration.) For example, you may add the following +to `net_config.yaml`: + +```yaml +network_config: + - type: ovs_bridge + name: br-ctlplane + routes: + - ip_netmask: 0.0.0.0/0 + next_hop: 192.168.1.1 + - ip_netmask: 172.31.0.0/24 # <- new ctlplane subnet + next_hop: 192.168.1.100 # <- adopted node ctlplane IP address +``` + +Do the same for other networks that will need to use different subnets for the +new and old parts of the deployment. + +Once done, run `tripleo deploy` to apply the new configuration. + +Note that network configuration changes are not applied by default to avoid +risk of network disruption. You will have to enforce the changes by setting the +`StandaloneNetworkConfigUpdate: true` in the TripleO configuration files. + +Once `tripleo deploy` is complete, you should see new link local routes to the +new subnet on each node. For example, -// TODO: explain how routing between ranges can be configured. +```bash +# ip route | grep 172 +172.31.0.0/24 via 192.168.122.100 dev br-ctlplane +``` + +--- + +The next step is to configure similar routes for the old subnet for podified +services attached to the networks. This is done by adding `routes` entries to +`NodeNetworkConfigurationPolicy` CRs for each network. For example, + +```yaml + - destination: 192.168.122.0/24 + next-hop-interface: ospbr +``` + +Once applied, you should eventually see the following route added to your OCP nodes. 
+ +```bash +# ip route | grep 192 +192.168.122.0/24 dev ospbr proto static scope link +``` + +--- + +At this point, you should be able to ping the adopted nodes from OCP nodes +using their old subnet addresses; and vice versa. + +--- + + +Finally, during EDPM adoption, you will have to take care of several aspects: + +- in network_config, add link local routes to the new subnets, for example: + +```yaml + nodeTemplate: + ansible: + ansibleUser: root + ansibleVars: + additional_ctlplane_host_routes: + - ip_netmask: 172.31.0.0/24 + next_hop: '{{ ctlplane_ip }}' + edpm_network_config_template: | + network_config: + - type: ovs_bridge + routes: {{ ctlplane_host_routes + additional_ctlplane_host_routes }} + ... +``` + +- list the old IP addresses as `ansibleHost` and `fixedIP`, for example: + +```yaml + nodes: + standalone: + ansible: + ansibleHost: 192.168.122.100 + ansibleUser: "" + hostName: standalone + networks: + - defaultRoute: true + fixedIP: 192.168.122.100 + name: ctlplane + subnetName: subnet1 +``` + +- expand SSH range for the firewall configuration to include both subnets: + +```yaml + edpm_sshd_allowed_ranges: + - 192.168.122.0/24 + - 172.31.0.0/24 +``` + +This is to allow SSH access from the new subnet to the adopted nodes as well as +the old one. + +--- + +Since you are applying new network configuration to the nodes, consider also +setting `edpm_network_config_update: true` to enforce the changes. + +--- -// TODO: example configurations. +Note that the examples above are incomplete and should be incorporated into +your general configuration. ==== Scenario 2: Reuse existing subnet ranges @@ -650,8 +781,18 @@ interfaces. Since EDPM nodes are not OpenShift nodes, a different approach to configure their network connectivity is used. Instead, EDPM nodes are configured by `dataplane-operator` and its CRs. The CRs -define desired network configuration for the nodes. In case of adoption, the -configuration should reflect the existing network setup. +define desired network configuration for the nodes. + +In case of adoption, the configuration should reflect the existing network +setup. You should be able to pull `net_config.yaml` files from each node and +reuse it when defining `OpenstackDataplaneNodeSet`. The format of the +configuration hasn't changed (`os-net-config` is still being used under the +hood), so you should be able to put network templates under +`edpm_network_config_template` variables (either common for all nodes, or +per-node). + +To make sure the latest network configuration is used during EDPM adoption, you +should also set `edpm_network_config_update: true` in the `nodeTemplate`. You will proceed with <> once the OpenStack podified control plane is deployed in the diff --git a/docs_user/modules/openstack-swift_adoption.adoc b/docs_user/modules/openstack-swift_adoption.adoc index 474674dd6..5bd42d4a7 100644 --- a/docs_user/modules/openstack-swift_adoption.adoc +++ b/docs_user/modules/openstack-swift_adoption.adoc @@ -4,11 +4,9 @@ = Adopting the Object Storage service -== Limitations - -* The described process does not migrate data from existing nodes yet. Data is - still stored on existing nodes, but is accessed through the Swift proxy - instance running on the OpenShift control plane. +This section only applies if you are using OpenStack Swift as Object Storage +service. If you are using the Object Storage *API* of Ceph RGW this section can +be skipped. 
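+
+If you are unsure which backend provides the Object Storage API in your
+deployment, a quick check (a sketch only; it assumes the `CONTROLLER1_SSH`
+alias defined in a previous step) is to look at the object-store endpoint and
+at the containers running on a controller node:
+
+[source,bash]
+----
+openstack endpoint list --service object-store -c 'Service Name' -c URL
+# Swift proxy containers are only present when Swift itself is deployed
+CONTROLLER1_SSH sudo podman ps --filter name=swift_proxy
+----
+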
== Prerequisites @@ -19,11 +17,9 @@ == Variables No new environmental variables need to be defined, though you use the -`CONTROLLER1_SSH` that was defined in a previous step for the pre-checks. - -== Pre-checks +`CONTROLLER1_SSH` alias that was defined in a previous step. -== Copy over swift.conf file +== Procedure - Swift adoption * Create the `swift-conf` secret, containing the Swift hash path suffix and prefix: + @@ -41,8 +37,6 @@ data: EOF ---- -== Copy existing Swift ring files - * Create the `swift-ring-files` configmap, containing the Swift ring files: + [source,yaml] @@ -57,7 +51,6 @@ binaryData: EOF ---- -== Procedure - Swift adoption * Patch OpenStackControlPlane to deploy Swift: + @@ -148,3 +141,9 @@ openstack object create test obj openstack object save test obj --file - Hello World! ---- + +== Data migration +At this point data is still stored on the previously existing nodes. The +<> + section describes how to migrate the actual data from the old +to the new deployment. diff --git a/docs_user/modules/openstack-swift_migration.adoc b/docs_user/modules/openstack-swift_migration.adoc new file mode 100644 index 000000000..aa09a20ab --- /dev/null +++ b/docs_user/modules/openstack-swift_migration.adoc @@ -0,0 +1,276 @@ +//:context: migrate-object-storage-service + +[id="migrating-the-object-storage-service_{context}"] + +This section only applies if you are using OpenStack Swift as Object Storage +service. If you are using the Object Storage *API* of Ceph RGW this section can +be skipped. + +Data migration to the new deployment might be a long running process that runs +mostly in the background. The Swift replicators will take care of moving data +from old to new nodes, but depending on the amount of used storage this might +take a very long time. You can still use the old nodes as long as they are +running and continue with adopting other services in the meantime, reducing the +amount of downtime. Please note that performance might be decreased to the +amount of replication traffic in the network. + +Migration of the data happens replica by replica. Assuming you start with 3 +replicas, only 1 one them is being moved at any time, ensuring the remaining 2 +replicas are still available and the Swift service is usable during the +migration. + += Overview + +To ensure availability during migration the following steps will be done: + +. Add new nodes to the Swift rings +. Set weights of existing nodes to 0 +. Rebalance rings, moving one replica +. Copy rings to old nodes and restart services +. Check replication status and repeat previous two steps until old nodes are +drained +. Finally remove the old nodes from the rings + += Prerequisites + +* Previous Object Storage Service adoption steps successfully completed. + +== Variables + +No new environmental variables need to be defined, though you use the +`CONTROLLER1_SSH` alias that was defined in a previous step. + += Preliminary steps + +== DNS + +All existing nodes must be able to resolve host names of the OpenShift pods, for example by using the +external IP of the DNSMasq service as name server in `/etc/resolv.conf`: + +[,bash] +---- +oc get service dnsmasq-dns -o jsonpath="{.status.loadBalancer.ingress[0].ip}" | CONTROLLER1_SSH tee /etc/resolv.conf +---- + + +== swift-dispersion + +To track the current status of the replication a tool called `swift-dispersion` +is used. It consists of two parts, a population tool to be run before changing +the Swift rings and a report tool to run afterwards to gather the current +status. 
Run the `swift-dispersion-populate` like this: + +[,bash] +---- +oc debug --keep-labels=true job/swift-ring-rebalance -- /bin/sh -c 'swift-ring-tool get && swift-dispersion-populate' +---- + +The command might need a few minutes until completed. It creates 0-byte objects +distributed across the Swift deployment, and its counter-part +`swift-dispersion-report` can be used afterwards to show the current +replication status. + +The output of the `swift-dispersion-report` command should look like this: + +[,bash] +---- +oc debug --keep-labels=true job/swift-ring-rebalance -- /bin/sh -c 'swift-ring-tool get && swift-dispersion-report' +---- + +[source] +---- +Queried 1024 containers for dispersion reporting, 5s, 0 retries +100.00% of container copies found (3072 of 3072) +Sample represents 100.00% of the container partition space +Queried 1024 objects for dispersion reporting, 4s, 0 retries +There were 1024 partitions missing 0 copies. +100.00% of object copies found (3072 of 3072) +Sample represents 100.00% of the object partition space +---- + += Migrate data + +== Add new nodes +The easiest way is to simply scale up the SwiftStorage resource from 0 to 3. In +that case 3 storage instances using PVCs are created, running on the +OpenShift cluster. + +// TODO add paragraph / link on EDPM node usage for Swift + +[,bash] +---- +oc patch openstackcontrolplane openstack --type=merge -p='{"spec":{"swift":{"template":{"swiftStorage":{"replicas": 3}}}}}' +---- + +Wait until all three pods are running: + +[,bash] +---- +oc wait pods --for condition=Ready -l component=swift-storage +---- + +== Start migration + +You can start to drain the existing nodes now. Get the storage management IP +addresses of the nodes to drain from the current rings: + +[,bash] +---- +oc debug --keep-labels=true job/swift-ring-rebalance -- /bin/sh -c 'swift-ring-tool get && swift-ring-builder object.builder' | tail -n +7 | awk '{print $4}' | sort -u +---- + +The output will look similar to this: + +[source] +---- +172.20.0.100:6200 +swift-storage-0.swift-storage.openstack.svc:6200 +swift-storage-1.swift-storage.openstack.svc:6200 +swift-storage-2.swift-storage.openstack.svc:6200 +---- + +In this case the old node 172.20.0.100 will be drained. Your nodes might be +different, and depending on the deployment there are likely more nodes to be +included in the following commands. + +[,bash] +---- +oc debug --keep-labels=true job/swift-ring-rebalance -- /bin/sh -c ' +swift-ring-tool get +swift-ring-tool drain 172.20.0.100 +swift-ring-tool rebalance +swift-ring-tool push' +---- + +The updated rings need to be copied and applied to the old nodes now. Run the +ssh commands for your existing nodes storing Swift data. +[,bash] +---- +oc extract --confirm cm/swift-ring-files +CONTROLLER1_SSH "tar -C /var/lib/config-data/puppet-generated/swift/etc/swift/ -xzf -" < swiftrings.tar.gz +CONTROLLER1_SSH "systemctl restart tripleo_swift_*" +---- + +You can now track the replication progress by using the +`swift-dispersion-report` tool: + +[,bash] +---- +oc debug --keep-labels=true job/swift-ring-rebalance -- /bin/sh -c "swift-ring-tool get && swift-dispersion-report" +---- + +The output will show less than 100% of copies found, repeat the above command +until both the container and all container and object copies are found: + +[source] +---- +Queried 1024 containers for dispersion reporting, 6s, 0 retries +There were 5 partitions missing 1 copy. 
+99.84% of container copies found (3067 of 3072)
+Sample represents 100.00% of the container partition space
+Queried 1024 objects for dispersion reporting, 7s, 0 retries
+There were 739 partitions missing 1 copy.
+There were 285 partitions missing 0 copies.
+75.94% of object copies found (2333 of 3072)
+Sample represents 100.00% of the object partition space
+----
+
+== Move all replicas
+Once all container and object copies are found, it is time to move the next
+replica to the new nodes. To do so, rebalance and distribute the rings again:
+
+[,bash]
+----
+oc debug --keep-labels=true job/swift-ring-rebalance -- /bin/sh -c '
+swift-ring-tool get
+swift-ring-tool rebalance
+swift-ring-tool push'
+
+oc extract --confirm cm/swift-ring-files
+CONTROLLER1_SSH "tar -C /var/lib/config-data/puppet-generated/swift/etc/swift/ -xzf -" < swiftrings.tar.gz
+CONTROLLER1_SSH "systemctl restart tripleo_swift_*"
+----
+
+Monitor the `swift-dispersion-report` output again, wait until all copies are
+found again, and repeat the above step until all your replicas are moved to the
+new nodes.
+
+= Final checks
+
+Even if all replicas are already on the new nodes and the
+`swift-dispersion-report` command reports 100% of the copies found, there might
+still be data on the old nodes. This data is removed by the replicators, but it
+might take some more time.
+
+You can check the disk usage of all disks in the cluster using the following
+command:
+
+[,bash]
+----
+oc debug --keep-labels=true job/swift-ring-rebalance -- /bin/sh -c 'swift-ring-tool get && swift-recon -d'
+----
+
+Eventually your existing nodes will be drained and there should
+be no more `\*.db` or `*.data` files in the directory `/srv/node` on these
+nodes:
+
+[,bash]
+----
+CONTROLLER1_SSH "find /srv/node/ -type f -name '*.db' -o -name '*.data' | wc -l"
+----
+
+= Remove old nodes
+
+Once the nodes are drained, they should be removed from the rings using the
+following commands:
+
+[,bash]
+----
+oc debug --keep-labels=true job/swift-ring-rebalance -- /bin/sh -c '
+swift-ring-tool get
+swift-ring-tool remove 172.20.0.100
+swift-ring-tool rebalance
+swift-ring-tool push'
+----
+
+= Troubleshooting
+
+The following commands might be helpful for debugging if the replication is not
+working and the `swift-dispersion-report` does not get back to 100% availability.
+ + +[,bash] +---- +CONTROLLER1_SSH tail /var/log/containers/swift/swift.log | grep object-server +---- + +This should show progress by the replicators, for example like this: +[source] +---- +Mar 14 06:05:30 standalone object-server[652216]: &1 || echo PASS register: nova_verify_stop_result - until: nova_verify_stop_result is success + until: + - ("FAIL" not in nova_verify_stop_result.stdout_lines) + - ("it is in power state shutdown" in nova_verify_stop_result.stdout) retries: 10 delay: 6 - name: verify if Nova services can start the existing test VM instance ansible.builtin.shell: | - {{ shell_header }} {{ nova_header }} - ${BASH_ALIASES[openstack]} server list | grep -qF '| test | SHUTOFF |' && ${BASH_ALIASES[openstack]} server start test - ${BASH_ALIASES[openstack]} server list | grep -F '| test | ACTIVE |' - ${BASH_ALIASES[openstack]} server --os-compute-api-version 2.48 show --diagnostics test --fit-width -f json | jq -r '.state' | grep running + ${BASH_ALIASES[openstack]} server list | grep -qF '| test | SHUTOFF |' && ${BASH_ALIASES[openstack]} server start test || echo PASS + ${BASH_ALIASES[openstack]} server list | grep -F '| test | ACTIVE |' && \ + ${BASH_ALIASES[openstack]} server --os-compute-api-version 2.48 show --diagnostics test --fit-width -f json | jq -r '.state' | grep running || echo FAIL register: nova_verify_start_result - until: nova_verify_start_result is success + until: ("FAIL" not in nova_verify_start_result.stdout_lines) retries: 60 delay: 6 diff --git a/tests/roles/glance_adoption/files/glance_ceph.yaml b/tests/roles/glance_adoption/files/glance_ceph.yaml index 7987b37bf..7a429ccfd 100644 --- a/tests/roles/glance_adoption/files/glance_ceph.yaml +++ b/tests/roles/glance_adoption/files/glance_ceph.yaml @@ -3,6 +3,7 @@ spec: enabled: true template: databaseInstance: openstack + databaseAccount: glance customServiceConfig: | [DEFAULT] enabled_backends=default_backend:rbd diff --git a/tests/roles/glance_adoption/files/glance_local.yaml b/tests/roles/glance_adoption/files/glance_local.yaml index d088efda6..2fd84f8c3 100644 --- a/tests/roles/glance_adoption/files/glance_local.yaml +++ b/tests/roles/glance_adoption/files/glance_local.yaml @@ -3,6 +3,7 @@ spec: enabled: true template: databaseInstance: openstack + databaseAccount: glance customServiceConfig: | [DEFAULT] enabled_backends = default_backend:file diff --git a/tests/roles/heat_adoption/tasks/main.yaml b/tests/roles/heat_adoption/tasks/main.yaml index fc34eae63..f239caff9 100644 --- a/tests/roles/heat_adoption/tasks/main.yaml +++ b/tests/roles/heat_adoption/tasks/main.yaml @@ -22,6 +22,7 @@ route: {} template: databaseInstance: openstack + databaseAccount: heat secret: osp-secret memcachedInstance: memcached passwordSelectors: diff --git a/tests/roles/manila_adoption/tasks/main.yaml b/tests/roles/manila_adoption/tasks/main.yaml index b54187a7a..f58892b2f 100644 --- a/tests/roles/manila_adoption/tasks/main.yaml +++ b/tests/roles/manila_adoption/tasks/main.yaml @@ -11,6 +11,7 @@ route: {} template: databaseInstance: openstack + databaseAccount: manila manilaAPI: customServiceConfig: | [DEFAULT] diff --git a/tests/roles/neutron_adoption/tasks/main.yaml b/tests/roles/neutron_adoption/tasks/main.yaml index 6e0309095..c61956192 100644 --- a/tests/roles/neutron_adoption/tasks/main.yaml +++ b/tests/roles/neutron_adoption/tasks/main.yaml @@ -20,6 +20,7 @@ spec: type: LoadBalancer databaseInstance: openstack + databaseAccount: neutron secret: osp-secret networkAttachments: - internalapi diff --git 
a/tests/roles/placement_adoption/tasks/main.yaml b/tests/roles/placement_adoption/tasks/main.yaml index 864a2da66..3c75ea6d2 100644 --- a/tests/roles/placement_adoption/tasks/main.yaml +++ b/tests/roles/placement_adoption/tasks/main.yaml @@ -10,6 +10,7 @@ route: {} template: databaseInstance: openstack + databaseAccount: placement secret: osp-secret override: service: diff --git a/tests/roles/swift_migration/meta/main.yaml b/tests/roles/swift_migration/meta/main.yaml new file mode 100644 index 000000000..610f184fb --- /dev/null +++ b/tests/roles/swift_migration/meta/main.yaml @@ -0,0 +1,2 @@ +dependencies: + - role: common_defaults diff --git a/tests/roles/swift_migration/tasks/main.yaml b/tests/roles/swift_migration/tasks/main.yaml new file mode 100644 index 000000000..ac6aeb2df --- /dev/null +++ b/tests/roles/swift_migration/tasks/main.yaml @@ -0,0 +1,105 @@ +- name: setup nameserver on standalone node + ansible.builtin.shell: | + {{ shell_header }} + {{ oc_header }} + CONTROLLER1_SSH="{{ controller1_ssh }}" + echo "nameserver $(oc get service dnsmasq-dns -o jsonpath='{.status.loadBalancer.ingress[0].ip}')" | $CONTROLLER1_SSH tee /etc/resolv.conf + +- name: run swift-dispersion-populate + ansible.builtin.shell: | + {{ oc_header }} + oc debug --keep-labels=true job/swift-ring-rebalance -- /bin/sh -c 'swift-ring-tool get && swift-dispersion-populate' + +- name: start swift-storage instances + ansible.builtin.shell: | + {{ oc_header }} + oc patch openstackcontrolplane openstack --type=merge -p='{"spec":{"swift":{"template":{"swiftStorage":{"replicas": 3}}}}}' + +- name: wait until all pods are ready + ansible.builtin.shell: | + {{ oc_header }} + oc wait pods --for condition=Ready -l component=swift-storage + +- name: set standalone node weight to 0 in swift rings + ansible.builtin.shell: | + {{ oc_header }} + oc debug --keep-labels=true job/swift-ring-rebalance -- /bin/sh -c ' + swift-ring-tool get + swift-ring-tool drain 172.20.0.100 + swift-ring-tool forced_rebalance + swift-ring-tool push' + +- name: push rings to standalone and restart swift services + ansible.builtin.shell: | + {{ shell_header }} + {{ oc_header }} + CONTROLLER1_SSH="{{ controller1_ssh }}" + oc extract --confirm cm/swift-ring-files + $CONTROLLER1_SSH "tar -C /var/lib/config-data/puppet-generated/swift/etc/swift/ -xzf -" < swiftrings.tar.gz + $CONTROLLER1_SSH "systemctl restart tripleo_swift_*" + +- name: wait until all replicas are 100% available after first rebalance + ansible.builtin.shell: | + {{ oc_header }} + timeout 900s bash -c 'until oc debug --keep-labels=true job/swift-ring-rebalance -- /bin/sh -c "swift-ring-tool get && swift-dispersion-report" | grep -q "100.00% of object copies found" ; do sleep 60; done' + +- name: rebalance rings second time + ansible.builtin.shell: | + {{ oc_header }} + oc debug --keep-labels=true job/swift-ring-rebalance -- /bin/sh -c ' + swift-ring-tool get + swift-ring-tool forced_rebalance + swift-ring-tool push' + +- name: push rings to standalone and restart swift services + ansible.builtin.shell: | + {{ shell_header }} + {{ oc_header }} + CONTROLLER1_SSH="{{ controller1_ssh }}" + oc extract --confirm cm/swift-ring-files + $CONTROLLER1_SSH "tar -C /var/lib/config-data/puppet-generated/swift/etc/swift/ -xzf -" < swiftrings.tar.gz + $CONTROLLER1_SSH "systemctl restart tripleo_swift_*" + +- name: wait until all replicas are 100% available after second rebalance + ansible.builtin.shell: | + {{ oc_header }} + timeout 900s bash -c 'until oc debug --keep-labels=true 
job/swift-ring-rebalance -- /bin/sh -c "swift-ring-tool get && swift-dispersion-report" | grep -q "100.00% of object copies found" ; do sleep 60; done' + +- name: rebalance rings third time + ansible.builtin.shell: | + {{ oc_header }} + oc debug --keep-labels=true job/swift-ring-rebalance -- /bin/sh -c ' + swift-ring-tool get + swift-ring-tool forced_rebalance + swift-ring-tool push' + +- name: push rings to standalone and restart swift services + ansible.builtin.shell: | + {{ shell_header }} + {{ oc_header }} + CONTROLLER1_SSH="{{ controller1_ssh }}" + oc extract --confirm cm/swift-ring-files + $CONTROLLER1_SSH "tar -C /var/lib/config-data/puppet-generated/swift/etc/swift/ -xzf -" < swiftrings.tar.gz + $CONTROLLER1_SSH "systemctl restart tripleo_swift_*" + +- name: wait until all replicas are 100% available after third rebalance + ansible.builtin.shell: | + {{ oc_header }} + timeout 900s bash -c 'until oc debug --keep-labels=true job/swift-ring-rebalance -- /bin/sh -c "swift-ring-tool get && swift-dispersion-report" | grep -q "100.00% of object copies found" ; do sleep 60; done' + +- name: wait until /srv/node on standalone is drained + ansible.builtin.shell: | + {{ shell_header }} + CONTROLLER1_SSH="{{ controller1_ssh }}" + $CONTROLLER1_SSH "timeout 900s bash -c 'while \$(find /srv/node/ -type f -name \"*.db\" -o -name \"*.data\" | grep -q \".\"); do sleep 5; done'" + +- name: remove standalone node from rings + ansible.builtin.shell: | + {{ shell_header }} + {{ oc_header }} + CONTROLLER1_SSH="{{ controller1_ssh }}" + oc debug --keep-labels=true job/swift-ring-rebalance -- /bin/sh -c ' + swift-ring-tool get + swift-ring-tool remove 172.20.0.100 + swift-ring-tool rebalance + swift-ring-tool push'
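+
+# Optional follow-up check (a sketch, left commented out and not part of the
+# documented procedure): it reuses the same example address 172.20.0.100 as the
+# tasks above and fails if the removed node still shows up in the object ring
+# fetched from the configmap.
+#
+# - name: verify standalone node is no longer in the object ring
+#   ansible.builtin.shell: |
+#     {{ oc_header }}
+#     oc debug --keep-labels=true job/swift-ring-rebalance -- /bin/sh -c 'swift-ring-tool get && swift-ring-builder object.builder' | grep -F 172.20.0.100 && exit 1 || exit 0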