diff --git a/tests/ansible.cfg b/tests/ansible.cfg
index 64a2c3867..6c3a9167c 100644
--- a/tests/ansible.cfg
+++ b/tests/ansible.cfg
@@ -1,3 +1,5 @@
 [defaults]
 callback_whitelist = profile_json,profile_tasks
 roles_path = ./roles
+module_utils=./plugins/module_utils
+library=./plugins/modules
diff --git a/tests/playbooks/test_externalize_ceph.yaml b/tests/playbooks/test_externalize_ceph.yaml
index e5dd8e53c..75fc19332 100644
--- a/tests/playbooks/test_externalize_ceph.yaml
+++ b/tests/playbooks/test_externalize_ceph.yaml
@@ -75,3 +75,11 @@
         tasks_from: rbd
       tags:
         - ceph_rbd
+
+    # Dump Ceph Cluster logs to file
+    - name: POST
+      ansible.builtin.import_role:
+        name: ceph_migrate
+        tasks_from: post
+      tags:
+        - ceph_post
diff --git a/tests/roles/ceph_migrate/tasks/ceph_load.yaml b/tests/roles/ceph_migrate/tasks/ceph_load.yaml
index 55d08966b..0df8c8d52 100644
--- a/tests/roles/ceph_migrate/tasks/ceph_load.yaml
+++ b/tests/roles/ceph_migrate/tasks/ceph_load.yaml
@@ -20,6 +20,13 @@
     - ceph.conf
     - ceph.client.admin.keyring
 
+- name: Ensure backup directory exists
+  when: dump | default(false)
+  ansible.builtin.file:
+    path: "{{ ceph_config_tmp_client_home }}/logs"
+    state: directory
+    mode: '0755'
+
 # CEPH HEALTH
 - name: Ceph Health
   block:
@@ -32,7 +39,12 @@
     - name: Load ceph data
       ansible.builtin.set_fact:
        ceph: "{{ ceph.stdout | from_json }}"
-
+    - name: Dump ceph -s output to log file
+      when: dump | default(false)
+      ansible.builtin.copy:
+        content: "{{ ceph }}"
+        dest: "{{ ceph_config_tmp_client_home }}/logs/ceph_health.log"
+        mode: '0644'
 
 # CEPH SERVICE MAP
 - name: Ceph Service Map
@@ -43,15 +55,25 @@
       ansible.builtin.command: |
         cephadm shell -- ceph orch ls -f json
       register: smap
+
     - name: Load Service Map
       ansible.builtin.set_fact:
        servicemap: "{{ smap.stdout | from_json }}"
+
     - name: Print Service Map
       when: debug | default(false)
       ansible.builtin.debug:
        msg: "{{ item.service_type }} - {{ item.placement }}"
      loop: "{{ servicemap | flatten(levels=1) }}"
+
+    # Dump the service map to a log file
+    - name: Dump ceph orch ls output to log file
+      when: dump | default(false)
+      ansible.builtin.copy:
+        content: "{{ servicemap }}"
+        dest: "{{ ceph_config_tmp_client_home }}/logs/ceph_orch_ls.log"
+        mode: '0644'
+
 # CEPH CONFIG DUMP
 - name: Ceph Config Dump
   block:
@@ -61,11 +83,19 @@
       ansible.builtin.command: |
         cephadm shell -- ceph config dump -f json
       register: ceph_conf
+
     - name: Print Ceph config dump
       when: debug | default(false)
       ansible.builtin.debug:
-        msg: "{{ item.name }} - {{ item.value }}"
-      loop: "{{ ceph_confdump | flatten(levels=1) }}"
+        msg: "{{ ceph_conf.stdout | from_json }}"
+
+    # Dump config to a log file
+    - name: Dump ceph config dump output to log file
+      when: dump | default(false)
+      ansible.builtin.copy:
+        content: "{{ ceph_conf.stdout | from_json }}"
+        dest: "{{ ceph_config_tmp_client_home }}/logs/ceph_config_dump.log"
+        mode: '0644'
 
 
 # CEPH HOSTMAP LIST
@@ -93,6 +123,12 @@
       ansible.builtin.debug:
        msg: "{{ item }}"
      loop: "{{ hostmap | flatten(levels=1) }}"
+    - name: Dump ceph orch host ls output to log file
+      when: dump | default(false)
+      ansible.builtin.copy:
+        content: "{{ hostmap | flatten(levels=1) }}"
+        dest: "{{ ceph_config_tmp_client_home }}/logs/ceph_orch_host_ls.log"
+        mode: '0644'
 
 # CEPH MON DUMP
 - name: Ceph Mon dump
@@ -109,6 +145,12 @@
       when: debug | default(false)
       ansible.builtin.debug:
        msg: "{{ mons.stdout | from_json }}"
+    - name: Dump ceph mon dump output to log file
+      when: dump | default(false)
+      ansible.builtin.copy:
+        content: "{{ mon_dump }}"
+        dest: "{{ ceph_config_tmp_client_home }}/logs/ceph_mon_dump.log"
+        mode: '0644'
}}/logs/ceph_mon_dump.log" + mode: '0644' # We assume that nodes to decommission are where the mon label is present, and # the target_nodes are already enrolled in the Ceph cluster: this way we can diff --git a/tests/roles/ceph_migrate/tasks/drain.yaml b/tests/roles/ceph_migrate/tasks/drain.yaml index 111671f5e..790d83ca9 100644 --- a/tests/roles/ceph_migrate/tasks/drain.yaml +++ b/tests/roles/ceph_migrate/tasks/drain.yaml @@ -6,8 +6,23 @@ ceph_fsid: "{{ mon_dump.fsid }}" ceph_cluster: ceph +# Check if mon even exists before removing it +- name: MON - wait daemons + ansible.builtin.command: "{{ ceph_cli }} orch ps --daemon_type mon --daemon_id {{ daemon_id }} -f json" + register: psm + vars: + daemon_id: "{{ host.split('.')[0] }}" + +- name: DRAIN - Delete the mon running on the current controller node + when: psm.stdout | from_json | community.general.json_query('[*].daemon_name') | length > 0 + delegate_to: "{{ host.split('.')[0] }}.ctlplane" + become: true + ansible.builtin.command: + "{{ ceph_cli }} orch daemon rm mon.{{ host.split('.')[0] }} --force" + # Remove labels from the src node - name: DRAIN - remove label from the src node + when: psm.stdout | from_json | community.general.json_query('[*].daemon_name') | length > 0 ansible.builtin.include_tasks: labels.yaml vars: nodes: @@ -18,25 +33,27 @@ - "mgr" - "_admin" -- name: DRAIN - Delete the mon running on the current controller node - delegate_to: "{{ host.split('.')[0] }}.ctlplane" - become: true - ansible.builtin.command: - "{{ ceph_cli }} orch daemon rm mon.{{ host.split('.')[0] }} --force" - # ignore_errors: true - -# Sleep before moving to the next mon +# Sleep before moving to the next task - name: Pause ansible.builtin.pause: - seconds: "{{ ceph_timeout }}" + seconds: "{{ ceph_wait_mon_timeout }}" - name: DRAIN - Drain the host + when: psm.stdout | from_json | community.general.json_query('[*].daemon_name') | length > 0 become: true delegate_to: "{{ host.split('.')[0] }}.ctlplane" ansible.builtin.command: "{{ ceph_cli }} orch host drain {{ host }}" # Cleanup leftovers from the controller node +# This task is delegated to the node that is going to be decommissioned, and +# as per the previous tasks this node has no labels, and /etc/ceph is empty, +# which means that cephadm can't really reach the Ceph cluster. +# The rm-cluster represents a shortcut to remove the leftovers from the drain +# command: instead of deleting containers with podman, and cleaning manually +# up /var/lib/ceph, we run rm-cluster that does all these actions. +# This command is not documented, and leftovers on the node don't create any +# problem. 
 - name: DRAIN - cleanup the host
   delegate_to: "{{ host.split('.')[0] }}.ctlplane"
   become: true
@@ -44,3 +61,19 @@
     "cephadm rm-cluster --fsid {{ ceph_fsid }} --force"
   vars:
     ceph_fsid: "{{ mon_dump.fsid }}"
+
+- name: MON - Remove host from the Ceph hostmap
+  block:
+    # Check if the host is still part of the hostmap before removing it
+    - name: MON - check host in hostmap
+      ansible.builtin.command: "{{ ceph_cli }} orch host ls --host_pattern {{ host_id }} -f json"
+      register: lsh
+      vars:
+        host_id: "{{ cur_mon.split('.')[0] }}"
+
+    # The node should be empty at this point, so remove it from the Ceph cluster
+    - name: MON - rm the cur_mon host from the Ceph cluster
+      when: lsh.stdout | from_json | community.general.json_query('[*].hostname') | length > 0
+      become: true
+      ansible.builtin.command:
+        "{{ ceph_cli }} orch host rm {{ cur_mon }} --force"
diff --git a/tests/roles/ceph_migrate/tasks/mds.yaml b/tests/roles/ceph_migrate/tasks/mds.yaml
index 4dd402342..d9100205c 100644
--- a/tests/roles/ceph_migrate/tasks/mds.yaml
+++ b/tests/roles/ceph_migrate/tasks/mds.yaml
@@ -64,6 +64,10 @@
       msg: "{{ spc }}"
   when: debug | default(false)
 
+- name: Wait for the orchestrator to process the spec
+  ansible.builtin.pause:
+    seconds: "{{ ceph_timeout }}"
+
 - name: Reload MdsMap
   # cephadm runs w/ root privileges
   become: true
@@ -124,3 +128,7 @@
   vars:
     daemon: mds
     daemon_id:
+
+- name: Sleep before moving to the next phase
+  ansible.builtin.pause:
+    seconds: "{{ ceph_timeout }}"
diff --git a/tests/roles/ceph_migrate/tasks/mon.yaml b/tests/roles/ceph_migrate/tasks/mon.yaml
index b713eb7b3..0e3c70140 100644
--- a/tests/roles/ceph_migrate/tasks/mon.yaml
+++ b/tests/roles/ceph_migrate/tasks/mon.yaml
@@ -37,12 +37,14 @@
   # references to the old mon daemon coming from the drained node
   until: (monmap.stdout | from_json | community.general.json_query('monmap.num_mons') | int) >= ((decomm_nodes |default([]) | length | int) | default(3))
   loop_control:
-    label: "check mons quorum"
+    label: "MON - check mons quorum"
   tags:
     - ceph_mon_quorum
 
 - name: Backup data for client purposes
   delegate_to: "{{ cur_mon.split('.')[0] }}.ctlplane"
+  tags:
+    - ceph_backup
   block:
     - name: Ensure backup directory exists
       ansible.builtin.file:
@@ -59,12 +61,10 @@
       become: true
       ansible.builtin.copy:
         remote_src: true
-        src: "/etc/ceph/{{ item }}"
+        src: "{{ item.path }}"
         dest: "{{ ceph_config_tmp_client_home }}"
         mode: '0644'
-      loop:
-        - ceph.conf
-        - ceph.client.admin.keyring
+      loop: "{{ dir_ceph_files.files }}"
 
 # Before draining the current node, migrate the active mgr on a different
 # _admin host
@@ -102,21 +102,7 @@
   tags:
     - ceph_drain
 
-# The node should be empty at this point, let's remove it from the Ceph
-# cluster
-- name: MON - rm the cur_mon host from the Ceph cluster
-  # when: cur_mon in decomm_nodes
-  become: true
-  ansible.builtin.command:
-    "{{ ceph_cli }} orch host rm {{ cur_mon }} --force"
-  # let's ignore this for now (for idempotency purposes)
-  # TODO: have a good condition here instead of ignore_errors
-  # ignore_errors: true
-  tags:
-    - ceph_drain
-
 - name: MON - Get current mon IP address
-  # when: cur_mon in decomm_nodes
   ansible.builtin.set_fact:
     mon_ipaddr: "{{ mon_ip | split(':') | first | ansible.utils.ipaddr }}"
   vars:
@@ -132,7 +118,6 @@
   # if no mon addr, this variable is False and the whole block is skipped
   # because there's no network related action that should be performed
   when:
-    # - cur_mon in decomm_nodes
     - mon_ipaddr | default('')
   block:
     - name: MON - Get current mon IP address
@@ -184,7 +169,7 @@
   # addresses, it might happen that the mon is deployed using the right IP.
   # For this reason we need to redeploy it by rm + add (as redeploy does not
   # accept an IP as input
-  - name: MON - wait for mon
+  - name: MON - Check quorum
     become: true
     ansible.builtin.command: "{{ ceph_cli }} mon stat -f json"
     register: monstat
@@ -192,7 +177,7 @@
     delay: 3
     until: "'{{ target_node.split('.')[0] }}' in monstat.stdout | from_json | community.general.json_query('quorum[*].name') | default([]) | list"
     loop_control:
-      label: "MON - wait for mon"
+      label: "MON - Check quorum"
 
   # Even though we explicitly redeploy a given mon using the host:ip format,
   # it is possible that the orchestrator (who owns the process and the spec)
@@ -227,17 +212,35 @@
       ansible.builtin.debug:
         msg: "{{ spc }}"
 
+    # Check if mon even exists before removing it
+    - name: MON - Get tmp mon
+      ansible.builtin.command: "{{ ceph_cli }} orch ps --daemon_type mon --daemon_id {{ daemon_id }} -f json"
+      register: psmon
+      vars:
+        daemon_id: "{{ target_node.split('.')[0] }}"
+
     - name: MON - Delete the running mon
+      when: psmon.stdout | from_json | community.general.json_query('[*].daemon_name') | length > 0
       become: true
-      ansible.builtin.command:
-        "{{ ceph_cli }} orch daemon rm mon.{{ target_node.split('.')[0] }} --force"
-      # TODO: replace ignore_errors with a condition
-      # ignore_errors: true
+      ansible.builtin.command: "{{ ceph_cli }} orch daemon rm mon.{{ target_node.split('.')[0] }} --force"
+      until: '"Removed" in rmmon.stdout'
+      register: rmmon
+      retries: 20
+      delay: 3
+      loop_control:
+        label: "MON - Delete the running mon"
 
-    - name: Wait for the spec to be updated
+    - name: Wait for the current mon to be deleted
       ansible.builtin.pause:
         seconds: "{{ ceph_wait_mon_timeout }}"
 
+    # Check that the mon is gone before re-adding it
+    - name: MON - Check there is no mon on {{ target_node }}
+      ansible.builtin.command: "{{ ceph_cli }} orch ps --daemon_type mon --daemon_id {{ daemon_id }} -f json"
+      register: psmon
+      vars:
+        daemon_id: "{{ target_node.split('.')[0] }}"
+
     - name: MON - Redeploy mon on {{ target_node }}
       when: debug | default(true)
       ansible.builtin.debug:
         msg:
@@ -247,6 +250,7 @@
       become: true
       when:
         - mon_ipaddr | default('')
+        - psmon.stdout | from_json | community.general.json_query('[*].daemon_name') | length == 0
       ansible.builtin.command:
         "{{ ceph_cli }} orch daemon add mon {{ target_node.split('.')[0] }}:{{ mon_ipaddr }}"
@@ -264,7 +268,7 @@
   # references to the old mon daemon coming from the drained node
   until: (monmap.stdout | from_json | community.general.json_query('monmap.num_mons') | int) >= ((decomm_nodes |default([]) | length | int) | default(3))
   loop_control:
-    label: "check mons quorum"
+    label: "MON - check mons quorum"
   tags:
     - ceph_mon_quorum
diff --git a/tests/roles/ceph_migrate/tasks/monitoring.yaml b/tests/roles/ceph_migrate/tasks/monitoring.yaml
index a6fcbebfa..ed7520b11 100644
--- a/tests/roles/ceph_migrate/tasks/monitoring.yaml
+++ b/tests/roles/ceph_migrate/tasks/monitoring.yaml
@@ -100,3 +100,7 @@
     - grafana
     - prometheus
     - alertmanager
+
+- name: Sleep before moving to the next daemon
+  ansible.builtin.pause:
+    seconds: "{{ ceph_timeout }}"
diff --git a/tests/roles/ceph_migrate/tasks/nfs.yaml b/tests/roles/ceph_migrate/tasks/nfs.yaml
new file mode 100644
index 000000000..68295c553
--- /dev/null
+++ b/tests/roles/ceph_migrate/tasks/nfs.yaml
@@ -0,0 +1,25 @@
+# Get a fresh ceph_cli
+- name: Get ceph_cli
+  ansible.builtin.include_tasks: ceph_cli.yaml
+  vars:
+    ceph_fsid: "{{ mon_dump.fsid }}"
+    ceph_cluster: ceph
+
+# Add nfs labels to the target nodes where the cluster should be deployed
+- name: NFS - Setup NFS label to the target node
+  ansible.builtin.include_tasks: labels.yaml
+  vars:
+    nodes: "{{ hostmap.keys() | difference(decomm_nodes) }}"
+    act: "add"
+    labels:
+      - "nfs"
+# waiting for https://github.com/ceph/ceph/pull/53108
+# to appear in the next Ceph container build
+# disabling this task by default for now
+- name: Create NFS Ganesha Cluster
+  become: true
+  ansible.builtin.command: |
+    {{ ceph_cli }} nfs cluster create {{ cephfs_name }} \
+    --ingress --virtual-ip={{ ceph_nfs_vip }} \
+    --ingress-mode=haproxy-protocol '--placement=label=nfs'
+  changed_when: false
diff --git a/tests/roles/ceph_migrate/tasks/post.yaml b/tests/roles/ceph_migrate/tasks/post.yaml
new file mode 100644
index 000000000..cb4a54353
--- /dev/null
+++ b/tests/roles/ceph_migrate/tasks/post.yaml
@@ -0,0 +1,7 @@
+# Dump logs of the Ceph cluster daemons
+- name: POST - Dump logs
+  ansible.builtin.include_tasks: ceph_load.yaml
+  vars:
+    dump: true
+  tags:
+    - ceph_dump
diff --git a/tests/roles/ceph_migrate/tasks/rgw.yaml b/tests/roles/ceph_migrate/tasks/rgw.yaml
index c859b432e..f1201b3fb 100644
--- a/tests/roles/ceph_migrate/tasks/rgw.yaml
+++ b/tests/roles/ceph_migrate/tasks/rgw.yaml
@@ -57,15 +57,6 @@
     ansible.builtin.debug:
       msg: "{{ rgw_spec }}"
 
-# - name: Apply ceph rgw keystone config
-#   #become: true
-#   ansible.builtin.command: |
-#     echo "{{ ceph_cli }} config set global rgw_keystone_url {{ ceph_keystone_ep }}"
-#   changed_when: false
-#   when:
-#     - ceph_keystone_ep is defined
-#     - ceph_keystone_ep | length > 0
-
 # Update and apply the spec: it will update the Monitoring Stack deployment,
 # and place daemons on the target nodes
 - name: Update the RGW spec definition
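
For reference, a minimal way to exercise the new log-dump path once this change is applied; the playbook path and the ceph_post tag come from the diff above, while the inventory file name is only a placeholder for whatever the local test setup provides:

    cd tests
    # Run only the POST step: it imports ceph_load.yaml with dump=true and
    # writes the ceph_*.log files under "{{ ceph_config_tmp_client_home }}/logs"
    ansible-playbook -i inventory playbooks/test_externalize_ceph.yaml --tags ceph_post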