Skip to content

Commit

Permalink
Make mon redeploy more solid and add nfs and log dump
Browse files Browse the repository at this point in the history
This patch does a few things:
1. it makes the mon daemon deletion and redeployment more robust
2. it adds a post.yaml plugged into the main playbook: right now it only
   contains the log dump, and ceph_load.yaml is reused for this purpose
3. it adds a pause after each daemon migration
4. it makes drain.yaml more robust and removes most of the ignore_errors
   previously set
5. it adds nfs.yaml, which can be called to deploy a ceph_nfs cluster
   during the manila migration.

Signed-off-by: Francesco Pantano <[email protected]>
  • Loading branch information
fmount committed May 15, 2024
1 parent 4fafe4f commit 1780efe
Show file tree
Hide file tree
Showing 8 changed files with 159 additions and 40 deletions.
8 changes: 8 additions & 0 deletions tests/playbooks/test_externalize_ceph.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,11 @@
tasks_from: rbd
tags:
- ceph_rbd

# Post-migration step: run the "post" task file of the ceph_migrate role,
# which dumps the Ceph cluster logs to file
- name: POST
  tags:
    - ceph_post
  ansible.builtin.import_role:
    name: ceph_migrate
    tasks_from: post
48 changes: 45 additions & 3 deletions tests/roles/ceph_migrate/tasks/ceph_load.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@
- ceph.conf
- ceph.client.admin.keyring

# Only runs when a dump is requested: creates the "logs" directory that the
# dump tasks in this file write into
- name: Ensure backup directory exists
  ansible.builtin.file:
    state: directory
    path: "{{ ceph_config_tmp_client_home }}/logs"
    mode: '0755'
  when: dump | default(false)

# CEPH HEALTH
- name: Ceph Health
block:
Expand All @@ -32,7 +39,12 @@
# Overwrite "ceph" with the parsed JSON found in its stdout, so that later
# tasks work on structured data
# (presumably "ceph" holds a registered command result at this point; confirm)
- name: Load ceph data
  ansible.builtin.set_fact:
    ceph: "{{ ceph.stdout | from_json }}"

# Dump mode only: persist the parsed "ceph" fact to logs/ceph_health.log
- name: Dump ceph -s output to log file
  ansible.builtin.copy:
    dest: "{{ ceph_config_tmp_client_home }}/logs/ceph_health.log"
    content: "{{ ceph }}"
    mode: '0644'
  when: dump | default(false)

# CEPH SERVICE MAP
- name: Ceph Service Map
Expand All @@ -43,15 +55,25 @@
ansible.builtin.command: |
cephadm shell -- ceph orch ls -f json
register: smap

# Parse the JSON printed by "ceph orch ls" (registered as smap) into the
# servicemap fact
- name: Load Service Map
  ansible.builtin.set_fact:
    servicemap: "{{ smap.stdout | from_json }}"

# Debug aid: print the service type and placement of each servicemap entry
- name: Print Service Map
  ansible.builtin.debug:
    msg: "{{ item.service_type }} - {{ item.placement }}"
  loop: "{{ servicemap | flatten(levels=1) }}"
  when: debug | default(false)

# Dump mode only: persist the service map to logs/ceph_orch_ls.log
- name: Dump ceph orch ls output to log file
  ansible.builtin.copy:
    dest: "{{ ceph_config_tmp_client_home }}/logs/ceph_orch_ls.log"
    content: "{{ servicemap }}"
    mode: '0644'
  when: dump | default(false)

# CEPH CONFIG DUMP
- name: Ceph Config Dump
block:
Expand All @@ -61,11 +83,19 @@
ansible.builtin.command: |
cephadm shell -- ceph config dump -f json
register: ceph_conf

# Debug aid: print the parsed output of "ceph config dump".
# ceph_conf is the registered result of the command task, so the JSON text to
# parse is its stdout (the dump-to-file task in this file reads it the same
# way). The task carries a single msg and no loop: a mapping must not repeat a
# key, and the old per-item loop iterated over ceph_confdump, a variable that
# is not set by the tasks shown here.
- name: Print Ceph config dump
  when: debug | default(false)
  ansible.builtin.debug:
    msg: "{{ ceph_conf.stdout | from_json }}"

# Dump mode only: persist the parsed "ceph config dump" output to
# logs/ceph_config_dump.log
- name: Dump ceph config dump output to log file
  ansible.builtin.copy:
    dest: "{{ ceph_config_tmp_client_home }}/logs/ceph_config_dump.log"
    content: "{{ ceph_conf.stdout | from_json }}"
    mode: '0644'
  when: dump | default(false)


# CEPH HOSTMAP LIST
Expand Down Expand Up @@ -93,6 +123,12 @@
ansible.builtin.debug:
msg: "{{ item }}"
loop: "{{ hostmap | flatten(levels=1) }}"
# Dump mode only: persist the (one level flattened) hostmap to
# logs/ceph_orch_host_ls.log
- name: Dump ceph orch host ls output to log file
  ansible.builtin.copy:
    dest: "{{ ceph_config_tmp_client_home }}/logs/ceph_orch_host_ls.log"
    content: "{{ hostmap | flatten(levels=1) }}"
    mode: '0644'
  when: dump | default(false)

# CEPH MON DUMP
- name: Ceph Mon dump
Expand All @@ -109,6 +145,12 @@
when: debug | default(false)
ansible.builtin.debug:
msg: "{{ mons.stdout | from_json }}"
# Dump mode only: persist the mon_dump variable to logs/ceph_mon_dump.log
- name: Dump ceph mon dump output to log file
  ansible.builtin.copy:
    dest: "{{ ceph_config_tmp_client_home }}/logs/ceph_mon_dump.log"
    content: "{{ mon_dump }}"
    mode: '0644'
  when: dump | default(false)

# We assume that nodes to decommission are where the mon label is present, and
# the target_nodes are already enrolled in the Ceph cluster: this way we can
Expand Down
43 changes: 34 additions & 9 deletions tests/roles/ceph_migrate/tasks/drain.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,23 @@
ceph_fsid: "{{ mon_dump.fsid }}"
ceph_cluster: ceph

# Ask the orchestrator whether a mon daemon exists for the short hostname of
# the node being drained; the registered result (psm) gates the removal tasks
# that follow
- name: MON - wait daemons
  vars:
    daemon_id: "{{ host.split('.')[0] }}"
  ansible.builtin.command: "{{ ceph_cli }} orch ps --daemon_type mon --daemon_id {{ daemon_id }} -f json"
  register: psm

# Force-remove the mon daemon, but only when the preceding "orch ps" query
# returned at least one daemon name. The command is run on the ctlplane name
# of the node being drained.
- name: DRAIN - Delete the mon running on the current controller node
  become: true
  delegate_to: "{{ host.split('.')[0] }}.ctlplane"
  ansible.builtin.command: "{{ ceph_cli }} orch daemon rm mon.{{ host.split('.')[0] }} --force"
  when: psm.stdout | from_json | community.general.json_query('[*].daemon_name') | length > 0

# Remove labels from the src node
- name: DRAIN - remove label from the src node
when: psm.stdout | from_json | community.general.json_query('[*].daemon_name') | length > 0
ansible.builtin.include_tasks: labels.yaml
vars:
nodes:
Expand All @@ -18,19 +33,13 @@
- "mgr"
- "_admin"

- name: DRAIN - Delete the mon running on the current controller node
delegate_to: "{{ host.split('.')[0] }}.ctlplane"
become: true
ansible.builtin.command:
"{{ ceph_cli }} orch daemon rm mon.{{ host.split('.')[0] }} --force"
# ignore_errors: true

# Sleep before moving to the next task.
# The mapping must carry exactly one "seconds" key (with duplicates most
# parsers silently keep the last one); the mon-specific timeout is the one
# kept here.
- name: Pause
  ansible.builtin.pause:
    seconds: "{{ ceph_wait_mon_timeout }}"

- name: DRAIN - Drain the host
when: psm.stdout | from_json | community.general.json_query('[*].daemon_name') | length > 0
become: true
delegate_to: "{{ host.split('.')[0] }}.ctlplane"
ansible.builtin.command:
Expand All @@ -44,3 +53,19 @@
"cephadm rm-cluster --fsid {{ ceph_fsid }} --force"
vars:
ceph_fsid: "{{ mon_dump.fsid }}"

# Drop cur_mon from the orchestrator host list, but only when the host is
# still listed there
- name: MON - Remove host from the Ceph hostmap
  block:
    # List the hosts matching the short hostname of cur_mon
    - name: MON - check host in hostmap
      vars:
        host_id: "{{ cur_mon.split('.')[0] }}"
      ansible.builtin.command: "{{ ceph_cli }} orch host ls --host_pattern {{ host_id }} -f json"
      register: lsh

    # The node should be empty at this point, let's remove it from the Ceph cluster
    - name: MON - rm the cur_mon host from the Ceph cluster
      become: true
      ansible.builtin.command: "{{ ceph_cli }} orch host rm {{ cur_mon }} --force"
      when: lsh.stdout | from_json | community.general.json_query('[*].hostname') | length > 0
4 changes: 4 additions & 0 deletions tests/roles/ceph_migrate/tasks/mds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -124,3 +124,7 @@
vars:
daemon: mds
daemon_id:

# Wait ceph_timeout seconds once the mds tasks are done
- name: Sleep before moving to the next phase
  ansible.builtin.pause:
    seconds: "{{ ceph_timeout }}"
60 changes: 32 additions & 28 deletions tests/roles/ceph_migrate/tasks/mon.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,14 @@
# references to the old mon daemon coming from the drained node
until: (monmap.stdout | from_json | community.general.json_query('monmap.num_mons') | int) >= ((decomm_nodes |default([]) | length | int) | default(3))
loop_control:
label: "check mons quorum"
label: "MON - check mons quorum"
tags:
- ceph_mon_quorum

- name: Backup data for client purposes
delegate_to: "{{ cur_mon.split('.')[0] }}.ctlplane"
tags:
- ceph_backup
block:
- name: Ensure backup directory exists
ansible.builtin.file:
Expand All @@ -59,12 +61,10 @@
become: true
ansible.builtin.copy:
remote_src: true
src: "/etc/ceph/{{ item }}"
src: "{{ item.path }}"
dest: "{{ ceph_config_tmp_client_home }}"
mode: '0644'
loop:
- ceph.conf
- ceph.client.admin.keyring
loop: "{{ dir_ceph_files.files }}"

# Before draining the current node, migrate the active mgr on a different
# _admin host
Expand Down Expand Up @@ -102,21 +102,7 @@
tags:
- ceph_drain

# The node should be empty at this point, let's remove it from the Ceph
# cluster
- name: MON - rm the cur_mon host from the Ceph cluster
# when: cur_mon in decomm_nodes
become: true
ansible.builtin.command:
"{{ ceph_cli }} orch host rm {{ cur_mon }} --force"
# let's ignore this for now (for idempotency purposes)
# TODO: have a good condition here instead of ignore_errors
# ignore_errors: true
tags:
- ceph_drain

- name: MON - Get current mon IP address
# when: cur_mon in decomm_nodes
ansible.builtin.set_fact:
mon_ipaddr: "{{ mon_ip | split(':') | first | ansible.utils.ipaddr }}"
vars:
Expand All @@ -132,7 +118,6 @@
# if no mon addr, this variable is False and the whole block is skipped
# because there's no network related action that should be performed
when:
# - cur_mon in decomm_nodes
- mon_ipaddr | default('')
block:
- name: MON - Get current mon IP address
Expand Down Expand Up @@ -184,15 +169,15 @@
# addresses, it might happen that the mon is deployed using the right IP.
# For this reason we need to redeploy it by rm + add (as redeploy does not
# accept an IP as input
# Poll "mon stat" (up to 20 attempts, 3 seconds apart) until the short
# hostname of target_node shows up among the quorum member names.
# The until condition is a bare Jinja expression: conditionals are templated
# implicitly, so they should not embed {{ }} delimiters. Filters bind tighter
# than "in", so the explicit parentheses only spell out the grouping the
# expression already had.
- name: MON - Check quorum
  become: true
  ansible.builtin.command: "{{ ceph_cli }} mon stat -f json"
  register: monstat
  retries: 20
  delay: 3
  until: >-
    target_node.split('.')[0] in
    (monstat.stdout | from_json | community.general.json_query('quorum[*].name') | default([]) | list)
  loop_control:
    label: "MON - Check quorum"

# Even though we explicitly redeploy a given mon using the host:ip format,
# it is possible that the orchestrator (who owns the process and the spec)
Expand Down Expand Up @@ -227,17 +212,35 @@
ansible.builtin.debug:
msg: "{{ spc }}"

# Ask the orchestrator whether a mon daemon is reported for the short hostname
# of target_node; the registered result (psmon) gates the removal that follows
- name: MON - Get tmp mon
  vars:
    daemon_id: "{{ target_node.split('.')[0] }}"
  ansible.builtin.command: "{{ ceph_cli }} orch ps --daemon_type mon --daemon_id {{ daemon_id }} -f json"
  register: psmon

# Force-remove the mon daemon found on target_node. Skipped when the preceding
# "orch ps" query (psmon) returned no daemon name. The command is retried (up
# to 20 attempts, 3 seconds apart) until its output contains "Removed".
# The task declares the command module exactly once, and carries no
# loop_control: there is no loop here, so a label would have no effect.
- name: MON - Delete the running mon
  when: psmon.stdout | from_json | community.general.json_query('[*].daemon_name') | length > 0
  become: true
  ansible.builtin.command: "{{ ceph_cli }} orch daemon rm mon.{{ target_node.split('.')[0] }} --force"
  register: rmmon
  retries: 20
  delay: 3
  until: '"Removed" in rmmon.stdout'

# Wait ceph_wait_mon_timeout seconds after the removal request before checking
# the daemon list again. A task must have a single "name" key; the one
# describing the mon deletion is kept.
- name: Wait for the current mon to be deleted
  ansible.builtin.pause:
    seconds: "{{ ceph_wait_mon_timeout }}"

# Query the orchestrator again after the removal: the "daemon add" task further
# down only runs when this query reports no mon daemon for target_node
- name: MON - Check there is no mon on {{ target_node }}
  vars:
    daemon_id: "{{ target_node.split('.')[0] }}"
  ansible.builtin.command: "{{ ceph_cli }} orch ps --daemon_type mon --daemon_id {{ daemon_id }} -f json"
  register: psmon

- name: MON - Redeploy mon on {{ target_node }}
when: debug | default(true)
ansible.builtin.debug:
Expand All @@ -247,6 +250,7 @@
become: true
when:
- mon_ipaddr | default('')
- psmon.stdout | from_json | community.general.json_query('[*].daemon_name') | length == 0
ansible.builtin.command:
"{{ ceph_cli }} orch daemon add mon {{ target_node.split('.')[0] }}:{{ mon_ipaddr }}"

Expand All @@ -264,7 +268,7 @@
# references to the old mon daemon coming from the drained node
until: (monmap.stdout | from_json | community.general.json_query('monmap.num_mons') | int) >= ((decomm_nodes |default([]) | length | int) | default(3))
loop_control:
label: "check mons quorum"
label: "MON - check mons quorum"
tags:
- ceph_mon_quorum

Expand Down
4 changes: 4 additions & 0 deletions tests/roles/ceph_migrate/tasks/monitoring.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,7 @@
- grafana
- prometheus
- alertmanager

# Wait ceph_timeout seconds once the monitoring stack tasks are done
- name: Sleep before moving to the next daemon
  ansible.builtin.pause:
    seconds: "{{ ceph_timeout }}"
25 changes: 25 additions & 0 deletions tests/roles/ceph_migrate/tasks/nfs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Get a fresh ceph_cli: include ceph_cli.yaml, passing the cluster name and the
# fsid taken from mon_dump
- name: Get ceph_cli
  vars:
    ceph_cluster: ceph
    ceph_fsid: "{{ mon_dump.fsid }}"
  ansible.builtin.include_tasks: ceph_cli.yaml

# Add the "nfs" label to the nodes where the cluster should be deployed:
# every hostmap key that is not listed in decomm_nodes
- name: NFS - Setup NFS label to the target node
  vars:
    act: "add"
    labels:
      - "nfs"
    nodes: "{{ hostmap.keys() | difference(decomm_nodes) }}"
  ansible.builtin.include_tasks: labels.yaml
# Create the NFS Ganesha cluster (named after cephfs_name) behind an ingress
# with a virtual IP, placed on the hosts carrying the "nfs" label.
# waiting for https://github.com/ceph/ceph/pull/53108
# to appear in the next Ceph container build
# disabling this task by default for now
# NOTE(review): no condition on this task disables it; presumably the caller
# decides whether to include this file. Confirm.
# NOTE(review): changed_when is false although the command creates a cluster;
# looks deliberate, confirm.
- name: Create NFS Ganesha Cluster
  ansible.builtin.command: |
    {{ ceph_cli }} nfs cluster create {{ cephfs_name }} \
    --ingress --virtual-ip={{ ceph_nfs_vip }} \
    --ingress-mode=haproxy-protocol '--placement=label=nfs'
  become: true
  changed_when: false
7 changes: 7 additions & 0 deletions tests/roles/ceph_migrate/tasks/post.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Dump logs of the Ceph cluster daemons.
# ceph_load.yaml is reused with dump=true, which enables its "Dump ... to log
# file" tasks.
# include_tasks is dynamic: a tag set on the include itself selects only the
# include, and is not inherited by the tasks it loads. "apply" pushes the
# ceph_dump tag down to the included tasks, so that running with
# --tags ceph_dump executes the dump instead of skipping every included task.
- name: POST - Dump logs
  ansible.builtin.include_tasks:
    file: ceph_load.yaml
    apply:
      tags:
        - ceph_dump
  vars:
    dump: true
  tags:
    - ceph_dump

0 comments on commit 1780efe

Please sign in to comment.