From 9e0df31aa8c96535ba7bee37975e40b8efb29a7f Mon Sep 17 00:00:00 2001 From: Guillaume Abrioux Date: Tue, 14 Jun 2022 08:21:26 +0200 Subject: [PATCH] rocksdb-resharding: add a retry/until When retrieving the container_image, if the corresponding OSD isn't running (not started/ready yet), retry up to 120 times, with a 1-second delay between attempts, before letting the playbook fail. This addresses a failure seen in the CI, such as the following: ``` TASK [get container image currently used by osd container] ********************* task path: /home/jenkins-build/build/workspace/cephadm-ansible-prs-el8-functional/rocksdb-resharding.yml:66 Monday 13 June 2022 15:20:38 +0000 (0:00:00.015) 0:00:00.683 *********** ok: [localhost -> ceph-node0] => changed=false cmd: - cephadm - shell - ceph - orch - ps - --daemon_type - osd - --daemon_id - '0' - --format - json delta: '0:00:02.669373' end: '2022-06-13 15:20:42.149214' rc: 0 start: '2022-06-13 15:20:39.479841' stderr: |- Inferring fsid 4217f198-b8b7-11eb-941d-5254004b7a69 Inferring config /var/lib/ceph/4217f198-b8b7-11eb-941d-5254004b7a69/mon.ceph-node0/config Using ceph image with id '7d10b4103611' and tag 'master' created on 2022-05-23 21:52:02 +0000 UTC quay.ceph.io/ceph-ci/ceph@sha256:0ece388ce186bf2122eb4f3389d6b108fa94aa3541d40d08449286fffc34e29f stderr_lines: stdout: |2- [{"daemon_id": "0", "daemon_name": "osd.0", "daemon_type": "osd", "events": ["2022-06-13T15:20:20.821851Z daemon:osd.0 [INFO] \"Deployed osd.0 on host 'ceph-node4'\""], "hostname": "ceph-node4", "is_active": false, "memory_request": 4294967296, "ports": [], "service_name": "osd.osd", "status": 2, "status_desc": "starting"}] stdout_lines: ``` We can see that 'status_desc' is 'starting' rather than 'running'. 
Signed-off-by: Guillaume Abrioux --- rocksdb-resharding.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rocksdb-resharding.yml b/rocksdb-resharding.yml index 9891ff3..bec6218 100644 --- a/rocksdb-resharding.yml +++ b/rocksdb-resharding.yml @@ -67,6 +67,9 @@ command: "{{ cephadm_cmd }} orch ps --daemon_type osd --daemon_id {{ osd_id }} --format json" changed_when: false register: ceph_orch_ps + retries: 120 + delay: 1 + until: (ceph_orch_ps.stdout | from_json)[0]['status_desc'] == 'running' - name: set_fact container_image, container_host set_fact: