From 677c0d157809c8308afc52aa9e8c31d85dd8463e Mon Sep 17 00:00:00 2001 From: Basha Mougamadou Date: Wed, 16 Oct 2024 15:38:02 +0200 Subject: [PATCH] Add retry on consul lock if failures Since consul lock is based on consul infrastructure, we can hit issues when consul is restarting while locking choregraphie on consul. Indeed, we had cases where consul on the leader node took a long time to restart, causing Diplomat calls to fail. --- libraries/primitive_consul_lock.rb | 33 ++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/libraries/primitive_consul_lock.rb b/libraries/primitive_consul_lock.rb index 37df8c7..96ff3dd 100644 --- a/libraries/primitive_consul_lock.rb +++ b/libraries/primitive_consul_lock.rb @@ -120,6 +120,11 @@ def self.get_or_create(path, concurrency, dc: nil, token: nil) current_lock = begin Chef::Log.info "Fetch lock state for #{path}" Diplomat::Kv.get(path, decode_values: true, dc: dc, token: token) + rescue Faraday::ConnectionFailed => e + retry_secs = 30 + Chef::Log.info "Consul did not respond, wait #{retry_secs} seconds and retry to let it (re)start: #{e}" + sleep retry_secs + (retry_left -= 1).positive? ? 
retry : raise rescue Diplomat::KeyNotFound Chef::Log.info "Lock for #{path} did not exist, creating with value #{value}" Diplomat::Kv.put(path, value.to_json, cas: 0, dc: dc, token: token) # we ignore success/failure of CaS @@ -166,9 +171,17 @@ def enter(opts) if can_enter_lock?(opts) enter_lock(opts) require 'diplomat' - result = Diplomat::Kv.put(@path, to_json, cas: @cas, dc: @dc, token: @token) - Chef::Log.debug('Someone updated the lock at the same time, will retry') unless result - result + retry_left = 5 + begin + result = Diplomat::Kv.put(@path, to_json, cas: @cas, dc: @dc, token: @token) + Chef::Log.debug('Someone updated the lock at the same time, will retry') unless result + result + rescue Faraday::ConnectionFailed => e + retry_secs = 30 + Chef::Log.info "Consul did not respond, wait #{retry_secs} seconds and retry to let it (re)start: #{e}" + sleep retry_secs + (retry_left -= 1).positive? ? retry : raise + end else Chef::Log.debug("Too many lock holders (concurrency:#{concurrency})") false @@ -184,9 +197,17 @@ def exit(opts) if already_entered?(opts) exit_lock(opts) require 'diplomat' - result = Diplomat::Kv.put(@path, to_json, cas: @cas, dc: @dc, token: @token) - Chef::Log.debug('Someone updated the lock at the same time, will retry') unless result - result + retry_left = 5 + begin + result = Diplomat::Kv.put(@path, to_json, cas: @cas, dc: @dc, token: @token) + Chef::Log.debug('Someone updated the lock at the same time, will retry') unless result + result + rescue Faraday::ConnectionFailed => e + retry_secs = 30 + Chef::Log.info "Consul did not respond, wait #{retry_secs} seconds and retry to let it (re)start: #{e}" + sleep retry_secs + (retry_left -= 1).positive? ? retry : raise + end else true end