Skip to content

Commit

Permalink
Improve on_connected callback to avoid reconnect storm (#292)
Browse files Browse the repository at this point in the history
* Ensure there is only one watcher refresh per time window

* Fix indentation

* Fix typo

* Refactor and unit test reconnect time

* Fix ruby 1.9.3 ci

* Move to sleep after test_and_set

* revert .travis.yml

* Fix typo

* Bump up version to 0.16.12
  • Loading branch information
anson627 authored Jun 17, 2019
1 parent 49acb38 commit 10c783d
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 2 deletions.
2 changes: 1 addition & 1 deletion Gemfile.lock
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
PATH
remote: .
specs:
synapse (0.16.11)
synapse (0.16.12)
aws-sdk (~> 1.39)
docker-api (~> 1.7)
dogstatsd-ruby (~> 3.3.0)
Expand Down
32 changes: 32 additions & 0 deletions lib/synapse/service_watcher/zookeeper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,27 @@ def zk_connect
zk_cleanup
end

# handle session connected after reconnecting
# http://zookeeper.apache.org/doc/r3.3.5/zookeeperProgrammers.html#ch_zkSessions
@zk.on_connected do
log.info "synapse: ZK client has reconnected #{@name}"
unless test_and_set_reconnect_time(Time.now)
log.info "synapse: ZK client skip since last reconnect is too close #{@name}"
return
end

# random backoff to avoid refreshing all watchers at the same time
sleep rand(30)

# zookeeper watcher is one-time trigger, and can be lost when disconnected
# https://zookeeper.apache.org/doc/r3.3.5/zookeeperProgrammers.html#ch_zkWatches
# only need re-enable watcher on parent path and children list
log.info "synapse: ZK client refresh watcher after reconnected #{@name}"
if @zk.exists?(@discovery['path'], :watch => true)
@zk.children(@discovery['path'], :watch => true)
end
end

# the path must exist, otherwise watch callbacks will not work
statsd_time('synapse.watcher.zk.create_path.elapsed_time', ["zk_cluster:#{@zk_cluster}", "service_name:#{@name}"]) do
create(@discovery['path'])
Expand All @@ -373,6 +394,17 @@ def zk_connect
end
end

def test_and_set_reconnect_time(now)
# ensure there is only one refresh can happen within a time window
if !@last_reconnect_time.nil? && (now - @last_reconnect_time) < 60
return false
end
# test-and-set should be thread safe based on per-callback model
# https://github.com/zk-ruby/zk/wiki/EventDeliveryModel
@last_reconnect_time = now
true

end
# decode the data at a zookeeper endpoint
def deserialize_service_instance(data)
log.debug "synapse: deserializing process data"
Expand Down
2 changes: 1 addition & 1 deletion lib/synapse/version.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
module Synapse
VERSION = "0.16.11"
VERSION = "0.16.12"
end
11 changes: 11 additions & 0 deletions spec/lib/synapse/service_watcher_zookeeper_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,17 @@
subject.send(:watcher_callback).call
end

it 'test and set reconnect time' do
now = Time.now
expect(subject.send(:test_and_set_reconnect_time, now)).to be true
now += 10
expect(subject.send(:test_and_set_reconnect_time, now)).to be false
now += 50
expect(subject.send(:test_and_set_reconnect_time, now)).to be true
now += 60
expect(subject.send(:test_and_set_reconnect_time, now)).to be true
end

it 'handles zk consistency issues' do
expect(subject).to receive(:watch)
expect(subject).to receive(:discover).and_call_original
Expand Down

0 comments on commit 10c783d

Please sign in to comment.