Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added metrics to Scheduler and track in Process state #984

Merged
merged 23 commits into from
Jul 4, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
b657d1f
added metrics
AndersGM Jun 13, 2023
3111a28
thread safe counters
AndersGM Jun 20, 2023
1f3ae5f
reset metrics after restart
AndersGM Jun 20, 2023
2960a86
specced both error and success in #task_observer
AndersGM Jun 20, 2023
30db43a
added doc
AndersGM Jun 20, 2023
78d8aba
include metrics in name
AndersGM Jun 20, 2023
042ed03
Merge branch 'main' into metrics
AndersGM Jun 20, 2023
17e5bd7
renamed and reverted
AndersGM Jun 25, 2023
02dae4a
Merge remote-tracking branch 'fork/metrics' into metrics
AndersGM Jun 25, 2023
afeae87
consistent naming
AndersGM Jun 25, 2023
2c3c21e
fixe broken current state
AndersGM Jun 25, 2023
6069cac
Merge branch 'main' into metrics
AndersGM Jun 25, 2023
822a0be
Merge remote-tracking branch 'origin/main' into metrics
bensheldon Jul 1, 2023
e9a26d2
Track Scheduler name and queues separately in stats and display
bensheldon Jul 1, 2023
28b99ad
Fix Scheduler tests
bensheldon Jul 1, 2023
6c9af51
Remove default value for state
bensheldon Jul 1, 2023
3ca81d4
Renamed `failed_` to `errored_`
bensheldon Jul 1, 2023
e59c8da
Track empty and unlocked executions too
bensheldon Jul 1, 2023
74ad726
Have stat totals add up consistently
bensheldon Jul 1, 2023
6fe747a
Ensure stats can be reported when shutdown
bensheldon Jul 1, 2023
169a23c
Don't add object to instances list until fully initialized
bensheldon Jul 1, 2023
0964768
Fix race condition in Scheduler#stats test
bensheldon Jul 1, 2023
b8f6f08
Rename "unlocked" to "unexecutable" and fix counting
bensheldon Jul 3, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion app/models/good_job/process.rb
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,10 @@ def self.current_state
proctitle: $PROGRAM_NAME,
preserve_job_records: GoodJob.preserve_job_records,
retry_on_unhandled_error: GoodJob.retry_on_unhandled_error,
schedulers: GoodJob::Scheduler.instances.map(&:name),
schedulers: GoodJob::Scheduler.instances.map(&:stats),
bensheldon marked this conversation as resolved.
Show resolved Hide resolved
cron_enabled: GoodJob.configuration.enable_cron?,
total_succeeded_executions_count: GoodJob::Scheduler.instances.sum(&:succeeded_executions_count),
total_failed_executions_count: GoodJob::Scheduler.instances.sum(&:failed_executions_count),
}
end

Expand Down
2 changes: 1 addition & 1 deletion lib/good_job/cleanup_tracker.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def initialize(cleanup_interval_seconds: false, cleanup_interval_jobs: false)
end

# Bumps the running job counter by one.
# @return [Integer] the job count after the increment
def increment
  self.job_count = job_count + 1
end
Expand Down
47 changes: 47 additions & 0 deletions lib/good_job/metrics.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# frozen_string_literal: true
module GoodJob # :nodoc:
  # Execution counters for a scheduler.
  #
  # Every counter is a +Concurrent::AtomicFixnum+, so each individual
  # increment, read and reset is atomic. Operations that touch both counters
  # (#total_count, #reset) are two separate steps and therefore are not a
  # single atomic snapshot.
  class Metrics
    # Creates the failed and succeeded counters, both starting at zero.
    def initialize
      @failed_count = Concurrent::AtomicFixnum.new
      @succeeded_count = Concurrent::AtomicFixnum.new
    end

    # Increments number of failed jobs.
    # @return [Integer] the failed count after the increment
    def increment_failed
      @failed_count.increment
    end

    # Increments number of succeeded jobs.
    # @return [Integer] the succeeded count after the increment
    def increment_succeeded
      @succeeded_count.increment
    end

    # Number of failed jobs.
    # @return [Integer]
    def failed_count
      @failed_count.value
    end

    # Number of succeeded jobs.
    # @return [Integer]
    def succeeded_count
      @succeeded_count.value
    end

    # Total number of jobs (failed + succeeded).
    # The two counters are read one after the other.
    # @return [Integer]
    def total_count
      failed_count + succeeded_count
    end

    # Sets both counters back to zero.
    # @return [void]
    def reset
      @failed_count.value = 0
      @succeeded_count.value = 0
    end
  end
end
13 changes: 13 additions & 0 deletions lib/good_job/scheduler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
require "concurrent/executor/timer_set"
require "concurrent/scheduled_task"
require "concurrent/utility/processor_counter"
require 'good_job/metrics'

module GoodJob # :nodoc:
#
Expand Down Expand Up @@ -88,6 +89,8 @@ def initialize(performer, max_threads: nil, max_cache: nil, warm_cache_on_initia
@executor_options[:name] = name

@cleanup_tracker = CleanupTracker.new(cleanup_interval_seconds: cleanup_interval_seconds, cleanup_interval_jobs: cleanup_interval_jobs)
@metrics = ::GoodJob::Metrics.new
@executor_options[:name] = name
create_executor
warm_cache if warm_cache_on_initialize
end
Expand Down Expand Up @@ -139,6 +142,7 @@ def restart(timeout: -1)

instrument("scheduler_restart_pools") do
shutdown(timeout: timeout)
@metrics.reset
create_executor
warm_cache
end
Expand Down Expand Up @@ -203,6 +207,12 @@ def task_observer(time, output, thread_error)
instrument("finished_job_task", { result: output, error: thread_error, time: time })
return unless output

if error
@metrics.increment_failed
else
@metrics.increment_succeeded
end

@cleanup_tracker.increment
if @cleanup_tracker.cleanup?
cleanup
Expand All @@ -222,6 +232,9 @@ def stats
max_cache: @max_cache,
active_cache: cache_count,
available_cache: remaining_cache_count,
succeeded_executions_count: @metrics.succeeded_count,
failed_executions_count: @metrics.failed_count,
total_executions_count: @metrics.total_count,
}
end

Expand Down
52 changes: 51 additions & 1 deletion spec/lib/good_job/scheduler_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
described_class.instances.each(&:shutdown)
end

describe 'name' do
describe '#name' do
it 'is human readable and contains configuration values' do
scheduler = described_class.new(performer)
expect(scheduler.name).to eq('GoodJob::Scheduler(queues= max_threads=5)')
Expand Down Expand Up @@ -76,6 +76,34 @@
end
end

describe '#task_observer' do
  # Feeds the scheduler 7 failing results followed by 9 successful ones and
  # checks that the scheduler's stats report exactly those numbers.
  it 'increases metric counters' do
    allow(GoodJob).to receive(:on_thread_error)

    failed_job_count = 0
    succeeded_job_count = 0

    # Stubbed performer: 7 errored results, then 9 successes, then nil.
    allow(performer).to receive(:next) do
      if failed_job_count < 7
        failed_job_count += 1
        GoodJob::ExecutionResult.new(value: nil, unhandled_error: StandardError.new("oopsy"))
      elsif succeeded_job_count < 9
        succeeded_job_count += 1
        GoodJob::ExecutionResult.new(value: 'success')
      end
    end

    scheduler = described_class.new(performer)
    scheduler.create_thread

    # Wait on the scheduler's own counters rather than on the stub's local
    # counter: the local counter is bumped inside `performer.next`, before
    # the scheduler records the execution, so waiting on it can let the
    # assertions run before the last execution has been counted.
    sleep_until { scheduler.stats.fetch(:total_executions_count) == 16 }

    expect(scheduler.stats.fetch(:failed_executions_count)).to eq 7
    expect(scheduler.stats.fetch(:succeeded_executions_count)).to eq 9
    expect(scheduler.stats.fetch(:total_executions_count)).to eq 16
  end
end

describe '#shutdown' do
it 'shuts down the theadpools' do
scheduler = described_class.new(performer)
Expand Down Expand Up @@ -115,6 +143,25 @@
.to change(scheduler, :running?).from(false).to(true)
end

# Verifies that counters survive a shutdown but are zeroed by a restart.
it 'resets metrics' do
# Performer yields one successful result, then nil on every later call.
allow(performer).to receive(:next).and_return GoodJob::ExecutionResult.new(value: 'hello'), nil

scheduler = described_class.new(performer)
scheduler.create_thread

# Block until the scheduler itself reports the single success.
sleep_until do
scheduler.stats.fetch(:succeeded_executions_count) == 1
end

# Shutting down must not clear the counter.
scheduler.shutdown
expect(scheduler.stats.fetch(:succeeded_executions_count)).to eq 1

expect { scheduler.restart }
.to change(scheduler, :running?).from(false).to(true)

# After the restart the counter is back at zero.
expect(scheduler.stats.fetch(:succeeded_executions_count)).to eq 0
end

it 'can be called multiple times' do
scheduler = described_class.new(performer)
scheduler.shutdown
Expand Down Expand Up @@ -177,6 +224,9 @@
max_cache: max_cache,
active_cache: 0,
available_cache: max_cache,
failed_executions_count: 0,
succeeded_executions_count: 0,
total_executions_count: 0,
})
end
end
Expand Down