Skip to content

Commit

Permalink
added condition to not try to kill overdue runs if the avg runtime is…
Browse files Browse the repository at this point in the history
… really short
  • Loading branch information
jtwhite79 committed Nov 28, 2024
1 parent 4b38323 commit d991722
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 5 deletions.
11 changes: 7 additions & 4 deletions src/libs/run_managers/yamr/RunManagerPanther.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ const int RunManagerPanther::N_PINGS_UNRESPONSIVE = 3;
const int RunManagerPanther::MIN_PING_INTERVAL_SECS = 60; // Ping each slave at most once every minute
const int RunManagerPanther::MAX_PING_INTERVAL_SECS = 120; // Ping each slave at least once every 2 minutes
const int RunManagerPanther::MAX_CONCURRENT_RUNS_LOWER_LIMIT = 1;
const int RunManagerPanther::IDLE_THREAD_SIGNAL_TIMEOUT_SECS = 10; // Allow up to 10s for the run_idle_async() thread to acknowledge signals (pause idling, terminate)
const int RunManagerPanther::IDLE_THREAD_SIGNAL_TIMEOUT_SECS = 10; // Allow up to 10s for the run_idle_async() thread to acknowledge signals (pause idling, terminate)
const double RunManagerPanther::MIN_AVGRUNMINS_FOR_KILL = 0.08; // Minimum average run time, in minutes (0.08 min is roughly 5 s), that must be exceeded before overdue runs are killed and/or rescheduled based on the average run time


AgentInfoRec::AgentInfoRec(int _socket_fd)
Expand Down Expand Up @@ -1132,7 +1133,8 @@ void RunManagerPanther::schedule_runs()
model_runs_timed_out += overdue_kill_runs_vec.size();
}

else if (((duration > overdue_giveup_minutes) || ((duration > avg_runtime*overdue_giveup_fac) && (avg_runtime > 0.16)))
else if (((duration > overdue_giveup_minutes) || ((duration > avg_runtime*overdue_giveup_fac) &&
(avg_runtime > MIN_AVGRUNMINS_FOR_KILL)))
&& free_agent_list.empty())
{
// If there are no free slaves kill the overdue ones
Expand All @@ -1149,7 +1151,7 @@ void RunManagerPanther::schedule_runs()
model_runs_timed_out += 1;
}

else if ((duration > avg_runtime*overdue_reched_fac) && (avg_runtime > 0.16))
else if ((duration > avg_runtime*overdue_reched_fac) && (avg_runtime > MIN_AVGRUNMINS_FOR_KILL))
{
//check how many concurrent runs are going
if (n_concur < max_concurrent_runs) should_schedule = true;
Expand Down Expand Up @@ -1942,7 +1944,8 @@ void RunManagerPanther::kill_all_active_runs()
if (avg_runtime <= 0) avg_runtime = 1.0E+10;
duration = i->second->get_duration_minute();
if ((just_quit) || (duration > overdue_giveup_minutes) ||
((duration >= avg_runtime*overdue_giveup_fac) && (avg_runtime > 0.16)))
((duration >= avg_runtime*overdue_giveup_fac) &&
(avg_runtime > MIN_AVGRUNMINS_FOR_KILL)))
{
sock_id_vec.push_back(i->second->get_socket_fd());
}
Expand Down
2 changes: 1 addition & 1 deletion src/libs/run_managers/yamr/RunManagerPanther.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ class RunManagerPanther : public RunManagerAbstract
static const int MAX_PING_INTERVAL_SECS;
static const int MAX_CONCURRENT_RUNS_LOWER_LIMIT;
static const int IDLE_THREAD_SIGNAL_TIMEOUT_SECS;

static const double MIN_AVGRUNMINS_FOR_KILL;
double overdue_reched_fac;
double overdue_giveup_fac;
double overdue_giveup_minutes;
Expand Down

0 comments on commit d991722

Please sign in to comment.