Skip to content

Commit

Permalink
added condition to not try to kill overdue runs if the avg runtime is…
Browse files Browse the repository at this point in the history
… under 10sec
jtwhite79 committed Nov 28, 2024
1 parent 9f28f69 commit 4b38323
Showing 1 changed file with 7 additions and 4 deletions.
11 changes: 7 additions & 4 deletions src/libs/run_managers/yamr/RunManagerPanther.cpp
Original file line number Diff line number Diff line change
@@ -1107,7 +1107,7 @@ void RunManagerPanther::schedule_runs()
duration = it_agent->get_duration_minute();
avg_runtime = it_agent->get_runtime_minute();
if (avg_runtime <= 0) avg_runtime = global_avg_runtime;
if (avg_runtime <= 0) avg_runtime = 1.0E+10;
if (avg_runtime <= 0) avg_runtime = 1.0E+300;
vector<int> overdue_kill_runs_vec = get_overdue_runs_over_kill_threshold(run_id);

if (failure_map.count(run_id) + overdue_kill_runs_vec.size() >= max_n_failure)
@@ -1131,7 +1131,8 @@ void RunManagerPanther::schedule_runs()
should_schedule = true;
model_runs_timed_out += overdue_kill_runs_vec.size();
}
else if (((duration > overdue_giveup_minutes) || (duration > avg_runtime*overdue_giveup_fac))

else if (((duration > overdue_giveup_minutes) || ((duration > avg_runtime*overdue_giveup_fac) && (avg_runtime > 0.16)))
&& free_agent_list.empty())
{
// If there are no free slaves kill the overdue ones
@@ -1147,7 +1148,8 @@ void RunManagerPanther::schedule_runs()
}
model_runs_timed_out += 1;
}
else if (duration > avg_runtime*overdue_reched_fac)

else if ((duration > avg_runtime*overdue_reched_fac) && (avg_runtime > 0.16))
{
//check how many concurrent runs are going
if (n_concur < max_concurrent_runs) should_schedule = true;
@@ -1939,7 +1941,8 @@ void RunManagerPanther::kill_all_active_runs()
if (avg_runtime <= 0) avg_runtime = get_global_runtime_minute();;
if (avg_runtime <= 0) avg_runtime = 1.0E+10;
duration = i->second->get_duration_minute();
if ((just_quit) || (duration > overdue_giveup_minutes) || (duration >= avg_runtime*overdue_giveup_fac))
if ((just_quit) || (duration > overdue_giveup_minutes) ||
((duration >= avg_runtime*overdue_giveup_fac) && (avg_runtime > 0.16)))
{
sock_id_vec.push_back(i->second->get_socket_fd());
}

0 comments on commit 4b38323

Please sign in to comment.