Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat t2 #323

Merged
merged 10 commits into from
Dec 2, 2024
Binary file not shown.
6 changes: 3 additions & 3 deletions documentation/pestpp_users_manual.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@

<img src="./media/image1.png" style="width:6.26806in;height:1.68194in" alt="A close up of a purple sign Description automatically generated" />

# <a id='s1' />Version 5.2.15
# <a id='s1' />Version 5.2.16

<img src="./media/image2.png" style="width:6.26806in;height:3.05972in" />

PEST++ Development Team

November 2024
December 2024

# <a id='s2' />Acknowledgements

Expand Down Expand Up @@ -70,7 +70,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI

# Table of Contents

- [Version 5.2.15](#s1)
- [Version 5.2.16](#s1)
- [Acknowledgements](#s2)
- [Preface](#s3)
- [License](#s4)
Expand Down
2 changes: 1 addition & 1 deletion src/libs/common/config_os.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#define CONFIG_OS_H_


#define PESTPP_VERSION "5.2.15";
#define PESTPP_VERSION "5.2.16";

#if defined(_WIN32) || defined(_WIN64)
#define OS_WIN
Expand Down
2 changes: 1 addition & 1 deletion src/libs/pestpp_common/EnsembleMethodUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7750,7 +7750,7 @@ void EnsembleMethod::reset_par_ensemble_to_prior_mean(){
ss << "iteration:" << iter;
vector<int> temp;
ofstream& frec = file_manager.rec_ofstream();
oe = oe_base;
oe.reserve(oe_base.get_real_names(),oe.get_var_names());
weights = weights_base;
run_ensemble_util(performance_log,frec,new_pe,oe,run_mgr_ptr,false,temp,NetPackage::NULL_DA_CYCLE, ss.str());
pe = new_pe;
Expand Down
4 changes: 2 additions & 2 deletions src/libs/run_managers/yamr/PantherAgent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ void PANTHERAgent::process_ctl_file(const string &ctl_filename)
mi.set_fill_tpl_zeros(pest_scenario.get_pestpp_options().get_fill_tpl_zeros());
mi.set_tpl_force_decimal(pest_scenario.get_pestpp_options().get_tpl_force_decimal());
mi.set_num_threads(pest_scenario.get_pestpp_options().get_num_tpl_ins_threads());
mi.set_sleep_ms(100);
mi.set_sleep_ms(5);
restart_on_error = pest_scenario.get_pestpp_options().get_panther_agent_restart_on_error();
max_time_without_master_ping_seconds = pest_scenario.get_pestpp_options().get_panther_agent_no_ping_timeout_secs();
FileManager fm("panther_agent");
Expand Down Expand Up @@ -538,7 +538,7 @@ std::pair<NetPackage::PackType,std::string> PANTHERAgent::run_model(Parameters &
// Sets the sleep interval on the model interface `mi`, then forwards the
// caller-supplied terminate/finished flags, exception slot, parameters and
// observations to mi.run().
// NOTE(review): presumably invoked as a worker-thread entry point (the name
// and the thread_flag arguments suggest so) - confirm against the caller.
void PANTHERAgent::run_async(pest_utils::thread_flag* terminate, pest_utils::thread_flag* finished, exception_ptr& run_exception,
Parameters* pars, Observations* obs)
{
// NOTE(review): two back-to-back set_sleep_ms() calls. These look like the
// before/after lines of a diff hunk; assuming set_sleep_ms() is a plain
// setter, only the later value (5) takes effect - confirm and drop the
// redundant call.
mi.set_sleep_ms(100);
mi.set_sleep_ms(5);
// Any failure is expected to be reported through run_exception rather than
// thrown here - TODO confirm against ModelInterface::run.
mi.run(terminate,finished,run_exception, pars, obs);
}

Expand Down
41 changes: 26 additions & 15 deletions src/libs/run_managers/yamr/RunManagerPanther.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,9 @@ const int RunManagerPanther::N_PINGS_UNRESPONSIVE = 3;
const int RunManagerPanther::MIN_PING_INTERVAL_SECS = 60; // Ping each slave at most once every minute
const int RunManagerPanther::MAX_PING_INTERVAL_SECS = 120; // Ping each slave at least once every 2 minutes
const int RunManagerPanther::MAX_CONCURRENT_RUNS_LOWER_LIMIT = 1;
const int RunManagerPanther::IDLE_THREAD_SIGNAL_TIMEOUT_SECS = 10; // Allow up to 10s for the run_idle_async() thread to acknowledge signals (pause idling, terminate)

const int RunManagerPanther::IDLE_THREAD_SIGNAL_TIMEOUT_SECS = 10; // Allow up to 10s for the run_idle_async() thread to acknowledge signals (pause idling, terminate)
const double RunManagerPanther::MIN_AVGRUNMINS_FOR_KILL = 0.08; //minimum avg runtime to try to kill and/or resched runs
const int RunManagerPanther::SECONDS_BETWEEN_ECHOS = 1;

AgentInfoRec::AgentInfoRec(int _socket_fd)
{
Expand Down Expand Up @@ -520,6 +521,7 @@ RunManagerAbstract::RUN_UNTIL_COND RunManagerPanther::run_until(RUN_UNTIL_COND c
}

std::chrono::system_clock::time_point start_time = std::chrono::system_clock::now();
last_echo_time = std::chrono::system_clock::now();
double run_time_sec = 0.0;
while (!all_runs_complete() && terminate_reason == RUN_UNTIL_COND::NORMAL)
{
Expand Down Expand Up @@ -560,7 +562,7 @@ RunManagerAbstract::RUN_UNTIL_COND RunManagerPanther::run_until(RUN_UNTIL_COND c
}

}
w_sleep(100);
w_sleep(10);
n_no_ops = 0;
while (true)
{
Expand Down Expand Up @@ -726,7 +728,7 @@ void RunManagerPanther::run_idle_async()
idling.set(false);

// Sleep to avoid spinlock
w_sleep(100);
w_sleep(10);
continue;
}

Expand Down Expand Up @@ -816,7 +818,7 @@ void RunManagerPanther::end_run_idle_async()
}

// Sleep to avoid spinlock
w_sleep(50);
w_sleep(10);
}

report("Stopped idle ping thread, as Panther manager is shutting down.", false);
Expand Down Expand Up @@ -857,7 +859,7 @@ void RunManagerPanther::pause_idle()
}

// Sleep to avoid spinlock
w_sleep(50);
w_sleep(10);
}

report("Panther idle ping thread paused prior to scheduling runs.", false);
Expand Down Expand Up @@ -947,7 +949,7 @@ bool RunManagerPanther::listen(pest_utils::thread_flag* terminate/* = nullptr*/)
fd_set read_fds; // temp file descriptor list for select()
socklen_t addr_len;
timeval tv;
tv.tv_sec = 1;
tv.tv_sec = 0;
tv.tv_usec = 0;
read_fds = master; // copy it
if (w_select(fdmax+1, &read_fds, NULL, NULL, &tv) == -1)
Expand Down Expand Up @@ -1006,7 +1008,7 @@ void RunManagerPanther::close_agents()
sock_nums.push_back(si.first);
for (auto si : sock_nums)
close_agent(si);
w_sleep(100);
w_sleep(10);

}
}
Expand Down Expand Up @@ -1107,7 +1109,7 @@ void RunManagerPanther::schedule_runs()
duration = it_agent->get_duration_minute();
avg_runtime = it_agent->get_runtime_minute();
if (avg_runtime <= 0) avg_runtime = global_avg_runtime;
if (avg_runtime <= 0) avg_runtime = 1.0E+10;
if (avg_runtime <= 0) avg_runtime = 1.0E+300;
vector<int> overdue_kill_runs_vec = get_overdue_runs_over_kill_threshold(run_id);

if (failure_map.count(run_id) + overdue_kill_runs_vec.size() >= max_n_failure)
Expand All @@ -1131,7 +1133,9 @@ void RunManagerPanther::schedule_runs()
should_schedule = true;
model_runs_timed_out += overdue_kill_runs_vec.size();
}
else if (((duration > overdue_giveup_minutes) || (duration > avg_runtime*overdue_giveup_fac))

else if (((duration > overdue_giveup_minutes) || ((duration > avg_runtime*overdue_giveup_fac) &&
(avg_runtime > MIN_AVGRUNMINS_FOR_KILL)))
&& free_agent_list.empty())
{
// If there are no free agents, kill the overdue ones
Expand All @@ -1147,7 +1151,8 @@ void RunManagerPanther::schedule_runs()
}
model_runs_timed_out += 1;
}
else if (duration > avg_runtime*overdue_reched_fac)

else if ((duration > avg_runtime*overdue_reched_fac) && (avg_runtime > MIN_AVGRUNMINS_FOR_KILL))
{
//check how many concurrent runs are going
if (n_concur < max_concurrent_runs) should_schedule = true;
Expand Down Expand Up @@ -1285,6 +1290,10 @@ void RunManagerPanther::echo()
{
if (!should_echo)
return;
std::chrono::system_clock::time_point now = chrono::system_clock::now();
if (chrono::duration_cast<std::chrono::seconds> ( now- last_echo_time).count() < SECONDS_BETWEEN_ECHOS)
return;
last_echo_time = now;
map<string, int> stats_map = get_agent_stats();
cout << get_time_string_short() << " mn:" << setw(5) << setprecision(2) << left << get_global_runtime_minute() << " runs("
<< "C" << setw(5) << left << model_runs_done
Expand Down Expand Up @@ -1939,7 +1948,9 @@ void RunManagerPanther::kill_all_active_runs()
if (avg_runtime <= 0) avg_runtime = get_global_runtime_minute();;
if (avg_runtime <= 0) avg_runtime = 1.0E+10;
duration = i->second->get_duration_minute();
if ((just_quit) || (duration > overdue_giveup_minutes) || (duration >= avg_runtime*overdue_giveup_fac))
if ((just_quit) || (duration > overdue_giveup_minutes) ||
((duration >= avg_runtime*overdue_giveup_fac) &&
(avg_runtime > MIN_AVGRUNMINS_FOR_KILL)))
{
sock_id_vec.push_back(i->second->get_socket_fd());
}
Expand Down Expand Up @@ -2132,7 +2143,7 @@ RunManagerPanther::~RunManagerPanther(void)
err = w_close(listener);
FD_CLR(listener, &master);
// this is needed to ensure that the first slave closes properly
w_sleep(500);
w_sleep(10);
for (int i = 0; i <= fdmax; i++)
{
if (FD_ISSET(i, &master))
Expand Down Expand Up @@ -2248,10 +2259,10 @@ void RunManagerYAMRCondor::cleanup(int cluster)
stringstream ss;
ss << "condor_rm " << cluster << " 1>cr_temp.stdout 2>cr_temp.stderr";
system(ss.str().c_str());
w_sleep(500);
w_sleep(10);
ss.str(string());
ss << "condor_rm " << cluster << " -forcex 1>cr_temp.stdout 2>cr_temp.stderr";
w_sleep(500);
w_sleep(10);
system(ss.str().c_str());
RunManagerPanther::close_agents();
cout << " all agents freed " << endl << endl;
Expand Down
4 changes: 3 additions & 1 deletion src/libs/run_managers/yamr/RunManagerPanther.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,8 @@ class RunManagerPanther : public RunManagerAbstract
static const int MAX_PING_INTERVAL_SECS;
static const int MAX_CONCURRENT_RUNS_LOWER_LIMIT;
static const int IDLE_THREAD_SIGNAL_TIMEOUT_SECS;

static const double MIN_AVGRUNMINS_FOR_KILL;
static const int SECONDS_BETWEEN_ECHOS;
double overdue_reched_fac;
double overdue_giveup_fac;
double overdue_giveup_minutes;
Expand All @@ -141,6 +142,7 @@ class RunManagerPanther : public RunManagerAbstract
long long bytes_transferred;
int files_transferred;
bool should_echo;
std::chrono::system_clock::time_point last_echo_time;
int nftx;
fd_set master; // master file descriptor list
list<AgentInfoRec> agent_info_set;
Expand Down
Loading