Skip to content
This repository has been archived by the owner on Jul 19, 2021. It is now read-only.

Commit

Permalink
Add support for unknown status when lsf commands are unavailable
Browse files (browse the repository at this point in the history)
  • Loading branch information
ManInFez committed Aug 1, 2019
1 parent c4cdbae commit bf25e19
Show file tree
Hide file tree
Showing 6 changed files with 15 additions and 9 deletions.
9 changes: 5 additions & 4 deletions lib/include/ert/job_queue/job_status.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,16 +98,17 @@ typedef enum {
JOB_QUEUE_RUNNING_EXIT_CALLBACK = 2048,
JOB_QUEUE_STATUS_FAILURE = 4096,
JOB_QUEUE_FAILED = 8192,
JOB_QUEUE_DO_KILL_NODE_FAILURE = 16384
JOB_QUEUE_DO_KILL_NODE_FAILURE = 16384,
JOB_QUEUE_UNKNOWN = 32768
} job_status_type;

#define JOB_QUEUE_RUNNING_CALLBACK (JOB_QUEUE_RUNNING_DONE_CALLBACK + JOB_QUEUE_RUNNING_EXIT_CALLBACK)

#define JOB_QUEUE_STATUS_ALL (JOB_QUEUE_NOT_ACTIVE + JOB_QUEUE_WAITING + JOB_QUEUE_SUBMITTED + JOB_QUEUE_PENDING + JOB_QUEUE_RUNNING + JOB_QUEUE_DONE + \
JOB_QUEUE_EXIT + JOB_QUEUE_IS_KILLED + JOB_QUEUE_DO_KILL + JOB_QUEUE_SUCCESS + JOB_QUEUE_RUNNING_CALLBACK + \
JOB_QUEUE_STATUS_FAILURE + JOB_QUEUE_FAILED + JOB_QUEUE_DO_KILL_NODE_FAILURE)
JOB_QUEUE_STATUS_FAILURE + JOB_QUEUE_FAILED + JOB_QUEUE_DO_KILL_NODE_FAILURE + JOB_QUEUE_UNKNOWN)

#define JOB_QUEUE_MAX_STATE 15
#define JOB_QUEUE_MAX_STATE 16

/*
All jobs which are in the status set defined by
Expand All @@ -127,7 +128,7 @@ typedef enum {

#define JOB_QUEUE_WAITING_STATUS (JOB_QUEUE_WAITING + JOB_QUEUE_PENDING)

#define JOB_QUEUE_CAN_UPDATE_STATUS (JOB_QUEUE_RUNNING + JOB_QUEUE_PENDING + JOB_QUEUE_SUBMITTED)
#define JOB_QUEUE_CAN_UPDATE_STATUS (JOB_QUEUE_RUNNING + JOB_QUEUE_PENDING + JOB_QUEUE_SUBMITTED + JOB_QUEUE_UNKNOWN)

#define JOB_QUEUE_COMPLETE_STATUS (JOB_QUEUE_IS_KILLED + JOB_QUEUE_SUCCESS + JOB_QUEUE_FAILED)

Expand Down
1 change: 1 addition & 0 deletions lib/job_queue/job_queue_status.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ job_queue_status_type * job_queue_status_alloc() {
status->status_index[12] = JOB_QUEUE_STATUS_FAILURE; //The driver call to get status has failed, job status remains unchanged
status->status_index[13] = JOB_QUEUE_FAILED; // Job has failed, no more retries, FINAL STATE
status->status_index[14] = JOB_QUEUE_DO_KILL_NODE_FAILURE; // Job has failed, node should be blacklisted
status->status_index[15] = JOB_QUEUE_UNKNOWN; // Unable to get status from submitted job

return status;
}
Expand Down
3 changes: 3 additions & 0 deletions lib/job_queue/job_status.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ const char * job_status_get_name(job_status_type status) {
case JOB_QUEUE_DO_KILL_NODE_FAILURE:
return "JOB_QUEUE_DO_KILL_NODE_FAIURE";
break;
case JOB_QUEUE_UNKNOWN:
return "JOB_QUEUE_UNKNOWN";
break;
}

util_abort("%s: internal error", __func__);
Expand Down
7 changes: 3 additions & 4 deletions lib/job_queue/lsf_driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -634,7 +634,7 @@ static void lsf_driver_update_bjobs_table(lsf_driver_type * driver) {
{
char user[32];
char status[16];
FILE *stream = util_fopen(tmp_file , "r");;
FILE *stream = util_fopen(tmp_file , "r");
bool at_eof = false;
hash_clear(driver->bjobs_cache);
util_fskip_lines(stream , 1);
Expand Down Expand Up @@ -841,8 +841,7 @@ static int lsf_driver_get_job_status_shell(void * __driver , void * __job) {
res_log_info("Have turned lsf debug info ON.");
}
status = lsf_driver_get_bhist_status_shell( driver , job );
if (status != JOB_STAT_UNKWN)
hash_insert_int( driver->bjobs_cache , job->lsf_jobnr_char , status );
hash_insert_int( driver->bjobs_cache , job->lsf_jobnr_char , status );
}
}
}
Expand Down Expand Up @@ -879,7 +878,7 @@ job_status_type lsf_driver_convert_status( int lsf_status ) {
job_status = JOB_QUEUE_EXIT;
break;
case JOB_STAT_UNKWN: // Have lost contact with one of the daemons.
job_status = JOB_QUEUE_EXIT;
job_status = JOB_QUEUE_UNKNOWN;
break;
case JOB_STAT_DONE + JOB_STAT_PDONE: // = 192. JOB_STAT_PDONE: the job had a
// post-execution script which completed
Expand Down
2 changes: 1 addition & 1 deletion lib/job_queue/tests/job_lsf_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ void test_status_tr() {
test_status( JOB_STAT_NULL , JOB_QUEUE_NOT_ACTIVE );
test_status( JOB_STAT_DONE , JOB_QUEUE_DONE );
test_status( JOB_STAT_EXIT , JOB_QUEUE_EXIT );
test_status( JOB_STAT_UNKWN , JOB_QUEUE_EXIT );
test_status( JOB_STAT_UNKWN , JOB_QUEUE_UNKNOWN );
test_status( 192 , JOB_QUEUE_DONE );
}

Expand Down
2 changes: 2 additions & 0 deletions python/res/job_queue/job_status_type_enum.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class JobStatusType(BaseCEnum):
JOB_QUEUE_STATUS_FAILURE = None
JOB_QUEUE_FAILED = None
JOB_QUEUE_DO_KILL_NODE_FAILURE = None
JOB_QUEUE_UNKNOWN = None

@classmethod
def from_string(cls, string):
Expand All @@ -54,3 +55,4 @@ def from_string(cls, string):
JobStatusType.addEnum("JOB_QUEUE_STATUS_FAILURE", 4096)
JobStatusType.addEnum("JOB_QUEUE_FAILED", 8192)
JobStatusType.addEnum("JOB_QUEUE_DO_KILL_NODE_FAILURE", 16384)
JobStatusType.addEnum("JOB_QUEUE_UNKNOWN", 32768)

0 comments on commit bf25e19

Please sign in to comment.