Skip to content

Commit

Permalink
Retrieve exec_hosts from bjobs
Browse files Browse the repository at this point in the history
Propagate exec_hosts to scheduler
  • Loading branch information
andreas-el committed Oct 2, 2024
1 parent 8ff7299 commit 757acbe
Show file tree
Hide file tree
Showing 9 changed files with 38 additions and 4 deletions.
1 change: 1 addition & 0 deletions src/_ert/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ class RealizationBaseEvent(BaseEvent):
real: str
ensemble: Union[str, None] = None
queue_event_type: Union[str, None] = None
exec_hosts: Union[str, None] = None


class RealizationPending(RealizationBaseEvent):
Expand Down
7 changes: 7 additions & 0 deletions src/ert/ensemble_evaluator/snapshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,7 @@ def update_realization(
status: str,
start_time: Optional[datetime] = None,
end_time: Optional[datetime] = None,
exec_hosts: Optional[str] = None,
callback_status_message: Optional[str] = None,
) -> "EnsembleSnapshot":
self._realization_snapshots[real_id].update(
Expand All @@ -260,6 +261,7 @@ def update_realization(
status=status,
start_time=start_time,
end_time=end_time,
exec_hosts=exec_hosts,
callback_status_message=callback_status_message,
)
)
Expand All @@ -279,10 +281,12 @@ def update_from_event(
status = _FM_TYPE_EVENT_TO_STATUS[type(event)]
start_time = None
end_time = None
exec_hosts = None
callback_status_message = None

if e_type is RealizationRunning:
start_time = convert_iso8601_to_datetime(timestamp)
exec_hosts = event.exec_hosts
elif e_type in {
RealizationSuccess,
RealizationFailed,
Expand All @@ -296,6 +300,7 @@ def update_from_event(
status,
start_time,
end_time,
exec_hosts,
callback_status_message,
)

Expand Down Expand Up @@ -397,6 +402,7 @@ class RealizationSnapshot(TypedDict, total=False):
active: Optional[bool]
start_time: Optional[datetime]
end_time: Optional[datetime]
exec_hosts: Optional[str]
fm_steps: Dict[str, FMStepSnapshot]
callback_status_message: Optional[str]

Expand All @@ -411,6 +417,7 @@ def _realization_dict_to_realization_snapshot(
end_time=source.get("end_time"),
callback_status_message=source.get("callback_status_message"),
fm_steps=source.get("fm_steps", {}),
exec_hosts=source.get("exec_hosts"),
)
return _filter_nones(realization)

Expand Down
1 change: 1 addition & 0 deletions src/ert/gui/model/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ class RealNodeData:
real_status_color: Optional[QColor] = None
current_memory_usage: Optional[int] = None
max_memory_usage: Optional[int] = None
exec_hosts: Optional[str] = None
stderr: Optional[str] = None
callback_status_message: Optional[str] = None

Expand Down
2 changes: 2 additions & 0 deletions src/ert/gui/model/snapshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,8 @@ def _update_snapshot(self, snapshot: EnsembleSnapshot, iter_: str) -> None:
data = real_node.data
if real_status := real.get("status"):
data.status = real_status
if real_exec_hosts := real.get("exec_hosts"):
data.exec_hosts = real_exec_hosts
for real_fm_step_id, color in (
metadata["aggr_fm_step_status_colors"].get(real_id, {}).items()
):
Expand Down
13 changes: 12 additions & 1 deletion src/ert/gui/simulation/run_dialog.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,10 +335,21 @@ def on_snapshot_new_iteration(
def _select_real(self, index: QModelIndex) -> None:
real = index.row()
iter_ = index.model().get_iter() # type: ignore
exec_hosts = None

iter_node = self._snapshot_model.root.children.get(str(iter_), None)
if iter_node:
real_node = iter_node.children.get(str(real), None)
if real_node:
exec_hosts = real_node.data.exec_hosts

self._fm_step_overview.set_realization(iter_, real)
self._fm_step_label.setText(
text = (
f"Realization id {index.data(RealIens)} in iteration {index.data(IterNum)}"
)
if exec_hosts and exec_hosts != "-":
text += f", assigned to host: [{exec_hosts}]"
self._fm_step_label.setText(text)

def closeEvent(self, a0: Optional[QCloseEvent]) -> None:
if not self._notifier.is_simulation_running:
Expand Down
2 changes: 2 additions & 0 deletions src/ert/scheduler/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@
@dataclass
class StartedEvent:
iens: int
exec_hosts: str = ""


@dataclass
class FinishedEvent:
iens: int
returncode: int
exec_hosts: str = ""


Event = Union[StartedEvent, FinishedEvent]
2 changes: 2 additions & 0 deletions src/ert/scheduler/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def __init__(self, scheduler: Scheduler, real: Realization) -> None:
self.real = real
self.state = JobState.WAITING
self.started = asyncio.Event()
self.exec_hosts: str = "-"
self.returncode: asyncio.Future[int] = asyncio.Future()
self._aborted = False
self._scheduler: Scheduler = scheduler
Expand Down Expand Up @@ -263,6 +264,7 @@ async def _send(self, state: JobState) -> None:
"event_type": _queue_jobstate_event_type[state],
"queue_event_type": state,
"real": str(self.iens),
"exec_hosts": self.exec_hosts,
}
self.state = state
if state == JobState.FAILED:
Expand Down
9 changes: 7 additions & 2 deletions src/ert/scheduler/lsf_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,7 @@ async def submit(
iens=iens,
job_state=QueuedJob(job_state="PEND"),
submitted_timestamp=time.time(),
exec_hosts="-",
)
self._iens2jobid[iens] = job_id

Expand Down Expand Up @@ -500,11 +501,15 @@ async def _process_job_update(self, job_id: str, new_state: AnyJob) -> None:
event: Optional[Event] = None
if isinstance(new_state, RunningJob):
logger.debug(f"Realization {iens} is running")
event = StartedEvent(iens=iens)
event = StartedEvent(iens=iens, exec_hosts=self._jobs[job_id].exec_hosts)
elif isinstance(new_state, FinishedJobFailure):
logger.info(f"Realization {iens} (LSF-id: {self._iens2jobid[iens]}) failed")
exit_code = await self._get_exit_code(job_id)
event = FinishedEvent(iens=iens, returncode=exit_code)
event = FinishedEvent(
iens=iens,
returncode=exit_code,
exec_hosts=self._jobs[job_id].exec_hosts,
)
elif isinstance(new_state, FinishedJobSuccess):
logger.info(
f"Realization {iens} (LSF-id: {self._iens2jobid[iens]}) succeeded"
Expand Down
5 changes: 4 additions & 1 deletion src/ert/scheduler/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from ert.constant_filenames import CERT_FILE

from .driver import Driver
from .event import FinishedEvent
from .event import FinishedEvent, StartedEvent
from .job import Job, JobState

if TYPE_CHECKING:
Expand Down Expand Up @@ -308,6 +308,9 @@ async def _process_event_queue(self) -> None:
# Any event implies the job has at least started
job.started.set()

if isinstance(event, (StartedEvent, FinishedEvent)) and event.exec_hosts:
self._jobs[event.iens].exec_hosts = event.exec_hosts

if (
isinstance(event, FinishedEvent)
and not self._cancelled
Expand Down

0 comments on commit 757acbe

Please sign in to comment.