From 7762f6fd60c185998204ea71d4eba286f338f77a Mon Sep 17 00:00:00 2001 From: xjules Date: Mon, 11 Nov 2024 15:30:45 +0100 Subject: [PATCH] Implementing router-dealer pattern with custom acknowledgments with zmq - dealers always wait for acknowledgment from the evaluator - removing websockets, no more wait_for_evaluator - Settup encryption with curve - each dealer (client, dispatcher) will get a unique name - Make sure to check cancellation error when sending event from client - Monitor is an advanced version Client - Make sure to wait to ensemble start before initiating monitor - Use TCP protocol only when using LSF, SLURM or TORQUE queues - Remove certificate - Use ipc_protocol when using LOCAL driver --- docs/ert/conf.py | 1 - pyproject.toml | 2 +- src/_ert/forward_model_runner/cli.py | 9 +- src/_ert/forward_model_runner/client.py | 209 +++++++------ .../forward_model_runner/reporting/event.py | 111 ++++--- src/ert/cli/main.py | 5 +- src/ert/ensemble_evaluator/__init__.py | 2 - src/ert/ensemble_evaluator/_ensemble.py | 42 +-- .../ensemble_evaluator/_wait_for_evaluator.py | 63 ---- src/ert/ensemble_evaluator/config.py | 87 ++---- src/ert/ensemble_evaluator/evaluator.py | 281 +++++++++--------- .../evaluator_connection_info.py | 15 +- src/ert/ensemble_evaluator/monitor.py | 100 +------ src/ert/gui/simulation/run_dialog.py | 6 +- src/ert/logging/logger.conf | 4 +- src/ert/run_models/base_run_model.py | 9 +- src/ert/scheduler/scheduler.py | 15 - src/ert/shared/net_utils.py | 1 + tests/ert/conftest.py | 4 +- tests/ert/ui_tests/cli/test_cli.py | 26 +- .../ensemble_evaluator_utils.py | 28 -- .../test_ensemble_client.py | 82 ++--- .../test_ensemble_evaluator.py | 114 +++---- .../test_ensemble_evaluator_config.py | 37 ++- .../test_ensemble_legacy.py | 25 +- .../ensemble_evaluator/test_monitor.py | 181 +++++------ .../ensemble_evaluator/test_scheduler.py | 3 +- .../test_event_reporter.py | 271 +++++++---------- .../forward_model_runner/test_job_dispatch.py | 13 +- .../unit_tests/scheduler/test_scheduler.py | 8 +- .../unit_tests/shared/test_port_handler.py | 1 + tests/ert/unit_tests/test_tracking.py | 3 - tests/ert/utils.py | 104 ++++--- 33 files changed, 780 insertions(+), 1082 deletions(-) diff --git a/docs/ert/conf.py b/docs/ert/conf.py index 71a069bd046..74b0a485078 100644 --- a/docs/ert/conf.py +++ b/docs/ert/conf.py @@ -67,7 +67,6 @@ ("py:class", "pydantic.types.PositiveInt"), ("py:class", "LibresFacade"), ("py:class", "pandas.core.frame.DataFrame"), - ("py:class", "websockets.server.WebSocketServerProtocol"), ("py:class", "EnsembleReader"), ] nitpick_ignore_regex = [ diff --git a/pyproject.toml b/pyproject.toml index cfe295eabf8..02d56476d19 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,6 +58,7 @@ dependencies = [ "python-dateutil", "python-multipart", # extra dependency for fastapi "pyyaml", + "pyzmq", "qtpy", "requests", "resfo", @@ -68,7 +69,6 @@ dependencies = [ "tqdm>=4.62.0", "typing_extensions>=4.5", "uvicorn >= 0.17.0", - "websockets", "xarray", "xtgeo >= 3.3.0", ] diff --git a/src/_ert/forward_model_runner/cli.py b/src/_ert/forward_model_runner/cli.py index 6f99ef576c9..433e0442bc0 100644 --- a/src/_ert/forward_model_runner/cli.py +++ b/src/_ert/forward_model_runner/cli.py @@ -22,7 +22,6 @@ def _setup_reporters( ens_id, dispatch_url, ee_token=None, - ee_cert_path=None, experiment_id=None, ) -> list[reporting.Reporter]: reporters: list[reporting.Reporter] = [] @@ -30,11 +29,7 @@ def _setup_reporters( reporters.append(reporting.Interactive()) elif ens_id and experiment_id is None: reporters.append(reporting.File()) - reporters.append( - reporting.Event( - evaluator_url=dispatch_url, token=ee_token, cert_path=ee_cert_path - ) - ) + reporters.append(reporting.Event(evaluator_url=dispatch_url, token=ee_token)) else: reporters.append(reporting.File()) return reporters @@ -123,7 +118,6 @@ def main(args): experiment_id = jobs_data.get("experiment_id") ens_id = jobs_data.get("ens_id") ee_token = jobs_data.get("ee_token") - ee_cert_path = jobs_data.get("ee_cert_path") dispatch_url = jobs_data.get("dispatch_url") is_interactive_run = len(parsed_args.job) > 0 @@ -132,7 +126,6 @@ def main(args): ens_id, dispatch_url, ee_token, - ee_cert_path, experiment_id, ) diff --git a/src/_ert/forward_model_runner/client.py b/src/_ert/forward_model_runner/client.py index ea798522b86..0892a26eecb 100644 --- a/src/_ert/forward_model_runner/client.py +++ b/src/_ert/forward_model_runner/client.py @@ -1,16 +1,12 @@ +from __future__ import annotations + import asyncio import logging -import ssl -from typing import Any, AnyStr, Self - -from websockets.asyncio.client import ClientConnection, connect -from websockets.datastructures import Headers -from websockets.exceptions import ( - ConnectionClosedError, - ConnectionClosedOK, - InvalidHandshake, - InvalidURI, -) +import uuid +from typing import Any, Self + +import zmq +import zmq.asyncio from _ert.async_utils import new_event_loop @@ -25,108 +21,145 @@ class ClientConnectionClosedOK(Exception): pass +CONNECT_MSG = "CONNECT" +DISCONNECT_MSG = "DISCONNECT" +ACK_MSG = b"ACK" + + class Client: - DEFAULT_MAX_RETRIES = 10 - DEFAULT_TIMEOUT_MULTIPLIER = 5 - CONNECTION_TIMEOUT = 60 + DEFAULT_MAX_RETRIES = 5 + DEFAULT_ACK_TIMEOUT = 5 + _receiver_task: asyncio.Task[None] | None def __enter__(self) -> Self: + self.loop.run_until_complete(self.__aenter__()) return self + def term(self) -> None: + self.socket.close() + self.context.term() + def __exit__(self, exc_type: Any, exc_value: Any, exc_traceback: Any) -> None: - if self.websocket is not None: - self.loop.run_until_complete(self.websocket.close()) + self.loop.run_until_complete(self.__aexit__(exc_type, exc_value, exc_traceback)) self.loop.close() - async def __aenter__(self) -> "Client": + async def __aenter__(self) -> Self: + await self.connect() return self async def __aexit__( self, exc_type: Any, exc_value: Any, exc_traceback: Any ) -> None: - if self.websocket is not None: - await self.websocket.close() + try: + await self._send(DISCONNECT_MSG) + except ClientConnectionError: + logger.error("No ack for dealer disconnection. Connection is down!") + finally: + self.socket.disconnect(self.url) + await self._term_receiver_task() + self.term() + + async def _term_receiver_task(self) -> None: + if self._receiver_task and not self._receiver_task.done(): + self._receiver_task.cancel() + await asyncio.gather(self._receiver_task, return_exceptions=True) + self._receiver_task = None def __init__( self, url: str, token: str | None = None, - cert: str | bytes | None = None, - max_retries: int | None = None, - timeout_multiplier: int | None = None, + dealer_name: str | None = None, + ack_timeout: float | None = None, ) -> None: - if max_retries is None: - max_retries = self.DEFAULT_MAX_RETRIES - if timeout_multiplier is None: - timeout_multiplier = self.DEFAULT_TIMEOUT_MULTIPLIER - if url is None: - raise ValueError("url was None") + self._ack_timeout = ack_timeout or self.DEFAULT_ACK_TIMEOUT self.url = url self.token = token - self._additional_headers = Headers() + + # Set up ZeroMQ context and socke + self._ack_event: asyncio.Event = asyncio.Event() + self.context = zmq.asyncio.Context() + self.socket = self.context.socket(zmq.DEALER) + self.socket.setsockopt(zmq.LINGER, 0) + if dealer_name is None: + self.dealer_id = f"dispatch-{uuid.uuid4().hex[:8]}" + else: + self.dealer_id = dealer_name + self.socket.setsockopt_string(zmq.IDENTITY, self.dealer_id) + print(f"Created: {self.dealer_id=} {token=} {self._ack_timeout=}") if token is not None: - self._additional_headers["token"] = token - - # Mimics the behavior of the ssl argument when connection to - # websockets. If none is specified it will deduce based on the url, - # if True it will enforce TLS, and if you want to use self signed - # certificates you need to pass an ssl_context with the certificate - # loaded. - self._ssl_context: bool | ssl.SSLContext | None = None - if cert is not None: - self._ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) - self._ssl_context.load_verify_locations(cadata=cert) - elif url.startswith("wss"): - self._ssl_context = True - - self._max_retries = max_retries - self._timeout_multiplier = timeout_multiplier - self.websocket: ClientConnection | None = None + client_public, client_secret = zmq.curve_keypair() + self.socket.curve_secretkey = client_secret + self.socket.curve_publickey = client_public + self.socket.curve_serverkey = token.encode("utf-8") + self.loop = new_event_loop() + self._receiver_task = None + + async def connect(self) -> None: + self.socket.connect(self.url) + await self._term_receiver_task() + self._receiver_task = asyncio.create_task(self._receiver()) + try: + await self._send(CONNECT_MSG, retries=1) + except ClientConnectionError: + await self._term_receiver_task() + self.term() + raise + + def send(self, message: str, retries: int | None = None) -> None: + self.loop.run_until_complete(self._send(message, retries)) + + async def process_message(self, msg: str) -> None: + pass + + async def _receiver(self) -> None: + while True: + try: + _, raw_msg = await self.socket.recv_multipart() + if raw_msg == ACK_MSG: + self._ack_event.set() + else: + await self.process_message(raw_msg.decode("utf-8")) + except zmq.ZMQError as exc: + logger.debug( + f"{self.dealer_id} connection to evaluator went down, reconnecting: {exc}" + ) + await asyncio.sleep(1) + self.socket.connect(self.url) - async def get_websocket(self) -> ClientConnection: - return await connect( - self.url, - ssl=self._ssl_context, - additional_headers=self._additional_headers, - open_timeout=self.CONNECTION_TIMEOUT, - ping_timeout=self.CONNECTION_TIMEOUT, - ping_interval=self.CONNECTION_TIMEOUT, - close_timeout=self.CONNECTION_TIMEOUT, - ) + async def _send(self, message: str, retries: int | None = None) -> None: + self._ack_event.clear() - async def _send(self, msg: AnyStr) -> None: - for retry in range(self._max_retries + 1): + backoff = 1 + retries = retries or self.DEFAULT_MAX_RETRIES + while retries >= 0: try: - if self.websocket is None: - self.websocket = await self.get_websocket() - await self.websocket.send(msg) - return - except ConnectionClosedOK as exception: - error_msg = ( - f"Connection closed received from the server {self.url}! " - f" Exception from {type(exception)}: {exception!s}" - ) - raise ClientConnectionClosedOK(error_msg) from exception - except (TimeoutError, InvalidHandshake, InvalidURI, OSError) as exception: - if retry == self._max_retries: - error_msg = ( - f"Not able to establish the " - f"websocket connection {self.url}! Max retries reached!" - " Check for firewall issues." - f" Exception from {type(exception)}: {exception!s}" + await self.socket.send_multipart([b"", message.encode("utf-8")]) + try: + await asyncio.wait_for( + self._ack_event.wait(), timeout=self._ack_timeout ) - raise ClientConnectionError(error_msg) from exception - except ConnectionClosedError as exception: - if retry == self._max_retries: - error_msg = ( - f"Not been able to send the event" - f" to {self.url}! Max retries reached!" - f" Exception from {type(exception)}: {exception!s}" + return + except TimeoutError: + logger.warning( + f"{self.dealer_id} failed to get acknowledgment on the {message}. Resending." ) - raise ClientConnectionError(error_msg) from exception - await asyncio.sleep(0.2 + self._timeout_multiplier * retry) - self.websocket = None - - def send(self, msg: AnyStr) -> None: - self.loop.run_until_complete(self._send(msg)) + except zmq.ZMQError as exc: + logger.debug( + f"{self.dealer_id} connection to evaluator went down, reconnecting: {exc}" + ) + await asyncio.sleep(1) + self.socket.connect(self.url) + except asyncio.CancelledError: + self.term() + raise + + retries -= 1 + if retries > 0: + logger.info(f"Retrying... ({retries} attempts left)") + await asyncio.sleep(backoff) + backoff = min(backoff * 2, 10) # Exponential backoff + raise ClientConnectionError( + f"{self.dealer_id} Failed to send {message=} after {retries=}" + ) diff --git a/src/_ert/forward_model_runner/reporting/event.py b/src/_ert/forward_model_runner/reporting/event.py index 81cbb43e682..8ca50302131 100644 --- a/src/_ert/forward_model_runner/reporting/event.py +++ b/src/_ert/forward_model_runner/reporting/event.py @@ -1,9 +1,9 @@ from __future__ import annotations +import asyncio import logging import queue -import threading -from datetime import datetime, timedelta +import time from pathlib import Path from typing import Final @@ -16,11 +16,7 @@ ForwardModelStepSuccess, event_to_json, ) -from _ert.forward_model_runner.client import ( - Client, - ClientConnectionClosedOK, - ClientConnectionError, -) +from _ert.forward_model_runner.client import Client, ClientConnectionError from _ert.forward_model_runner.reporting.base import Reporter from _ert.forward_model_runner.reporting.message import ( _JOB_EXIT_FAILED_STRING, @@ -32,7 +28,7 @@ Start, ) from _ert.forward_model_runner.reporting.statemachine import StateMachine -from _ert.threading import ErtThread +from _ert.threading import ErtThread, threading logger = logging.getLogger(__name__) @@ -59,14 +55,16 @@ class Event(Reporter): _sentinel: Final = EventSentinel() - def __init__(self, evaluator_url, token=None, cert_path=None): + def __init__( + self, + evaluator_url, + token=None, + ack_timeout=None, + max_retries=None, + finished_event_timeout=None, + ): self._evaluator_url = evaluator_url self._token = token - if cert_path is not None: - with open(cert_path, encoding="utf-8") as f: - self._cert = f.read() - else: - self._cert = None self._statemachine = StateMachine() self._statemachine.add_handler((Init,), self._init_handler) @@ -78,53 +76,51 @@ def __init__(self, evaluator_url, token=None, cert_path=None): self._real_id = None self._event_queue: queue.Queue[events.Event | EventSentinel] = queue.Queue() self._event_publisher_thread = ErtThread(target=self._event_publisher) - self._timeout_timestamp = None - self._timestamp_lock = threading.Lock() - # seconds to timeout the reporter the thread after Finish() was received - self._reporter_timeout = 60 + self._done = threading.Event() + self._ack_timeout = ack_timeout + self._max_retries = max_retries + self._finished_event_timeout = finished_event_timeout or 60 - def stop(self) -> None: + def stop(self): self._event_queue.put(Event._sentinel) - with self._timestamp_lock: - self._timeout_timestamp = datetime.now() + timedelta( - seconds=self._reporter_timeout - ) + self._done.set() if self._event_publisher_thread.is_alive(): self._event_publisher_thread.join() def _event_publisher(self): - logger.debug("Publishing event.") - with Client( - url=self._evaluator_url, - token=self._token, - cert=self._cert, - ) as client: - event = None - while True: - with self._timestamp_lock: - if ( - self._timeout_timestamp is not None - and datetime.now() > self._timeout_timestamp - ): - self._timeout_timestamp = None - break - if event is None: - # if we successfully sent the event we can proceed - # to next one - event = self._event_queue.get() - if event is self._sentinel: - break - try: - client.send(event_to_json(event)) - event = None - except ClientConnectionError as exception: - # Possible intermittent failure, we retry sending the event - logger.error(str(exception)) - except ClientConnectionClosedOK as exception: - # The receiving end has closed the connection, we stop - # sending events - logger.debug(str(exception)) - break + async def publisher(): + async with Client( + url=self._evaluator_url, + token=self._token, + ack_timeout=self._ack_timeout, + ) as client: + event = None + start_time = None + while True: + try: + if self._done.is_set() and start_time is None: + start_time = time.time() + if event is None: + event = self._event_queue.get() + if event is self._sentinel: + break + if ( + start_time + and (time.time() - start_time) + > self._finished_event_timeout + ): + break + await client._send(event_to_json(event), self._max_retries) + event = None + except asyncio.CancelledError: + return + except ClientConnectionError as exc: + logger.error(f"Failed to send event: {exc}") + + try: + asyncio.run(publisher()) + except ClientConnectionError as exc: + raise ClientConnectionError("Couldn't connect to evaluator") from exc def report(self, msg): self._statemachine.transition(msg) @@ -187,7 +183,10 @@ def _job_handler(self, msg: Start | Running | Exited): self._dump_event(event) def _finished_handler(self, _): - self.stop() + self._event_queue.put(Event._sentinel) + self._done.set() + if self._event_publisher_thread.is_alive(): + self._event_publisher_thread.join() def _checksum_handler(self, msg: Checksum): fm_checksum = ForwardModelStepChecksum( diff --git a/src/ert/cli/main.py b/src/ert/cli/main.py index 10f325de1a3..d769d918b3f 100644 --- a/src/ert/cli/main.py +++ b/src/ert/cli/main.py @@ -104,7 +104,10 @@ def run_cli(args: Namespace, plugin_manager: ErtPluginManager | None = None) -> # most unix flavors https://en.wikipedia.org/wiki/Ephemeral_port args.port_range = range(49152, 51819) - evaluator_server_config = EvaluatorServerConfig(custom_port_range=args.port_range) + use_ipc_protocol = model.queue_system == QueueSystem.LOCAL + evaluator_server_config = EvaluatorServerConfig( + custom_port_range=args.port_range, use_ipc_protocol=use_ipc_protocol + ) if model.check_if_runpath_exists(): print( diff --git a/src/ert/ensemble_evaluator/__init__.py b/src/ert/ensemble_evaluator/__init__.py index bcef41a53d8..642199a0926 100644 --- a/src/ert/ensemble_evaluator/__init__.py +++ b/src/ert/ensemble_evaluator/__init__.py @@ -1,6 +1,5 @@ from ._ensemble import LegacyEnsemble as Ensemble from ._ensemble import Realization -from ._wait_for_evaluator import wait_for_evaluator from .config import EvaluatorServerConfig from .evaluator import EnsembleEvaluator from .event import EndEvent, FullSnapshotEvent, SnapshotUpdateEvent @@ -19,5 +18,4 @@ "Realization", "RealizationSnapshot", "SnapshotUpdateEvent", - "wait_for_evaluator", ] diff --git a/src/ert/ensemble_evaluator/_ensemble.py b/src/ert/ensemble_evaluator/_ensemble.py index da9af3aec73..10087770623 100644 --- a/src/ert/ensemble_evaluator/_ensemble.py +++ b/src/ert/ensemble_evaluator/_ensemble.py @@ -6,10 +6,7 @@ from collections.abc import Awaitable, Callable, Sequence from dataclasses import dataclass from functools import partialmethod -from typing import ( - Any, - Protocol, -) +from typing import Any, Protocol from _ert.events import ( Event, @@ -25,13 +22,8 @@ from ert.run_arg import RunArg from ert.scheduler import Scheduler, create_driver -from ._wait_for_evaluator import wait_for_evaluator from .config import EvaluatorServerConfig -from .snapshot import ( - EnsembleSnapshot, - FMStepSnapshot, - RealizationSnapshot, -) +from .snapshot import EnsembleSnapshot, FMStepSnapshot, RealizationSnapshot from .state import ( ENSEMBLE_STATE_CANCELLED, ENSEMBLE_STATE_FAILED, @@ -116,6 +108,7 @@ def __post_init__(self) -> None: self._config: EvaluatorServerConfig | None = None self.snapshot: EnsembleSnapshot = self._create_snapshot() self.status = self.snapshot.status + self._client: Client | None = None if self.snapshot.status: self._status_tracker = _EnsembleStateTracker(self.snapshot.status) else: @@ -198,11 +191,10 @@ async def send_event( url: str, event: Event, token: str | None = None, - cert: str | bytes | None = None, retries: int = 10, ) -> None: - async with Client(url, token, cert, max_retries=retries) as client: - await client._send(event_to_json(event)) + async with Client(url, token) as client: + await client._send(event_to_json(event), retries) def generate_event_creator(self) -> Callable[[Id.ENSEMBLE_TYPES], Event]: def event_builder(status: str) -> Event: @@ -227,21 +219,18 @@ async def evaluate( ce_unary_send_method_name, partialmethod( self.__class__.send_event, - self._config.dispatch_uri, + self._config.get_connection_info().router_uri, token=self._config.token, - cert=self._config.cert, ), ) - await wait_for_evaluator( - base_url=self._config.url, - token=self._config.token, - cert=self._config.cert, - ) - await self._evaluate_inner( - event_unary_send=getattr(self, ce_unary_send_method_name), - scheduler_queue=scheduler_queue, - manifest_queue=manifest_queue, - ) + try: + await self._evaluate_inner( + event_unary_send=getattr(self, ce_unary_send_method_name), + scheduler_queue=scheduler_queue, + manifest_queue=manifest_queue, + ) + except asyncio.CancelledError: + print("Cancelling evaluator task!") async def _evaluate_inner( # pylint: disable=too-many-branches self, @@ -279,8 +268,7 @@ async def _evaluate_inner( # pylint: disable=too-many-branches max_running=self._queue_config.max_running, submit_sleep=self._queue_config.submit_sleep, ens_id=self.id_, - ee_uri=self._config.dispatch_uri, - ee_cert=self._config.cert, + ee_uri=self._config.get_connection_info().router_uri, ee_token=self._config.token, ) logger.info( diff --git a/src/ert/ensemble_evaluator/_wait_for_evaluator.py b/src/ert/ensemble_evaluator/_wait_for_evaluator.py index f97fb758a6b..3073fa7bdf5 100644 --- a/src/ert/ensemble_evaluator/_wait_for_evaluator.py +++ b/src/ert/ensemble_evaluator/_wait_for_evaluator.py @@ -1,9 +1,5 @@ -import asyncio import logging import ssl -import time - -import aiohttp logger = logging.getLogger(__name__) @@ -16,62 +12,3 @@ def get_ssl_context(cert: str | bytes | None) -> ssl.SSLContext | bool: ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) ssl_context.load_verify_locations(cadata=cert) return ssl_context - - -async def attempt_connection( - url: str, - token: str | None = None, - cert: str | bytes | None = None, - connection_timeout: float = 2, -) -> None: - timeout = aiohttp.ClientTimeout(connect=connection_timeout) - headers = {} if token is None else {"token": token} - async with ( - aiohttp.ClientSession() as session, - session.request( - method="get", - url=url, - ssl=get_ssl_context(cert), - headers=headers, - timeout=timeout, - ) as resp, - ): - resp.raise_for_status() - - -async def wait_for_evaluator( - base_url: str, - token: str | None = None, - cert: str | bytes | None = None, - healthcheck_endpoint: str = "/healthcheck", - timeout: float | None = None, # noqa: ASYNC109 - connection_timeout: float = 2, -) -> None: - if timeout is None: - timeout = WAIT_FOR_EVALUATOR_TIMEOUT - healthcheck_url = base_url + healthcheck_endpoint - start = time.time() - sleep_time = 0.2 - sleep_time_max = 5.0 - while time.time() - start < timeout: - try: - await attempt_connection( - url=healthcheck_url, - token=token, - cert=cert, - connection_timeout=connection_timeout, - ) - return - except aiohttp.ClientError: - sleep_time = min(sleep_time_max, sleep_time * 2) - remaining_time = max(0, timeout - (time.time() - start) + 0.1) - await asyncio.sleep(min(sleep_time, remaining_time)) - - # We have timed out, but we make one last attempt to ensure that - # we have tried to connect at both ends of the time window - await attempt_connection( - url=healthcheck_url, - token=token, - cert=cert, - connection_timeout=connection_timeout, - ) diff --git a/src/ert/ensemble_evaluator/config.py b/src/ert/ensemble_evaluator/config.py index 51b059a6ce1..1d7a2c2f4f0 100644 --- a/src/ert/ensemble_evaluator/config.py +++ b/src/ert/ensemble_evaluator/config.py @@ -1,14 +1,13 @@ import ipaddress import logging import os -import pathlib import socket -import ssl -import tempfile +import uuid import warnings from base64 import b64encode from datetime import UTC, datetime, timedelta +import zmq from cryptography import x509 from cryptography.hazmat.backends import default_backend from cryptography.hazmat.primitives import hashes, serialization @@ -95,54 +94,32 @@ def _generate_certificate( class EvaluatorServerConfig: - """ - This class is responsible for identifying a host:port-combo and then provide - low-level sockets bound to said combo. The problem is that these sockets may - be closed by underlying code, while the EvaluatorServerConfig-instance is - still alive and expected to provide a bound low-level socket. Thus we risk - that the host:port is hijacked by another process in the meantime. - - To prevent this, we keep a handle to the bound socket and every time - a socket is requested we return a duplicate of this. The duplicate will be - bound similarly to the handle, but when closed the handle stays open and - holds the port. - - In particular, the websocket-server closes the websocket when exiting a - context: - - https://github.com/aaugustin/websockets/blob/c439f1d52aafc05064cc11702d1c3014046799b0/src/websockets/legacy/server.py#L890 - - and digging into the cpython-implementation of asyncio, we see that causes - the asyncio code to also close the underlying socket: - - https://github.com/python/cpython/blob/b34dd58fee707b8044beaf878962a6fa12b304dc/Lib/asyncio/selector_events.py#L607-L611 - - """ - def __init__( self, custom_port_range: range | None = None, use_token: bool = True, - generate_cert: bool = True, custom_host: str | None = None, + use_ipc_protocol: bool = True, ) -> None: - self._socket_handle = find_available_socket( - custom_range=custom_port_range, custom_host=custom_host - ) - host, port = self._socket_handle.getsockname() - self.protocol = "wss" if generate_cert else "ws" - self.url = f"{self.protocol}://{host}:{port}" - self.client_uri = f"{self.url}/client" - self.dispatch_uri = f"{self.url}/dispatch" - if generate_cert: - cert, key, pw = _generate_certificate(host) - else: - cert, key, pw = None, None, None - self.cert = cert - self._key: bytes | None = key - self._key_pw = pw - - self.token = _generate_authentication() if use_token else None + self.host: str | None = None + self.router_port: int | None = None + self.url = f"ipc:///tmp/socket-{uuid.uuid4().hex[:8]}" + self.token: str | None = None + + self.server_public_key: bytes | None = None + self.server_secret_key: bytes | None = None + if not use_ipc_protocol: + self._socket_handle = find_available_socket( + custom_range=custom_port_range, + custom_host=custom_host, + will_close_then_reopen_socket=True, + ) + self.host, self.router_port = self._socket_handle.getsockname() + self.url = f"tcp://{self.host}:{self.router_port}" + + if use_token: + self.server_public_key, self.server_secret_key = zmq.curve_keypair() + self.token = self.server_public_key.decode("utf-8") def get_socket(self) -> socket.socket: return self._socket_handle.dup() @@ -150,25 +127,5 @@ def get_socket(self) -> socket.socket: def get_connection_info(self) -> EvaluatorConnectionInfo: return EvaluatorConnectionInfo( self.url, - self.cert, self.token, ) - - def get_server_ssl_context( - self, protocol: int = ssl.PROTOCOL_TLS_SERVER - ) -> ssl.SSLContext | None: - if self.cert is None: - return None - with tempfile.TemporaryDirectory() as tmp_dir: - tmp_path = pathlib.Path(tmp_dir) - cert_path = tmp_path / "ee.crt" - with open(cert_path, "w", encoding="utf-8") as filehandle_1: - filehandle_1.write(self.cert) - - key_path = tmp_path / "ee.key" - if self._key is not None: - with open(key_path, "wb") as filehandle_2: - filehandle_2.write(self._key) - context = ssl.SSLContext(protocol=protocol) - context.load_cert_chain(cert_path, key_path, self._key_pw) - return context diff --git a/src/ert/ensemble_evaluator/evaluator.py b/src/ert/ensemble_evaluator/evaluator.py index 2e43ad030df..30915aa6c9f 100644 --- a/src/ert/ensemble_evaluator/evaluator.py +++ b/src/ert/ensemble_evaluator/evaluator.py @@ -1,26 +1,13 @@ +from __future__ import annotations + import asyncio import datetime import logging import traceback -from collections.abc import ( - AsyncIterator, - Awaitable, - Callable, - Generator, - Iterable, - Sequence, -) -from contextlib import asynccontextmanager, contextmanager -from http import HTTPStatus -from typing import ( - Any, - get_args, -) +from collections.abc import Awaitable, Callable, Iterable, Sequence +from typing import Any, get_args -from pydantic_core._pydantic_core import ValidationError -from websockets.asyncio.server import ServerConnection, serve -from websockets.exceptions import ConnectionClosedError -from websockets.http11 import Request, Response +import zmq.asyncio from _ert.events import ( EESnapshot, @@ -40,6 +27,7 @@ event_from_json, event_to_json, ) +from _ert.forward_model_runner.client import ACK_MSG, CONNECT_MSG, DISCONNECT_MSG from ert.ensemble_evaluator import identifiers as ids from ._ensemble import FMStepSnapshot @@ -64,15 +52,11 @@ def __init__(self, ensemble: Ensemble, config: EvaluatorServerConfig): self._loop: asyncio.AbstractEventLoop | None = None - self._clients: set[ServerConnection] = set() - self._dispatchers_connected: asyncio.Queue[None] = asyncio.Queue() - self._events: asyncio.Queue[Event] = asyncio.Queue() self._events_to_send: asyncio.Queue[Event] = asyncio.Queue() self._manifest_queue: asyncio.Queue[Any] = asyncio.Queue() self._ee_tasks: list[asyncio.Task[None]] = [] - self._server_started: asyncio.Event = asyncio.Event() self._server_done: asyncio.Event = asyncio.Event() # batching section @@ -82,14 +66,20 @@ def __init__(self, ensemble: Ensemble, config: EvaluatorServerConfig): self._max_batch_size: int = 500 self._batching_interval: float = 2.0 self._complete_batch: asyncio.Event = asyncio.Event() + self._server_started: asyncio.Event = asyncio.Event() + self._clients_connected: set[bytes] = set() + self._clients_empty: asyncio.Event = asyncio.Event() + self._dispatchers_connected: set[bytes] = set() + self._dispatchers_empty: asyncio.Event = asyncio.Event() async def _publisher(self) -> None: + await self._server_started.wait() while True: event = await self._events_to_send.get() - await asyncio.gather( - *[client.send(event_to_json(event)) for client in self._clients], - return_exceptions=True, - ) + for identity in self._clients_connected: + await self._router_socket.send_multipart( + [identity, b"", event_to_json(event).encode("utf-8")] + ) self._events_to_send.task_done() async def _append_message(self, snapshot_update_event: EnsembleSnapshot) -> None: @@ -204,140 +194,136 @@ async def _failed_handler(self, events: Sequence[EnsembleFailed]) -> None: def ensemble(self) -> Ensemble: return self._ensemble - @contextmanager - def store_client(self, websocket: ServerConnection) -> Generator[None, None, None]: - self._clients.add(websocket) - yield - self._clients.remove(websocket) - - async def handle_client(self, websocket: ServerConnection) -> None: - with self.store_client(websocket): + async def handle_client(self, dealer: bytes, frame: bytes) -> None: + raw_msg = frame.decode("utf-8") + if raw_msg == CONNECT_MSG: + self._clients_connected.add(dealer) + self._clients_empty.clear() current_snapshot_dict = self._ensemble.snapshot.to_dict() event: Event = EESnapshot( - snapshot=current_snapshot_dict, ensemble=self.ensemble.id_ + snapshot=current_snapshot_dict, + ensemble=self.ensemble.id_, ) - await websocket.send(event_to_json(event)) - - async for raw_msg in websocket: - event = event_from_json(raw_msg) - logger.debug(f"got message from client: {event}") - if type(event) is EEUserCancel: - logger.debug(f"Client {websocket.remote_address} asked to cancel.") - self._signal_cancel() - - elif type(event) is EEUserDone: - logger.debug(f"Client {websocket.remote_address} signalled done.") - self.stop() - - @asynccontextmanager - async def count_dispatcher(self) -> AsyncIterator[None]: - await self._dispatchers_connected.put(None) - yield - await self._dispatchers_connected.get() - self._dispatchers_connected.task_done() - - async def handle_dispatch(self, websocket: ServerConnection) -> None: - async with self.count_dispatcher(): - try: - async for raw_msg in websocket: - try: - event = dispatch_event_from_json(raw_msg) - if event.ensemble != self.ensemble.id_: - logger.info( - "Got event from evaluator " - f"{event.ensemble}. " - f"Ignoring since I am {self.ensemble.id_}" - ) - continue - if type(event) is ForwardModelStepChecksum: - await self.forward_checksum(event) - else: - await self._events.put(event) - except ValidationError as ex: - logger.warning( - "cannot handle event - " - f"closing connection to dispatcher: {ex}" - ) - await websocket.close( - code=1011, reason=f"failed handling message {raw_msg!r}" - ) - return - - if type(event) in [EnsembleSucceeded, EnsembleFailed]: - return - except ConnectionClosedError as connection_error: - # Dispatchers may close the connection abruptly in the case of - # * flaky network (then the dispatcher will try to reconnect) - # * job being killed due to MAX_RUNTIME - # * job being killed by user - logger.error( - f"a dispatcher abruptly closed a websocket: {connection_error!s}" + await self._router_socket.send_multipart( + [dealer, b"", event_to_json(event).encode("utf-8")] + ) + elif raw_msg == DISCONNECT_MSG: + self._clients_connected.discard(dealer) + if not self._clients_connected: + self._clients_empty.set() + else: + event = event_from_json(raw_msg) + if type(event) is EEUserCancel: + logger.debug("Client asked to cancel.") + self._signal_cancel() + elif type(event) is EEUserDone: + logger.debug("Client signalled done.") + self.stop() + + async def handle_dispatch(self, dealer: bytes, frame: bytes) -> None: + raw_msg = frame.decode("utf-8") + if raw_msg == CONNECT_MSG: + self._dispatchers_connected.add(dealer) + self._dispatchers_empty.clear() + elif raw_msg == DISCONNECT_MSG: + self._dispatchers_connected.discard(dealer) + if not self._dispatchers_connected: + self._dispatchers_empty.set() + else: + event = dispatch_event_from_json(raw_msg) + if event.ensemble != self.ensemble.id_: + logger.info( + "Got event from evaluator " + f"{event.ensemble}. " + f"Ignoring since I am {self.ensemble.id_}" ) + return + if type(event) is ForwardModelStepChecksum: + await self.forward_checksum(event) + else: + await self._events.put(event) + + async def listen_for_messages(self) -> None: + await self._server_started.wait() + while True: + try: + dealer, _, frame = await self._router_socket.recv_multipart() + await self._router_socket.send_multipart([dealer, b"", ACK_MSG]) + sender = dealer.decode("utf-8") + if sender.startswith("client"): + await self.handle_client(dealer, frame) + elif sender.startswith("dispatch"): + await self.handle_dispatch(dealer, frame) + else: + logger.info(f"Connection attempt to unknown sender: {sender}.") + except zmq.error.ZMQError as e: + if e.errno == zmq.ENOTSOCK: + logger.warning( + "Evaluator receiver closed, no new messages are received" + ) + else: + logger.error(f"Unexpected error when listening to messages: {e}") + except asyncio.CancelledError: + self._router_socket.close() + return async def forward_checksum(self, event: Event) -> None: # clients still need to receive events via ws await self._events_to_send.put(event) await self._manifest_queue.put(event) - async def connection_handler(self, websocket: ServerConnection) -> None: - if websocket.request is not None: - path = websocket.request.path - elements = path.split("/") - if elements[1] == "client": - await self.handle_client(websocket) - elif elements[1] == "dispatch": - await self.handle_dispatch(websocket) - else: - logger.info(f"Connection attempt to unknown path: {path}.") - else: - logger.info("No request to handle.") - - async def process_request( - self, connection: ServerConnection, request: Request - ) -> Response | None: - if request.headers.get("token") != self._config.token: - return connection.respond(HTTPStatus.UNAUTHORIZED, "") - if request.path == "/healthcheck": - return connection.respond(HTTPStatus.OK, "") - return None - async def _server(self) -> None: - async with serve( - self.connection_handler, - sock=self._config.get_socket(), - ssl=self._config.get_server_ssl_context(), - process_request=self.process_request, - max_size=2**26, - ping_timeout=60, - ping_interval=60, - close_timeout=60, - ) as server: + zmq_context = zmq.asyncio.Context() + try: + print("INIT ZMQ ...") + # Create and configure the ROUTER socket + self._router_socket: zmq.asyncio.Socket = zmq_context.socket(zmq.ROUTER) + self._router_socket.setsockopt(zmq.LINGER, 0) + if self._config.server_public_key and self._config.server_secret_key: + self._router_socket.curve_secretkey = self._config.server_secret_key + self._router_socket.curve_publickey = self._config.server_public_key + self._router_socket.curve_server = True + + # Attempt to bind the ROUTER socket + # self._router_socket.bind(f"tcp://*:{self._config.router_port}") + if self._config.router_port: + self._router_socket.bind(f"tcp://*:{self._config.router_port}") + else: + self._router_socket.bind(self._config.url) self._server_started.set() + print(f"ROUTER listens on {self._config.url}") + except zmq.error.ZMQError as e: + logger.error(f"ZMQ error encountered {e} during evaluator initialization") + print(f"ZMQ error encountered {e} during evaluator initialization") + raise + try: await self._server_done.wait() - server.close(close_connections=False) - if self._dispatchers_connected is not None: - logger.debug( - f"Got done signal. {self._dispatchers_connected.qsize()} " - "dispatchers to disconnect..." + try: + await asyncio.wait_for(self._dispatchers_empty.wait(), timeout=5) + except TimeoutError: + logger.warning( + "Not all dispatchers were disconnected when closing zmq server!" ) - try: # Wait for dispatchers to disconnect - await asyncio.wait_for( - self._dispatchers_connected.join(), timeout=20 - ) - except TimeoutError: - logger.debug("Timed out waiting for dispatchers to disconnect") - else: - logger.debug("Got done signal. No dispatchers connected") - - logger.debug("Sending termination-message to clients...") - await self._events.join() await self._complete_batch.wait() await self._batch_processing_queue.join() event = EETerminated(ensemble=self._ensemble.id_) await self._events_to_send.put(event) await self._events_to_send.join() - logger.debug("Async server exiting.") + try: + await asyncio.wait_for(self._clients_empty.wait(), timeout=5) + except TimeoutError: + logger.warning( + "Not all clients were disconnected when closing zmq server!" + ) + logger.debug("Async server exiting.") + finally: + try: + self._router_socket.close() + zmq_context.destroy() + except Exception as exc: + logger.warning(f"Failed to clean up zmq context {exc}") + logger.info("ZMQ cleanup done!") def stop(self) -> None: self._server_done.set() @@ -370,10 +356,10 @@ async def _start_running(self) -> None: ), asyncio.create_task(self._process_event_buffer(), name="processing_task"), asyncio.create_task(self._publisher(), name="publisher_task"), + asyncio.create_task(self.listen_for_messages(), name="listener_task"), ] - # now we wait for the server to actually start - await self._server_started.wait() + await self._server_started.wait() self._ee_tasks.append( asyncio.create_task( self._ensemble.evaluate( @@ -405,9 +391,11 @@ async def _monitor_and_handle_tasks(self) -> None: raise task_exception elif task.get_name() == "server_task": return - elif task.get_name() == "ensemble_task": + elif task.get_name() == "ensemble_task" or task.get_name() in [ + "ensemble_task", + "listener_task", + ]: timeout = self.CLOSE_SERVER_TIMEOUT - continue else: msg = ( f"Something went wrong, {task.get_name()} is done prematurely!" @@ -433,6 +421,9 @@ async def run_and_get_successful_realizations(self) -> list[int]: try: await self._monitor_and_handle_tasks() finally: + self._server_done.set() + self._clients_empty.set() + self._dispatchers_empty.set() for task in self._ee_tasks: if not task.done(): task.cancel() @@ -442,7 +433,7 @@ async def run_and_get_successful_realizations(self) -> list[int]: result, Exception ): logger.error(str(result)) - raise result + raise RuntimeError(result) from result logger.debug("Evaluator is done") return self._ensemble.get_successful_realizations() diff --git a/src/ert/ensemble_evaluator/evaluator_connection_info.py b/src/ert/ensemble_evaluator/evaluator_connection_info.py index e01326c5c99..ac8ec35ef0c 100644 --- a/src/ert/ensemble_evaluator/evaluator_connection_info.py +++ b/src/ert/ensemble_evaluator/evaluator_connection_info.py @@ -5,18 +5,5 @@ class EvaluatorConnectionInfo: """Read only server-info""" - url: str - cert: str | bytes | None = None + router_uri: str token: str | None = None - - @property - def dispatch_uri(self) -> str: - return f"{self.url}/dispatch" - - @property - def client_uri(self) -> str: - return f"{self.url}/client" - - @property - def result_uri(self) -> str: - return f"{self.url}/result" diff --git a/src/ert/ensemble_evaluator/monitor.py b/src/ert/ensemble_evaluator/monitor.py index d3f549377c6..90e8fc274d8 100644 --- a/src/ert/ensemble_evaluator/monitor.py +++ b/src/ert/ensemble_evaluator/monitor.py @@ -1,13 +1,10 @@ +from __future__ import annotations + import asyncio import logging -import ssl import uuid from collections.abc import AsyncGenerator -from typing import TYPE_CHECKING, Any, Final - -from aiohttp import ClientError -from websockets import ConnectionClosed, Headers -from websockets.asyncio.client import ClientConnection, connect +from typing import TYPE_CHECKING, Final from _ert.events import ( EETerminated, @@ -17,7 +14,7 @@ event_from_json, event_to_json, ) -from ert.ensemble_evaluator._wait_for_evaluator import wait_for_evaluator +from _ert.forward_model_runner.client import Client if TYPE_CHECKING: from ert.ensemble_evaluator.evaluator_connection_info import EvaluatorConnectionInfo @@ -30,60 +27,37 @@ class EventSentinel: pass -class Monitor: +class Monitor(Client): _sentinel: Final = EventSentinel() - def __init__(self, ee_con_info: "EvaluatorConnectionInfo") -> None: - self._ee_con_info = ee_con_info + def __init__(self, ee_con_info: EvaluatorConnectionInfo) -> None: self._id = str(uuid.uuid1()).split("-", maxsplit=1)[0] self._event_queue: asyncio.Queue[Event | EventSentinel] = asyncio.Queue() - self._connection: ClientConnection | None = None - self._receiver_task: asyncio.Task[None] | None = None - self._connected: asyncio.Future[None] = asyncio.Future() - self._connection_timeout: float = 120.0 self._receiver_timeout: float = 60.0 + super().__init__( + ee_con_info.router_uri, + ee_con_info.token, + dealer_name=f"client-{self._id}", + ) - async def __aenter__(self) -> "Monitor": - self._receiver_task = asyncio.create_task(self._receiver()) - try: - await asyncio.wait_for(self._connected, timeout=self._connection_timeout) - except TimeoutError as exc: - msg = "Couldn't establish connection with the ensemble evaluator!" - logger.error(msg) - self._receiver_task.cancel() - raise RuntimeError(msg) from exc - return self - - async def __aexit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None: - if self._receiver_task: - if not self._receiver_task.done(): - self._receiver_task.cancel() - # we are done and not interested in errors when cancelling - await asyncio.gather( - self._receiver_task, - return_exceptions=True, - ) - if self._connection: - await self._connection.close() + async def process_message(self, msg: str) -> None: + event = event_from_json(msg) + await self._event_queue.put(event) async def signal_cancel(self) -> None: - if not self._connection: - return await self._event_queue.put(Monitor._sentinel) logger.debug(f"monitor-{self._id} asking server to cancel...") cancel_event = EEUserCancel(monitor=self._id) - await self._connection.send(event_to_json(cancel_event)) + await self._send(event_to_json(cancel_event)) logger.debug(f"monitor-{self._id} asked server to cancel") async def signal_done(self) -> None: - if not self._connection: - return await self._event_queue.put(Monitor._sentinel) logger.debug(f"monitor-{self._id} informing server monitor is done...") done_event = EEUserDone(monitor=self._id) - await self._connection.send(event_to_json(done_event)) + await self._send(event_to_json(done_event)) logger.debug(f"monitor-{self._id} informed server monitor is done") async def track( @@ -116,45 +90,3 @@ async def track( break if event is not None: self._event_queue.task_done() - - async def _receiver(self) -> None: - tls: ssl.SSLContext | None = None - if self._ee_con_info.cert: - tls = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) - tls.load_verify_locations(cadata=self._ee_con_info.cert) - headers = Headers() - if self._ee_con_info.token: - headers["token"] = self._ee_con_info.token - try: - await wait_for_evaluator( - base_url=self._ee_con_info.url, - token=self._ee_con_info.token, - cert=self._ee_con_info.cert, - timeout=5, - ) - except Exception as e: - self._connected.set_exception(e) - return - async for conn in connect( - self._ee_con_info.client_uri, - ssl=tls, - additional_headers=headers, - max_size=2**26, - max_queue=500, - open_timeout=5, - ping_timeout=60, - ping_interval=60, - close_timeout=60, - ): - try: - self._connection = conn - self._connected.set_result(None) - async for raw_msg in self._connection: - event = event_from_json(raw_msg) - await self._event_queue.put(event) - except (ConnectionRefusedError, ConnectionClosed, ClientError) as exc: - self._connection = None - self._connected = asyncio.Future() - logger.debug( - f"Monitor connection to EnsembleEvaluator went down, reconnecting: {exc}" - ) diff --git a/src/ert/gui/simulation/run_dialog.py b/src/ert/gui/simulation/run_dialog.py index 82e76aa0295..508190bae90 100644 --- a/src/ert/gui/simulation/run_dialog.py +++ b/src/ert/gui/simulation/run_dialog.py @@ -346,9 +346,13 @@ def run_experiment(self, restart: bool = False) -> None: self._tab_widget.clear() port_range = None + use_ipc_protocol = False if self._run_model.queue_system == QueueSystem.LOCAL: port_range = range(49152, 51819) - evaluator_server_config = EvaluatorServerConfig(custom_port_range=port_range) + use_ipc_protocol = True + evaluator_server_config = EvaluatorServerConfig( + custom_port_range=port_range, use_ipc_protocol=use_ipc_protocol + ) def run() -> None: self._run_model.start_simulations_thread( diff --git a/src/ert/logging/logger.conf b/src/ert/logging/logger.conf index 012c8366cff..d9959f765f7 100644 --- a/src/ert/logging/logger.conf +++ b/src/ert/logging/logger.conf @@ -33,8 +33,8 @@ loggers: level: INFO subscript: level: INFO - websockets: - level: WARNING + zmq: + level: INFO root: diff --git a/src/ert/run_models/base_run_model.py b/src/ert/run_models/base_run_model.py index a426c5be7a1..dd7c3700df6 100644 --- a/src/ert/run_models/base_run_model.py +++ b/src/ert/run_models/base_run_model.py @@ -18,12 +18,7 @@ import numpy as np -from _ert.events import ( - EESnapshot, - EESnapshotUpdate, - EETerminated, - Event, -) +from _ert.events import EESnapshot, EESnapshotUpdate, EETerminated, Event from ert.analysis import ( AnalysisEvent, AnalysisStatusEvent, @@ -516,7 +511,6 @@ async def run_monitor( event, iteration, ) - if event.snapshot.get(STATUS) in [ ENSEMBLE_STATE_STOPPED, ENSEMBLE_STATE_FAILED, @@ -569,6 +563,7 @@ async def run_ensemble_evaluator_async( evaluator_task = asyncio.create_task( evaluator.run_and_get_successful_realizations() ) + await evaluator._server_started.wait() if not (await self.run_monitor(ee_config, ensemble.iteration)): return [] diff --git a/src/ert/scheduler/scheduler.py b/src/ert/scheduler/scheduler.py index a1610930b26..6495ec28052 100644 --- a/src/ert/scheduler/scheduler.py +++ b/src/ert/scheduler/scheduler.py @@ -9,7 +9,6 @@ from collections.abc import Iterable, MutableMapping, Sequence from contextlib import suppress from dataclasses import asdict -from pathlib import Path from typing import TYPE_CHECKING, Any import orjson @@ -17,7 +16,6 @@ from _ert.async_utils import get_running_loop from _ert.events import Event, ForwardModelStepChecksum, Id, event_from_dict -from ert.constant_filenames import CERT_FILE from .driver import Driver from .event import FinishedEvent, StartedEvent @@ -35,7 +33,6 @@ class _JobsJson: real_id: int dispatch_url: str | None ee_token: str | None - ee_cert_path: str | None experiment_id: str | None @@ -69,7 +66,6 @@ def __init__( submit_sleep: float = 0.0, ens_id: str | None = None, ee_uri: str | None = None, - ee_cert: str | None = None, ee_token: str | None = None, ) -> None: self.driver = driver @@ -103,7 +99,6 @@ def __init__( self._max_running = max_running self._ee_uri = ee_uri self._ens_id = ens_id - self._ee_cert = ee_cert self._ee_token = ee_token self.checksum: dict[str, dict[str, Any]] = {} @@ -330,22 +325,12 @@ async def _process_event_queue(self) -> None: job.returncode.set_result(event.returncode) def _update_jobs_json(self, iens: int, runpath: str) -> None: - cert_path = f"{runpath}/{CERT_FILE}" - try: - if self._ee_cert is not None: - Path(cert_path).write_text(self._ee_cert, encoding="utf-8") - except OSError as err: - error_msg = f"Could not write ensemble certificate: {err}" - self._jobs[iens].unschedule(error_msg) - logger.error(error_msg) - return jobs = _JobsJson( experiment_id=None, ens_id=self._ens_id, real_id=iens, dispatch_url=self._ee_uri, ee_token=self._ee_token, - ee_cert_path=cert_path if self._ee_cert is not None else None, ) jobs_path = os.path.join(runpath, "jobs.json") try: diff --git a/src/ert/shared/net_utils.py b/src/ert/shared/net_utils.py index fbcb80cebf7..2c377c2aca5 100644 --- a/src/ert/shared/net_utils.py +++ b/src/ert/shared/net_utils.py @@ -110,6 +110,7 @@ def _bind_socket( if will_close_then_reopen_socket: sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + # sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1) else: sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 0) diff --git a/tests/ert/conftest.py b/tests/ert/conftest.py index 8d8d5f528b7..131e32c2925 100644 --- a/tests/ert/conftest.py +++ b/tests/ert/conftest.py @@ -475,8 +475,8 @@ class MockESConfig(EvaluatorServerConfig): def __init__(self, *args, **kwargs): if "use_token" not in kwargs: kwargs["use_token"] = False - if "generate_cert" not in kwargs: - kwargs["generate_cert"] = False + if sys.platform != "linux": + kwargs["use_ipc_protocol"] = True super().__init__(*args, **kwargs) monkeypatch.setattr("ert.cli.main.EvaluatorServerConfig", MockESConfig) diff --git a/tests/ert/ui_tests/cli/test_cli.py b/tests/ert/ui_tests/cli/test_cli.py index 91b6fc7b843..1d6f361bbdc 100644 --- a/tests/ert/ui_tests/cli/test_cli.py +++ b/tests/ert/ui_tests/cli/test_cli.py @@ -12,21 +12,17 @@ import numpy as np import pandas as pd import pytest -import websockets.exceptions import xtgeo +import zmq from psutil import NoSuchProcess, Popen, Process, ZombieProcess from resdata.summary import Summary import _ert.threading import ert.shared from _ert.forward_model_runner.client import Client -from ert import LibresFacade, ensemble_evaluator +from ert import LibresFacade from ert.cli.main import ErtCliError -from ert.config import ( - ConfigValidationError, - ConfigWarning, - ErtConfig, -) +from ert.config import ConfigValidationError, ConfigWarning, ErtConfig from ert.enkf_main import sample_prior from ert.ensemble_evaluator import EnsembleEvaluator from ert.mode_definitions import ( @@ -106,9 +102,6 @@ def test_that_the_cli_raises_exceptions_when_no_weight_provided_for_es_mda(): @pytest.mark.usefixtures("copy_snake_oil_field") def test_field_init_file_not_readable(monkeypatch): - monkeypatch.setattr( - ensemble_evaluator._wait_for_evaluator, "WAIT_FOR_EVALUATOR_TIMEOUT", 5 - ) config_file_name = "snake_oil_field.ert" field_file_rel_path = "fields/permx0.grdecl" os.chmod(field_file_rel_path, 0x0) @@ -197,10 +190,12 @@ def test_that_the_model_raises_exception_if_successful_realizations_less_than_mi else: fout.write(line) fout.write( - dedent(""" + dedent( + """ INSTALL_JOB failing_fm FAILING_FM FORWARD_MODEL failing_fm - """) + """ + ) ) Path("FAILING_FM").write_text("EXECUTABLE failing_fm.py", encoding="utf-8") Path("failing_fm.py").write_text( @@ -957,14 +952,13 @@ def test_tracking_missing_ecl(monkeypatch, tmp_path, caplog): def test_that_connection_errors_do_not_effect_final_result( monkeypatch: pytest.MonkeyPatch, ): - monkeypatch.setattr(Client, "DEFAULT_MAX_RETRIES", 0) - monkeypatch.setattr(Client, "DEFAULT_TIMEOUT_MULTIPLIER", 0) - monkeypatch.setattr(Client, "CONNECTION_TIMEOUT", 1) + monkeypatch.setattr(Client, "DEFAULT_MAX_RETRIES", 1) + monkeypatch.setattr(Client, "DEFAULT_ACK_TIMEOUT", 1) monkeypatch.setattr(EnsembleEvaluator, "CLOSE_SERVER_TIMEOUT", 0.01) monkeypatch.setattr(Job, "DEFAULT_CHECKSUM_TIMEOUT", 0) def raise_connection_error(*args, **kwargs): - raise websockets.exceptions.ConnectionClosedError(None, None) + raise zmq.error.ZMQError(None, None) with patch( "ert.ensemble_evaluator.evaluator.dispatch_event_from_json", diff --git a/tests/ert/unit_tests/ensemble_evaluator/ensemble_evaluator_utils.py b/tests/ert/unit_tests/ensemble_evaluator/ensemble_evaluator_utils.py index 3088ed0a131..8e64fcdf12a 100644 --- a/tests/ert/unit_tests/ensemble_evaluator/ensemble_evaluator_utils.py +++ b/tests/ert/unit_tests/ensemble_evaluator/ensemble_evaluator_utils.py @@ -1,36 +1,8 @@ -import asyncio - -import websockets - -from _ert.async_utils import new_event_loop from ert.config import QueueConfig from ert.ensemble_evaluator import Ensemble from ert.ensemble_evaluator._ensemble import ForwardModelStep, Realization -def _mock_ws(host, port, messages, delay_startup=0): - loop = new_event_loop() - done = loop.create_future() - - async def _handler(websocket): - while True: - msg = await websocket.recv() - messages.append(msg) - if msg == "stop": - done.set_result(None) - break - - async def _run_server(): - await asyncio.sleep(delay_startup) - async with websockets.server.serve( - _handler, host, port, ping_timeout=1, ping_interval=1 - ): - await done - - loop.run_until_complete(_run_server()) - loop.close() - - class TestEnsemble(Ensemble): __test__ = False diff --git a/tests/ert/unit_tests/ensemble_evaluator/test_ensemble_client.py b/tests/ert/unit_tests/ensemble_evaluator/test_ensemble_client.py index 6b6fc294530..3a87cdac364 100644 --- a/tests/ert/unit_tests/ensemble_evaluator/test_ensemble_client.py +++ b/tests/ert/unit_tests/ensemble_evaluator/test_ensemble_client.py @@ -1,68 +1,48 @@ -from functools import partial - import pytest from _ert.forward_model_runner.client import Client, ClientConnectionError -from _ert.threading import ErtThread - -from .ensemble_evaluator_utils import _mock_ws +from tests.ert.utils import MockZMQServer @pytest.mark.integration_test -def test_invalid_server(): +async def test_invalid_server(): port = 7777 host = "localhost" - url = f"ws://{host}:{port}" + url = f"tcp://{host}:{port}" - with ( - Client(url, max_retries=2, timeout_multiplier=2) as c1, - pytest.raises(ClientConnectionError), - ): - c1.send("hei") + with pytest.raises(ClientConnectionError): + async with Client(url, ack_timeout=1.0): + pass -def test_successful_sending(unused_tcp_port): +async def test_successful_sending(unused_tcp_port): host = "localhost" - url = f"ws://{host}:{unused_tcp_port}" - messages = [] - mock_ws_thread = ErtThread( - target=partial(_mock_ws, messages=messages), args=(host, unused_tcp_port) - ) - - mock_ws_thread.start() - messages_c1 = ["test_1", "test_2", "test_3", "stop"] - - with Client(url) as c1: - for msg in messages_c1: - c1.send(msg) - - mock_ws_thread.join() + url = f"tcp://{host}:{unused_tcp_port}" + messages_c1 = ["test_1", "test_2", "test_3"] + async with MockZMQServer(unused_tcp_port) as mock_server, Client(url) as c1: + for message in messages_c1: + await c1._send(message) for msg in messages_c1: - assert msg in messages + assert msg in mock_server.messages -@pytest.mark.integration_test -def test_retry(unused_tcp_port): +async def test_retry(unused_tcp_port): host = "localhost" - url = f"ws://{host}:{unused_tcp_port}" - messages = [] - mock_ws_thread = ErtThread( - target=partial(_mock_ws, messages=messages, delay_startup=2), - args=( - host, - unused_tcp_port, - ), - ) - - mock_ws_thread.start() - messages_c1 = ["test_1", "test_2", "test_3", "stop"] - - with Client(url, max_retries=2, timeout_multiplier=2) as c1: - for msg in messages_c1: - c1.send(msg) - - mock_ws_thread.join() - - for msg in messages_c1: - assert msg in messages + url = f"tcp://{host}:{unused_tcp_port}" + client_connection_error_set = False + messages_c1 = ["test_1", "test_2", "test_3"] + async with ( + MockZMQServer(unused_tcp_port, signal=2) as mock_server, + Client(url, ack_timeout=0.5) as c1, + ): + for message in messages_c1: + try: + await c1._send(message, retries=1) + except ClientConnectionError: + client_connection_error_set = True + mock_server.signal(0) + assert client_connection_error_set + assert mock_server.messages.count("test_1") == 2 + assert mock_server.messages.count("test_2") == 1 + assert mock_server.messages.count("test_3") == 1 diff --git a/tests/ert/unit_tests/ensemble_evaluator/test_ensemble_evaluator.py b/tests/ert/unit_tests/ensemble_evaluator/test_ensemble_evaluator.py index f2a645f6d9d..05bad359c40 100644 --- a/tests/ert/unit_tests/ensemble_evaluator/test_ensemble_evaluator.py +++ b/tests/ert/unit_tests/ensemble_evaluator/test_ensemble_evaluator.py @@ -2,12 +2,10 @@ import datetime from functools import partial from typing import cast -from unittest.mock import MagicMock import pytest from hypothesis import given from hypothesis import strategies as st -from websockets.server import WebSocketServerProtocol from _ert.events import ( EESnapshot, @@ -55,7 +53,10 @@ async def test_when_task_fails_evaluator_raises_exception( async def mock_failure(message, *args, **kwargs): raise RuntimeError(message) - evaluator = EnsembleEvaluator(TestEnsemble(0, 2, 2, id_="0"), make_ee_config()) + evaluator = EnsembleEvaluator( + TestEnsemble(0, 2, 2, id_="0"), make_ee_config(use_token=False) + ) + monkeypatch.setattr( EnsembleEvaluator, task, @@ -65,17 +66,18 @@ async def mock_failure(message, *args, **kwargs): await evaluator.run_and_get_successful_realizations() -async def test_when_dispatch_is_given_invalid_event_the_socket_is_closed( - make_ee_config, -): - evaluator = EnsembleEvaluator(TestEnsemble(0, 2, 2, id_="0"), make_ee_config()) +# TODO refactor this test +# async def test_when_dispatch_is_given_invalid_event_the_socket_is_closed( +# make_ee_config, +# ): +# evaluator = EnsembleEvaluator(TestEnsemble(0, 2, 2, id_="0"), make_ee_config()) - socket = MagicMock(spec=WebSocketServerProtocol) - socket.__aiter__.return_value = ["invalid_json"] - await evaluator.handle_dispatch(socket) - socket.close.assert_called_once_with( - code=1011, reason="failed handling message 'invalid_json'" - ) +# socket = MagicMock(spec=WebSocketServerProtocol) +# socket.__aiter__.return_value = ["invalid_json"] +# await evaluator.handle_dispatch(socket) +# socket.close.assert_called_once_with( +# code=1011, reason="failed handling message 'invalid_json'" +# ) async def test_no_config_raises_valueerror_when_running(): @@ -110,32 +112,34 @@ async def mock_done_prematurely(message, *args, **kwargs): await evaluator.run_and_get_successful_realizations() -async def test_new_connections_are_denied_when_evaluator_is_closing_down( - evaluator_to_use, -): - evaluator = evaluator_to_use +# TODO refactor this test +# async def test_new_connections_are_denied_when_evaluator_is_closing_down( +# evaluator_to_use, +# ): +# evaluator = evaluator_to_use - class TestMonitor(Monitor): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._connection_timeout = 1 +# class TestMonitor(Monitor): +# def __init__(self, *args, **kwargs): +# super().__init__(*args, **kwargs) +# self._connection_timeout = 1 - async def new_connection(): - await evaluator._server_done.wait() - async with TestMonitor(evaluator._config.get_connection_info()): - pass +# async def new_connection(): +# await evaluator._server_done.wait() +# print(f"server done: {evaluator._server_done.is_set()}") +# async with TestMonitor(evaluator._config.get_connection_info()): +# pass - new_connection_task = asyncio.create_task(new_connection()) - evaluator.stop() +# new_connection_task = asyncio.create_task(new_connection()) +# evaluator.stop() - with pytest.raises(RuntimeError): - await new_connection_task +# with pytest.raises(RuntimeError): +# await new_connection_task @pytest.fixture(name="evaluator_to_use") async def evaluator_to_use_fixture(make_ee_config): ensemble = TestEnsemble(0, 2, 2, id_="0") - evaluator = EnsembleEvaluator(ensemble, make_ee_config()) + evaluator = EnsembleEvaluator(ensemble, make_ee_config(use_token=False)) evaluator._batching_interval = 0.5 # batching can be faster for tests run_task = asyncio.create_task(evaluator.run_and_get_successful_realizations()) await evaluator._server_started.wait() @@ -149,8 +153,7 @@ async def evaluator_to_use_fixture(make_ee_config): async def test_restarted_jobs_do_not_have_error_msgs(evaluator_to_use): evaluator = evaluator_to_use token = evaluator._config.token - cert = evaluator._config.cert - url = evaluator._config.url + url = evaluator._config.get_connection_info().router_uri config_info = evaluator._config.get_connection_info() async with Monitor(config_info) as monitor: @@ -161,11 +164,9 @@ async def test_restarted_jobs_do_not_have_error_msgs(evaluator_to_use): assert snapshot.status == ENSEMBLE_STATE_UNKNOWN # two dispatch endpoint clients connect async with Client( - url + "/dispatch", - cert=cert, + url, token=token, - max_retries=1, - timeout_multiplier=1, + dealer_name="dispatch_from_test_1", ) as dispatch: event = ForwardModelStepRunning( ensemble=evaluator.ensemble.id_, @@ -201,11 +202,8 @@ def is_completed_snapshot(snapshot: EnsembleSnapshot) -> bool: break async with Client( - url + "/dispatch", - cert=cert, + url, token=token, - max_retries=1, - timeout_multiplier=1, ) as dispatch: event = ForwardModelStepSuccess( ensemble=evaluator.ensemble.id_, @@ -243,25 +241,18 @@ async def test_new_monitor_can_pick_up_where_we_left_off(evaluator_to_use): evaluator = evaluator_to_use token = evaluator._config.token - cert = evaluator._config.cert - url = evaluator._config.url + url = evaluator._config.get_connection_info().router_uri config_info = evaluator._config.get_connection_info() async with Monitor(config_info) as monitor: async with ( Client( - url + "/dispatch", - cert=cert, + url, token=token, - max_retries=1, - timeout_multiplier=1, ) as dispatch1, Client( - url + "/dispatch", - cert=cert, + url, token=token, - max_retries=1, - timeout_multiplier=1, ) as dispatch2, ): # first dispatch endpoint client informs that forward model 0 is running @@ -318,11 +309,8 @@ def check_if_all_fm_running(snapshot: EnsembleSnapshot) -> bool: # take down first monitor by leaving context async with Client( - url + "/dispatch", - cert=cert, + url, token=token, - max_retries=1, - timeout_multiplier=1, ) as dispatch2: # second dispatch endpoint client informs that job 0 is done event = ForwardModelStepSuccess( @@ -378,9 +366,8 @@ async def test_dispatch_endpoint_clients_can_connect_and_monitor_can_shut_down_e async with Monitor(conn_info) as monitor: events = monitor.track() token = evaluator._config.token - cert = evaluator._config.cert - url = evaluator._config.url + url = conn_info.router_uri # first snapshot before any event occurs snapshot_event = await anext(events) assert type(snapshot_event) is EESnapshot @@ -389,18 +376,12 @@ async def test_dispatch_endpoint_clients_can_connect_and_monitor_can_shut_down_e # two dispatch endpoint clients connect async with ( Client( - url + "/dispatch", - cert=cert, + url, token=token, - max_retries=1, - timeout_multiplier=1, ) as dispatch1, Client( - url + "/dispatch", - cert=cert, + url, token=token, - max_retries=1, - timeout_multiplier=1, ) as dispatch2, ): # first dispatch endpoint client informs that real 0 fm 0 is running @@ -491,12 +472,11 @@ async def test_ensure_multi_level_events_in_order(evaluator_to_use): events = monitor.track() token = evaluator._config.token - cert = evaluator._config.cert - url = evaluator._config.url + url = config_info.router_uri snapshot_event = await anext(events) assert type(snapshot_event) is EESnapshot - async with Client(url + "/dispatch", cert=cert, token=token) as dispatch: + async with Client(url, token=token) as dispatch: event = EnsembleStarted(ensemble=evaluator.ensemble.id_) await dispatch._send(event_to_json(event)) event = RealizationSuccess( diff --git a/tests/ert/unit_tests/ensemble_evaluator/test_ensemble_evaluator_config.py b/tests/ert/unit_tests/ensemble_evaluator/test_ensemble_evaluator_config.py index 0049d6e656c..2741aa4da9e 100644 --- a/tests/ert/unit_tests/ensemble_evaluator/test_ensemble_evaluator_config.py +++ b/tests/ert/unit_tests/ensemble_evaluator/test_ensemble_evaluator_config.py @@ -8,33 +8,30 @@ def test_load_config(unused_tcp_port): serv_config = EvaluatorServerConfig( custom_port_range=fixed_port, custom_host="127.0.0.1", + use_ipc_protocol=False, ) expected_host = "127.0.0.1" expected_port = unused_tcp_port - expected_url = f"wss://{expected_host}:{expected_port}" - expected_client_uri = f"{expected_url}/client" - expected_dispatch_uri = f"{expected_url}/dispatch" + expected_url = f"tcp://{expected_host}:{expected_port}" url = urlparse(serv_config.url) assert url.hostname == expected_host assert url.port == expected_port assert serv_config.url == expected_url - assert serv_config.client_uri == expected_client_uri - assert serv_config.dispatch_uri == expected_dispatch_uri assert serv_config.token is not None - assert serv_config.cert is not None - sock = serv_config.get_socket() - assert sock is not None - assert not sock._closed - sock.close() + # TODO REFACTOR + # sock = serv_config.get_socket() + # assert sock is not None + # assert not sock._closed + # sock.close() - ee_config = EvaluatorServerConfig( - custom_port_range=range(1024, 65535), - custom_host="127.0.0.1", - use_token=False, - generate_cert=False, - ) - sock = ee_config.get_socket() - assert sock is not None - assert not sock._closed - sock.close() + # ee_config = EvaluatorServerConfig( + # custom_port_range=range(1024, 65535), + # custom_host="127.0.0.1", + # use_token=False, + # generate_cert=False, + # ) + # sock = ee_config.get_socket() + # assert sock is not None + # assert not sock._closed + # sock.close() diff --git a/tests/ert/unit_tests/ensemble_evaluator/test_ensemble_legacy.py b/tests/ert/unit_tests/ensemble_evaluator/test_ensemble_legacy.py index 5b845a6e3d8..ea36feae4a4 100644 --- a/tests/ert/unit_tests/ensemble_evaluator/test_ensemble_legacy.py +++ b/tests/ert/unit_tests/ensemble_evaluator/test_ensemble_legacy.py @@ -1,11 +1,9 @@ import asyncio -import contextlib import os from contextlib import asynccontextmanager from unittest.mock import MagicMock import pytest -from websockets.exceptions import ConnectionClosed from _ert.events import EESnapshot, EESnapshotUpdate, EETerminated from ert.config import QueueConfig @@ -44,11 +42,10 @@ async def test_run_legacy_ensemble( custom_port_range=custom_port_range, custom_host="127.0.0.1", use_token=False, - generate_cert=False, ) async with ( evaluator_to_use(ensemble, config) as evaluator, - Monitor(config) as monitor, + Monitor(config.get_connection_info()) as monitor, ): async for event in monitor.track(): if type(event) in ( @@ -80,29 +77,25 @@ async def test_run_and_cancel_legacy_ensemble( custom_port_range=custom_port_range, custom_host="127.0.0.1", use_token=False, - generate_cert=False, ) terminated_event = False async with ( evaluator_to_use(ensemble, config) as evaluator, - Monitor(config) as monitor, + Monitor(config.get_connection_info()) as monitor, ): # on lesser hardware the realizations might be killed by max_runtime # and the ensemble is set to STOPPED monitor._receiver_timeout = 10.0 cancel = True - with contextlib.suppress( - ConnectionClosed - ): # monitor throws some variant of CC if dispatcher dies - async for event in monitor.track(heartbeat_interval=0.1): - # Cancel the ensemble upon the arrival of the first event - if cancel: - await monitor.signal_cancel() - cancel = False - if type(event) is EETerminated: - terminated_event = True + async for event in monitor.track(heartbeat_interval=0.1): + # Cancel the ensemble upon the arrival of the first event + if cancel: + await monitor.signal_cancel() + cancel = False + if type(event) is EETerminated: + terminated_event = True if terminated_event: assert evaluator._ensemble.status == state.ENSEMBLE_STATE_CANCELLED diff --git a/tests/ert/unit_tests/ensemble_evaluator/test_monitor.py b/tests/ert/unit_tests/ensemble_evaluator/test_monitor.py index e4615649c72..04843a5df29 100644 --- a/tests/ert/unit_tests/ensemble_evaluator/test_monitor.py +++ b/tests/ert/unit_tests/ensemble_evaluator/test_monitor.py @@ -1,88 +1,98 @@ import asyncio import logging -from http import HTTPStatus -from typing import NoReturn -from urllib.parse import urlparse import pytest -from websockets.asyncio import server -from websockets.exceptions import ConnectionClosedOK +import zmq +import zmq.asyncio -import ert -import ert.ensemble_evaluator from _ert.events import EEUserCancel, EEUserDone, event_from_json +from _ert.forward_model_runner.client import ( + ACK_MSG, + CONNECT_MSG, + DISCONNECT_MSG, + ClientConnectionError, +) from ert.ensemble_evaluator import Monitor from ert.ensemble_evaluator.config import EvaluatorConnectionInfo -async def _mock_ws( - set_when_done: asyncio.Event, handler, ee_config: EvaluatorConnectionInfo -): - async def process_request(connection, request): - if request.path == "/healthcheck": - return connection.respond(HTTPStatus.OK, "") - - url = urlparse(ee_config.url) - async with server.serve( - handler, url.hostname, url.port, process_request=process_request - ): - await set_when_done.wait() +async def async_zmq_server(port, handler): + zmq_context = zmq.asyncio.Context() # type: ignore + router_socket = zmq_context.socket(zmq.ROUTER) + router_socket.setsockopt(zmq.LINGER, 0) + router_socket.bind(f"tcp://*:{port}") + await handler(router_socket) + router_socket.close() + zmq_context.destroy() async def test_no_connection_established(make_ee_config): ee_config = make_ee_config() monitor = Monitor(ee_config.get_connection_info()) - monitor._connection_timeout = 0.1 - with pytest.raises( - RuntimeError, match="Couldn't establish connection with the ensemble evaluator!" - ): + monitor._ack_timeout = 0.1 + with pytest.raises(ClientConnectionError): async with monitor: pass async def test_immediate_stop(unused_tcp_port): - ee_con_info = EvaluatorConnectionInfo(f"ws://127.0.0.1:{unused_tcp_port}") - - set_when_done = asyncio.Event() - - async def mock_ws_event_handler(websocket): - async for raw_msg in websocket: - event = event_from_json(raw_msg) - assert type(event) is EEUserDone - break - await websocket.close() + ee_con_info = EvaluatorConnectionInfo(f"tcp://127.0.0.1:{unused_tcp_port}") + + connected = False + + async def mock_event_handler(router_socket): + nonlocal connected + while True: + dealer, _, *frames = await router_socket.recv_multipart() + await router_socket.send_multipart([dealer, b"", ACK_MSG]) + dealer = dealer.decode("utf-8") + for frame in frames: + frame = frame.decode("utf-8") + assert dealer.startswith("client-") + if frame == CONNECT_MSG: + connected = True + elif frame == DISCONNECT_MSG: + connected = False + return + else: + event = event_from_json(frame) + assert connected + assert type(event) is EEUserDone websocket_server_task = asyncio.create_task( - _mock_ws(set_when_done, mock_ws_event_handler, ee_con_info) + async_zmq_server(unused_tcp_port, mock_event_handler) ) async with Monitor(ee_con_info) as monitor: + assert connected is True await monitor.signal_done() - set_when_done.set() await websocket_server_task + assert connected is False -async def test_unexpected_close(unused_tcp_port): - ee_con_info = EvaluatorConnectionInfo(f"ws://127.0.0.1:{unused_tcp_port}") +async def test_unexpected_close_after_connection_successful( + monkeypatch, unused_tcp_port +): + ee_con_info = EvaluatorConnectionInfo(f"tcp://127.0.0.1:{unused_tcp_port}") - set_when_done = asyncio.Event() - socket_closed = asyncio.Event() + monkeypatch.setattr(Monitor, "DEFAULT_MAX_RETRIES", 0) + monkeypatch.setattr(Monitor, "DEFAULT_ACK_TIMEOUT", 1) - async def mock_ws_event_handler(websocket): - await websocket.close() - socket_closed.set() + async def mock_event_handler(router_socket): + dealer, _, frame = await router_socket.recv_multipart() + await router_socket.send_multipart([dealer, b"", ACK_MSG]) + dealer = dealer.decode("utf-8") + assert dealer.startswith("client-") + frame = frame.decode("utf-8") + assert frame == CONNECT_MSG + router_socket.close() websocket_server_task = asyncio.create_task( - _mock_ws(set_when_done, mock_ws_event_handler, ee_con_info) + async_zmq_server(unused_tcp_port, mock_event_handler) ) async with Monitor(ee_con_info) as monitor: - # this expects Event send to fail - # but no attempt on resubmitting - # since connection closed via websocket.close - with pytest.raises(ConnectionClosedOK): - await socket_closed.wait() + with pytest.raises(ClientConnectionError): await monitor.signal_done() - set_when_done.set() await websocket_server_task @@ -90,20 +100,33 @@ async def test_that_monitor_track_can_exit_without_terminated_event_from_evaluat unused_tcp_port, caplog ): caplog.set_level(logging.ERROR) - ee_con_info = EvaluatorConnectionInfo(f"ws://127.0.0.1:{unused_tcp_port}") - - set_when_done = asyncio.Event() - - async def mock_ws_event_handler(websocket): - async for raw_msg in websocket: - event = event_from_json(raw_msg) - assert type(event) is EEUserCancel - break - await websocket.close() + ee_con_info = EvaluatorConnectionInfo(f"tcp://127.0.0.1:{unused_tcp_port}") + + connected = False + + async def mock_event_handler(router_socket): + nonlocal connected + while True: + dealer, _, *frames = await router_socket.recv_multipart() + await router_socket.send_multipart([dealer, b"", ACK_MSG]) + dealer = dealer.decode("utf-8") + for frame in frames: + frame = frame.decode("utf-8") + assert dealer.startswith("client-") + if frame == CONNECT_MSG: + connected = True + elif frame == DISCONNECT_MSG: + connected = False + return + else: + event = event_from_json(frame) + assert connected + assert type(event) is EEUserCancel websocket_server_task = asyncio.create_task( - _mock_ws(set_when_done, mock_ws_event_handler, ee_con_info) + async_zmq_server(unused_tcp_port, mock_event_handler) ) + async with Monitor(ee_con_info) as monitor: monitor._receiver_timeout = 0.1 await monitor.signal_cancel() @@ -115,7 +138,6 @@ async def mock_ws_event_handler(websocket): "Evaluator did not send the TERMINATED event!" ) in caplog.messages, "Monitor receiver did not stop!" - set_when_done.set() await websocket_server_task @@ -124,11 +146,18 @@ async def test_that_monitor_can_emit_heartbeats(unused_tcp_port): exit anytime. A heartbeat is a None event. If the heartbeat is never sent, this test function will hang and then timeout.""" - ee_con_info = EvaluatorConnectionInfo(f"ws://127.0.0.1:{unused_tcp_port}") + ee_con_info = EvaluatorConnectionInfo(f"tcp://127.0.0.1:{unused_tcp_port}") + + async def mock_event_handler(router_socket): + while True: + try: + dealer, _, __ = await router_socket.recv_multipart() + await router_socket.send_multipart([dealer, b"", ACK_MSG]) + except asyncio.CancelledError: + break - set_when_done = asyncio.Event() websocket_server_task = asyncio.create_task( - _mock_ws(set_when_done, None, ee_con_info) + async_zmq_server(unused_tcp_port, mock_event_handler) ) async with Monitor(ee_con_info) as monitor: @@ -136,24 +165,6 @@ async def test_that_monitor_can_emit_heartbeats(unused_tcp_port): if event is None: break - set_when_done.set() # shuts down websocket server - await websocket_server_task - - -@pytest.mark.timeout(10) -async def test_that_monitor_will_raise_exception_if_wait_for_evaluator_fails( - monkeypatch, -): - async def mock_failing_wait_for_evaluator(*args, **kwargs) -> NoReturn: - raise ValueError() - - monkeypatch.setattr( - ert.ensemble_evaluator.monitor, - "wait_for_evaluator", - mock_failing_wait_for_evaluator, - ) - ee_con_info = EvaluatorConnectionInfo("") - - with pytest.raises(ValueError): - async with Monitor(ee_con_info): - pass + if not websocket_server_task.done(): + websocket_server_task.cancel() + asyncio.gather(websocket_server_task, return_exceptions=True) diff --git a/tests/ert/unit_tests/ensemble_evaluator/test_scheduler.py b/tests/ert/unit_tests/ensemble_evaluator/test_scheduler.py index 0b981b1cbce..8990b93350f 100644 --- a/tests/ert/unit_tests/ensemble_evaluator/test_scheduler.py +++ b/tests/ert/unit_tests/ensemble_evaluator/test_scheduler.py @@ -25,7 +25,7 @@ async def rename_and_wait(): Path("real_0/test").rename("real_0/job_test_file") async def _run_monitor(): - async with Monitor(config) as monitor: + async with Monitor(config.get_connection_info()) as monitor: async for event in monitor.track(): if type(event) is ForwardModelStepChecksum: # Monitor got the checksum message renaming the file @@ -60,7 +60,6 @@ def create_manifest_file(): custom_port_range=custom_port_range, custom_host="127.0.0.1", use_token=False, - generate_cert=False, ) evaluator = EnsembleEvaluator(ensemble, config) with caplog.at_level(logging.DEBUG): diff --git a/tests/ert/unit_tests/forward_model_runner/test_event_reporter.py b/tests/ert/unit_tests/forward_model_runner/test_event_reporter.py index 0575e78b954..32c62f574b0 100644 --- a/tests/ert/unit_tests/forward_model_runner/test_event_reporter.py +++ b/tests/ert/unit_tests/forward_model_runner/test_event_reporter.py @@ -1,7 +1,5 @@ import os -import sys import time -from unittest.mock import patch import pytest @@ -12,10 +10,6 @@ ForwardModelStepSuccess, event_from_json, ) -from _ert.forward_model_runner.client import ( - ClientConnectionClosedOK, - ClientConnectionError, -) from _ert.forward_model_runner.forward_model_step import ForwardModelStep from _ert.forward_model_runner.reporting import Event from _ert.forward_model_runner.reporting.message import ( @@ -27,7 +21,7 @@ Start, ) from _ert.forward_model_runner.reporting.statemachine import TransitionError -from tests.ert.utils import _mock_ws_thread +from tests.ert.utils import MockZMQServer def _wait_until(condition, timeout, fail_msg): @@ -39,19 +33,18 @@ def _wait_until(condition, timeout, fail_msg): def test_report_with_successful_start_message_argument(unused_tcp_port): host = "localhost" - url = f"ws://{host}:{unused_tcp_port}" + url = f"tcp://{host}:{unused_tcp_port}" reporter = Event(evaluator_url=url) fmstep1 = ForwardModelStep( {"name": "fmstep1", "stdout": "stdout", "stderr": "stderr"}, 0 ) - lines = [] - with _mock_ws_thread(host, unused_tcp_port, lines): + with MockZMQServer(unused_tcp_port) as mock_server: reporter.report(Init([fmstep1], 1, 19, ens_id="ens_id", real_id=0)) reporter.report(Start(fmstep1)) reporter.report(Finish()) - assert len(lines) == 1 - event = event_from_json(lines[0]) + assert len(mock_server.messages) == 1 + event = event_from_json(mock_server.messages[0]) assert type(event) is ForwardModelStepStart assert event.ensemble == "ens_id" assert event.real == "0" @@ -62,15 +55,14 @@ def test_report_with_successful_start_message_argument(unused_tcp_port): def test_report_with_failed_start_message_argument(unused_tcp_port): host = "localhost" - url = f"ws://{host}:{unused_tcp_port}" + url = f"tcp://{host}:{unused_tcp_port}" reporter = Event(evaluator_url=url) fmstep1 = ForwardModelStep( {"name": "fmstep1", "stdout": "stdout", "stderr": "stderr"}, 0 ) - lines = [] - with _mock_ws_thread(host, unused_tcp_port, lines): + with MockZMQServer(unused_tcp_port) as mock_server: reporter.report(Init([fmstep1], 1, 19, ens_id="ens_id", real_id=0)) msg = Start(fmstep1).with_error("massive_failure") @@ -78,67 +70,64 @@ def test_report_with_failed_start_message_argument(unused_tcp_port): reporter.report(msg) reporter.report(Finish()) - assert len(lines) == 2 - event = event_from_json(lines[1]) + assert len(mock_server.messages) == 2 + event = event_from_json(mock_server.messages[1]) assert type(event) is ForwardModelStepFailure assert event.error_msg == "massive_failure" -def test_report_with_successful_exit_message_argument(unused_tcp_port): +async def test_report_with_successful_exit_message_argument(unused_tcp_port): host = "localhost" - url = f"ws://{host}:{unused_tcp_port}" + url = f"tcp://{host}:{unused_tcp_port}" reporter = Event(evaluator_url=url) fmstep1 = ForwardModelStep( {"name": "fmstep1", "stdout": "stdout", "stderr": "stderr"}, 0 ) - lines = [] - with _mock_ws_thread(host, unused_tcp_port, lines): + with MockZMQServer(unused_tcp_port) as mock_server: reporter.report(Init([fmstep1], 1, 19, ens_id="ens_id", real_id=0)) reporter.report(Exited(fmstep1, 0)) reporter.report(Finish().with_error("failed")) - assert len(lines) == 1 - event = event_from_json(lines[0]) + assert len(mock_server.messages) == 1 + event = event_from_json(mock_server.messages[0]) assert type(event) is ForwardModelStepSuccess def test_report_with_failed_exit_message_argument(unused_tcp_port): host = "localhost" - url = f"ws://{host}:{unused_tcp_port}" + url = f"tcp://{host}:{unused_tcp_port}" reporter = Event(evaluator_url=url) fmstep1 = ForwardModelStep( {"name": "fmstep1", "stdout": "stdout", "stderr": "stderr"}, 0 ) - lines = [] - with _mock_ws_thread(host, unused_tcp_port, lines): + with MockZMQServer(unused_tcp_port) as mock_server: reporter.report(Init([fmstep1], 1, 19, ens_id="ens_id", real_id=0)) reporter.report(Exited(fmstep1, 1).with_error("massive_failure")) reporter.report(Finish()) - assert len(lines) == 1 - event = event_from_json(lines[0]) + assert len(mock_server.messages) == 1 + event = event_from_json(mock_server.messages[0]) assert type(event) is ForwardModelStepFailure assert event.error_msg == "massive_failure" def test_report_with_running_message_argument(unused_tcp_port): host = "localhost" - url = f"ws://{host}:{unused_tcp_port}" + url = f"tcp://{host}:{unused_tcp_port}" reporter = Event(evaluator_url=url) fmstep1 = ForwardModelStep( {"name": "fmstep1", "stdout": "stdout", "stderr": "stderr"}, 0 ) - lines = [] - with _mock_ws_thread(host, unused_tcp_port, lines): + with MockZMQServer(unused_tcp_port) as mock_server: reporter.report(Init([fmstep1], 1, 19, ens_id="ens_id", real_id=0)) reporter.report(Running(fmstep1, ProcessTreeStatus(max_rss=100, rss=10))) reporter.report(Finish()) - assert len(lines) == 1 - event = event_from_json(lines[0]) + assert len(mock_server.messages) == 1 + event = event_from_json(mock_server.messages[0]) assert type(event) is ForwardModelStepRunning assert event.max_memory_usage == 100 assert event.current_memory_usage == 10 @@ -146,46 +135,42 @@ def test_report_with_running_message_argument(unused_tcp_port): def test_report_only_job_running_for_successful_run(unused_tcp_port): host = "localhost" - url = f"ws://{host}:{unused_tcp_port}" + url = f"tcp://{host}:{unused_tcp_port}" reporter = Event(evaluator_url=url) fmstep1 = ForwardModelStep( {"name": "fmstep1", "stdout": "stdout", "stderr": "stderr"}, 0 ) - lines = [] - with _mock_ws_thread(host, unused_tcp_port, lines): + with MockZMQServer(unused_tcp_port) as mock_server: reporter.report(Init([fmstep1], 1, 19, ens_id="ens_id", real_id=0)) reporter.report(Running(fmstep1, ProcessTreeStatus(max_rss=100, rss=10))) reporter.report(Finish()) - assert len(lines) == 1 + assert len(mock_server.messages) == 1 def test_report_with_failed_finish_message_argument(unused_tcp_port): host = "localhost" - url = f"ws://{host}:{unused_tcp_port}" + url = f"tcp://{host}:{unused_tcp_port}" reporter = Event(evaluator_url=url) fmstep1 = ForwardModelStep( {"name": "fmstep1", "stdout": "stdout", "stderr": "stderr"}, 0 ) - lines = [] - with _mock_ws_thread(host, unused_tcp_port, lines): + with MockZMQServer(unused_tcp_port) as mock_server: reporter.report(Init([fmstep1], 1, 19, ens_id="ens_id", real_id=0)) reporter.report(Running(fmstep1, ProcessTreeStatus(max_rss=100, rss=10))) reporter.report(Finish().with_error("massive_failure")) - assert len(lines) == 1 + assert len(mock_server.messages) == 1 def test_report_inconsistent_events(unused_tcp_port): host = "localhost" - url = f"ws://{host}:{unused_tcp_port}" + url = f"tcp://{host}:{unused_tcp_port}" reporter = Event(evaluator_url=url) - lines = [] with ( - _mock_ws_thread(host, unused_tcp_port, lines), pytest.raises( TransitionError, match=r"Illegal transition None -> \(MessageType,\)", @@ -194,7 +179,6 @@ def test_report_inconsistent_events(unused_tcp_port): reporter.report(Finish()) -@pytest.mark.integration_test def test_report_with_failed_reporter_but_finished_jobs(unused_tcp_port): # this is to show when the reporter fails ert won't crash nor # staying hanging but instead finishes up the job; @@ -202,134 +186,105 @@ def test_report_with_failed_reporter_but_finished_jobs(unused_tcp_port): # also assert reporter._timeout_timestamp is None # meaning Finish event initiated _timeout and timeout was reached # which then sets _timeout_timestamp=None - mock_send_retry_time = 2 - - def mock_send(msg): - time.sleep(mock_send_retry_time) - raise ClientConnectionError("Sending failed!") host = "localhost" - url = f"ws://{host}:{unused_tcp_port}" - reporter = Event(evaluator_url=url) - reporter._reporter_timeout = 4 - fmstep1 = ForwardModelStep( - {"name": "fmstep1", "stdout": "stdout", "stderr": "stderr"}, 0 - ) - lines = [] - with _mock_ws_thread(host, unused_tcp_port, lines): - with patch( - "_ert.forward_model_runner.client.Client.send", lambda x, y: mock_send(y) - ): - reporter.report(Init([fmstep1], 1, 19, ens_id="ens_id", real_id=0)) - reporter.report(Running(fmstep1, ProcessTreeStatus(max_rss=100, rss=10))) - reporter.report(Running(fmstep1, ProcessTreeStatus(max_rss=1100, rss=10))) - reporter.report(Running(fmstep1, ProcessTreeStatus(max_rss=1100, rss=10))) - # set _stop_timestamp - reporter.report(Finish()) + url = f"tcp://{host}:{unused_tcp_port}" + with MockZMQServer(unused_tcp_port) as mock_server: + reporter = Event( + evaluator_url=url, ack_timeout=2, max_retries=1, finished_event_timeout=2 + ) + fmstep1 = ForwardModelStep( + {"name": "fmstep1", "stdout": "stdout", "stderr": "stderr"}, 0 + ) + + mock_server.signal(1) # prevent router to receive messages + reporter.report(Init([fmstep1], 1, 19, ens_id="ens_id", real_id=0)) + reporter.report(Running(fmstep1, ProcessTreeStatus(max_rss=100, rss=10))) + reporter.report(Running(fmstep1, ProcessTreeStatus(max_rss=1100, rss=10))) + reporter.report(Running(fmstep1, ProcessTreeStatus(max_rss=1100, rss=10))) + reporter.report(Finish()) if reporter._event_publisher_thread.is_alive(): reporter._event_publisher_thread.join() - # set _stop_timestamp to None only when timer stopped - assert reporter._timeout_timestamp is None - assert len(lines) == 0, "expected 0 Job running messages" + assert reporter._done.is_set() + assert len(mock_server.messages) == 0, "expected 0 Job running messages" -@pytest.mark.integration_test -@pytest.mark.flaky(reruns=5) -@pytest.mark.skipif( - sys.platform.startswith("darwin"), reason="Performance can be flaky" -) def test_report_with_reconnected_reporter_but_finished_jobs(unused_tcp_port): # this is to show when the reporter fails but reconnects # reporter still manages to send events and completes fine # see assert reporter._timeout_timestamp is not None # meaning Finish event initiated _timeout but timeout wasn't reached since # it finished succesfully - mock_send_retry_time = 0.1 - - def send_func(msg): - time.sleep(mock_send_retry_time) - raise ClientConnectionError("Sending failed!") host = "localhost" - url = f"ws://{host}:{unused_tcp_port}" - reporter = Event(evaluator_url=url) - fmstep1 = ForwardModelStep( - {"name": "fmstep1", "stdout": "stdout", "stderr": "stderr"}, 0 - ) - lines = [] - with _mock_ws_thread(host, unused_tcp_port, lines): - with patch("_ert.forward_model_runner.client.Client.send") as patched_send: - patched_send.side_effect = send_func - - reporter.report(Init([fmstep1], 1, 19, ens_id="ens_id", real_id=0)) - reporter.report(Running(fmstep1, ProcessTreeStatus(max_rss=100, rss=10))) - reporter.report(Running(fmstep1, ProcessTreeStatus(max_rss=200, rss=10))) - reporter.report(Running(fmstep1, ProcessTreeStatus(max_rss=300, rss=10))) - - _wait_until( - condition=lambda: patched_send.call_count == 3, - timeout=10, - fail_msg="10 seconds should be sufficient to send three events", - ) - - # reconnect and continue sending events - # set _stop_timestamp - reporter.report(Finish()) - if reporter._event_publisher_thread.is_alive(): - reporter._event_publisher_thread.join() - # set _stop_timestamp was not set to None since the reporter finished on time - assert reporter._timeout_timestamp is not None - assert len(lines) == 3, "expected 3 Job running messages" - - -@pytest.mark.integration_test -def test_report_with_closed_received_exiting_gracefully(unused_tcp_port): - # Whenever the receiver end closes the connection, a ConnectionClosedOK is raised - # The reporter should exit the publisher thread gracefully and not send any - # more events - mock_send_retry_time = 3 - - def mock_send(msg): - time.sleep(mock_send_retry_time) - raise ClientConnectionClosedOK("Connection Closed") + url = f"tcp://{host}:{unused_tcp_port}" + with MockZMQServer(unused_tcp_port) as mock_server: + reporter = Event(evaluator_url=url, ack_timeout=1, max_retries=1) + fmstep1 = ForwardModelStep( + {"name": "fmstep1", "stdout": "stdout", "stderr": "stderr"}, 0 + ) - host = "localhost" - url = f"ws://{host}:{unused_tcp_port}" - reporter = Event(evaluator_url=url) - fmstep1 = ForwardModelStep( - {"name": "fmstep1", "stdout": "stdout", "stderr": "stderr"}, 0 - ) - lines = [] - with _mock_ws_thread(host, unused_tcp_port, lines): + mock_server.signal(1) # prevent router to receive messages reporter.report(Init([fmstep1], 1, 19, ens_id="ens_id", real_id=0)) reporter.report(Running(fmstep1, ProcessTreeStatus(max_rss=100, rss=10))) - reporter.report(Running(fmstep1, ProcessTreeStatus(max_rss=200, rss=10))) - - # sleep until both Running events have been received - _wait_until( - condition=lambda: len(lines) == 2, - timeout=10, - fail_msg="Should not take 10 seconds to send two events", - ) - - with patch( - "_ert.forward_model_runner.client.Client.send", lambda x, y: mock_send(y) - ): - reporter.report(Running(fmstep1, ProcessTreeStatus(max_rss=300, rss=10))) - # Make sure the publisher thread exits because it got - # ClientConnectionClosedOK. If it hangs it could indicate that the - # exception is not caught/handled correctly - if reporter._event_publisher_thread.is_alive(): - reporter._event_publisher_thread.join() - - reporter.report(Running(fmstep1, ProcessTreeStatus(max_rss=400, rss=10))) + reporter.report(Running(fmstep1, ProcessTreeStatus(max_rss=1100, rss=10))) + reporter.report(Running(fmstep1, ProcessTreeStatus(max_rss=1100, rss=10))) + mock_server.signal(0) # enable router to receive messages reporter.report(Finish()) - - # set _stop_timestamp was not set to None since the reporter finished on time - assert reporter._timeout_timestamp is not None - - # The Running(fmstep1, 300, 10) is popped from the queue, but never sent. - # The following Running is added to queue along with the sentinel - assert reporter._event_queue.qsize() == 2 - # None of the messages after ClientConnectionClosedOK was raised, has been sent - assert len(lines) == 2, "expected 2 Job running messages" + if reporter._event_publisher_thread.is_alive(): + reporter._event_publisher_thread.join() + assert reporter._done.is_set() + assert len(mock_server.messages) == 3, "expected 3 Job running messages" + + +# REFACTOR maybe we don't this anymore +# @pytest.mark.integration_test +# def test_report_with_closed_received_exiting_gracefully(unused_tcp_port): +# mock_send_retry_time = 3 + +# def mock_send(msg): +# time.sleep(mock_send_retry_time) +# raise ClientConnectionClosedOK("Connection Closed") + +# host = "localhost" +# url = f"tcp://{host}:{unused_tcp_port}" +# reporter = Event(evaluator_url=url) +# fmstep1 = ForwardModelStep( +# {"name": "fmstep1", "stdout": "stdout", "stderr": "stderr"}, 0 +# ) +# lines = [] +# with mock_zmq_thread(unused_tcp_port, lines): +# reporter.report(Init([fmstep1], 1, 19, ens_id="ens_id", real_id=0)) +# reporter.report(Running(fmstep1, ProcessTreeStatus(max_rss=100, rss=10))) +# reporter.report(Running(fmstep1, ProcessTreeStatus(max_rss=200, rss=10))) + +# # sleep until both Running events have been received +# _wait_until( +# condition=lambda: len(lines) == 2, +# timeout=10, +# fail_msg="Should not take 10 seconds to send two events", +# ) + +# with patch( +# "_ert.forward_model_runner.client.Client.send", +# lambda x, y: mock_send(y), +# ): +# reporter.report(Running(fmstep1, ProcessTreeStatus(max_rss=300, rss=10))) +# # Make sure the publisher thread exits because it got +# # ClientConnectionClosedOK. If it hangs it could indicate that the +# # exception is not caught/handled correctly +# if reporter._event_publisher_thread.is_alive(): +# reporter._event_publisher_thread.join() + +# reporter.report(Running(fmstep1, ProcessTreeStatus(max_rss=400, rss=10))) +# reporter.report(Finish()) + +# # set _stop_timestamp was not set to None since the reporter finished on time +# assert reporter._timeout_timestamp is not None + + +# # The Running(fmstep1, 300, 10) is popped from the queue, but never sent. +# # The following Running is added to queue along with the sentinel +# assert reporter._event_queue.qsize() == 2 +# # None of the messages after ClientConnectionClosedOK was raised, has been sent +# assert len(lines) == 2, "expected 2 Job running messages" diff --git a/tests/ert/unit_tests/forward_model_runner/test_job_dispatch.py b/tests/ert/unit_tests/forward_model_runner/test_job_dispatch.py index 95da346f52b..d1f16819930 100644 --- a/tests/ert/unit_tests/forward_model_runner/test_job_dispatch.py +++ b/tests/ert/unit_tests/forward_model_runner/test_job_dispatch.py @@ -23,7 +23,7 @@ from _ert.forward_model_runner.reporting import Event, Interactive from _ert.forward_model_runner.reporting.message import Finish, Init from _ert.threading import ErtThread -from tests.ert.utils import _mock_ws_thread, wait_until +from tests.ert.utils import MockZMQServer, wait_until from .test_event_reporter import _wait_until @@ -302,7 +302,7 @@ def test_retry_of_jobs_json_file_read(unused_tcp_port, tmp_path, monkeypatch, ca jobs_json = json.dumps( { "ens_id": "_id_", - "dispatch_url": f"ws://localhost:{unused_tcp_port}", + "dispatch_url": f"tcp://localhost:{unused_tcp_port}", "jobList": [], } ) @@ -316,7 +316,7 @@ def create_jobs_file_after_lock(): (tmp_path / JOBS_FILE).write_text(jobs_json) lock.release() - with _mock_ws_thread("localhost", unused_tcp_port, []): + with MockZMQServer(unused_tcp_port): thread = ErtThread(target=create_jobs_file_after_lock) thread.start() main(args=["script.py", str(tmp_path)]) @@ -345,9 +345,10 @@ def test_setup_reporters(is_interactive_run, ens_id): @pytest.mark.usefixtures("use_tmpdir") def test_job_dispatch_kills_itself_after_unsuccessful_job(unused_tcp_port): - host = "localhost" port = unused_tcp_port - jobs_json = json.dumps({"ens_id": "_id_", "dispatch_url": f"ws://localhost:{port}"}) + jobs_json = json.dumps( + {"ens_id": "_id_", "dispatch_url": f"tcp://localhost:{port}"} + ) with ( patch("_ert.forward_model_runner.cli.os.killpg") as mock_killpg, @@ -361,7 +362,7 @@ def test_job_dispatch_kills_itself_after_unsuccessful_job(unused_tcp_port): ] mock_getpgid.return_value = 17 - with _mock_ws_thread(host, port, []): + with MockZMQServer(port): main(["script.py"]) mock_killpg.assert_called_with(17, signal.SIGKILL) diff --git a/tests/ert/unit_tests/scheduler/test_scheduler.py b/tests/ert/unit_tests/scheduler/test_scheduler.py index e96074f493b..f366fbdc249 100644 --- a/tests/ert/unit_tests/scheduler/test_scheduler.py +++ b/tests/ert/unit_tests/scheduler/test_scheduler.py @@ -11,7 +11,6 @@ from _ert.events import Id, RealizationFailed, RealizationTimeout from ert.config import QueueConfig -from ert.constant_filenames import CERT_FILE from ert.ensemble_evaluator import Realization from ert.load_status import LoadResult, LoadStatus from ert.run_arg import RunArg @@ -124,10 +123,9 @@ async def kill(): async def test_add_dispatch_information_to_jobs_file( storage, tmp_path: Path, mock_driver ): - test_ee_uri = "ws://test_ee_uri.com/121/" + test_ee_uri = "tcp://test_ee_uri.com/121/" test_ens_id = "test_ens_id121" test_ee_token = "test_ee_token_t0k€n121" - test_ee_cert = "test_ee_cert121.pem" ensemble_size = 10 @@ -144,7 +142,6 @@ async def test_add_dispatch_information_to_jobs_file( realizations=realizations, ens_id=test_ens_id, ee_uri=test_ee_uri, - ee_cert=test_ee_cert, ee_token=test_ee_token, ) @@ -155,15 +152,12 @@ async def test_add_dispatch_information_to_jobs_file( for realization in realizations: job_file_path = Path(realization.run_arg.runpath) / "jobs.json" - cert_file_path = Path(realization.run_arg.runpath) / CERT_FILE content: dict = json.loads(job_file_path.read_text(encoding="utf-8")) assert content["ens_id"] == test_ens_id assert content["real_id"] == realization.iens assert content["dispatch_url"] == test_ee_uri assert content["ee_token"] == test_ee_token - assert content["ee_cert_path"] == str(cert_file_path) assert type(content["jobList"]) == list and len(content["jobList"]) == 0 - assert cert_file_path.read_text(encoding="utf-8") == test_ee_cert @pytest.mark.parametrize("max_submit", [1, 2, 3]) diff --git a/tests/ert/unit_tests/shared/test_port_handler.py b/tests/ert/unit_tests/shared/test_port_handler.py index b06a41d861b..f6de340830e 100644 --- a/tests/ert/unit_tests/shared/test_port_handler.py +++ b/tests/ert/unit_tests/shared/test_port_handler.py @@ -311,6 +311,7 @@ def test_reuse_active_close_nok_ok(unused_tcp_port): assert sock.fileno() != -1 +# This test is disabled because it is not clear if zmq needs it def test_reuse_active_live_nok_nok(unused_tcp_port): """ Executive summary of this test diff --git a/tests/ert/unit_tests/test_tracking.py b/tests/ert/unit_tests/test_tracking.py index ab9dd76d41b..5f42a9fb1e6 100644 --- a/tests/ert/unit_tests/test_tracking.py +++ b/tests/ert/unit_tests/test_tracking.py @@ -188,7 +188,6 @@ def test_tracking( custom_port_range=range(1024, 65535), custom_host="127.0.0.1", use_token=False, - generate_cert=False, ) thread = ErtThread( @@ -279,7 +278,6 @@ def test_setting_env_context_during_run( custom_port_range=range(1024, 65535), custom_host="127.0.0.1", use_token=False, - generate_cert=False, ) queue = Events() model = create_model( @@ -356,7 +354,6 @@ def test_run_information_present_as_env_var_in_fm_context( custom_port_range=range(1024, 65535), custom_host="127.0.0.1", use_token=False, - generate_cert=False, ) queue = Events() model = create_model(ert_config, storage, parsed, queue) diff --git a/tests/ert/utils.py b/tests/ert/utils.py index 732f816f8cd..1665ac4e0a5 100644 --- a/tests/ert/utils.py +++ b/tests/ert/utils.py @@ -3,13 +3,13 @@ import asyncio import contextlib import time -from functools import partial from pathlib import Path from typing import TYPE_CHECKING -import websockets.server +import zmq +import zmq.asyncio -from _ert.forward_model_runner.client import Client +from _ert.forward_model_runner.client import ACK_MSG, CONNECT_MSG, DISCONNECT_MSG from _ert.threading import ErtThread from ert.scheduler.event import FinishedEvent, StartedEvent @@ -61,47 +61,69 @@ def wait_until(func, interval=0.5, timeout=30): ) -def _mock_ws(host, port, messages, delay_startup=0): - loop = asyncio.new_event_loop() - done = loop.create_future() - - async def _handler(websocket, path): +class MockZMQServer: + def __init__(self, port, signal=0): + self.port = port + self.messages = [] + self.value = signal + self.loop = None + self.server_task = None + self.handler_task = None + + def start_event_loop(self): + asyncio.set_event_loop(self.loop) + self.loop.run_until_complete(self.mock_zmq_server()) + + def __enter__(self): + self.loop = asyncio.new_event_loop() + self.thread = ErtThread(target=self.start_event_loop) + self.thread.start() + return self + + def __exit__(self, exc_type, exc_value, traceback): + if self.handler_task and not self.handler_task.done(): + self.loop.call_soon_threadsafe(self.handler_task.cancel) + self.thread.join() + self.loop.close() + + async def __aenter__(self): + self.server_task = asyncio.create_task(self.mock_zmq_server()) + return self + + async def __aexit__(self, exc_type, exc_value, traceback): + if not self.server_task.done(): + self.server_task.cancel() + with contextlib.suppress(asyncio.CancelledError): + await self.server_task + + async def mock_zmq_server(self): + zmq_context = zmq.asyncio.Context() + self.router_socket = zmq_context.socket(zmq.ROUTER) + self.router_socket.bind(f"tcp://*:{self.port}") + + self.handler_task = asyncio.create_task(self._handler()) + try: + await self.handler_task + finally: + self.router_socket.close() + zmq_context.term() + + def signal(self, value): + self.value = value + + async def _handler(self): while True: - msg = await websocket.recv() - messages.append(msg) - if msg == "stop": - done.set_result(None) + try: + dealer, __, frame = await self.router_socket.recv_multipart() + print(f"{dealer=} {frame=} {self.value=}") + frame = frame.decode("utf-8") + if frame in [CONNECT_MSG, DISCONNECT_MSG] or self.value == 0: + await self.router_socket.send_multipart([dealer, b"", ACK_MSG]) + if frame not in [CONNECT_MSG, DISCONNECT_MSG] and self.value != 1: + self.messages.append(frame) + except asyncio.CancelledError: break - async def _run_server(): - await asyncio.sleep(delay_startup) - async with websockets.server.serve(_handler, host, port): - await done - - loop.run_until_complete(_run_server()) - loop.close() - - -@contextlib.contextmanager -def _mock_ws_thread(host, port, messages): - mock_ws_thread = ErtThread( - target=partial(_mock_ws, messages=messages), - args=( - host, - port, - ), - ) - mock_ws_thread.start() - try: - yield - # Make sure to join the thread even if an exception occurs - finally: - url = f"ws://{host}:{port}" - with Client(url) as client: - client.send("stop") - mock_ws_thread.join() - messages.pop() - async def poll(driver: Driver, expected: set[int], *, started=None, finished=None): """Poll driver until expected realisations finish