Skip to content

Commit

Permalink
Fix license error retry
Browse files Browse the repository at this point in the history
We have observed that there are two kinds of events that lead
to license errors:

* 100% cpu, which means the server is unresponsive for ~25s
* service restart which takes 2-3 minutes

Therefore, we set the first retry to happen after 1 minute, in the
hope that the server gets enough time to get off 100% cpu.

The second retry happens between 3 and 6 minutes later, ensuring
that there is enough time for the server to restart.

There is also a stagger factor added to avoid all retries happening
simultaneously. It is set to a maximum of 25s.
  • Loading branch information
eivindjahren committed Jan 12, 2024
1 parent bb5fa0d commit 99af83b
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 7 deletions.
28 changes: 23 additions & 5 deletions src/ert/shared/share/ert/forward-models/res/script/ecl_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from argparse import ArgumentParser
from collections import namedtuple
from contextlib import contextmanager, suppress
from random import random

import resfo
from ecl_config import EclrunConfig
Expand Down Expand Up @@ -361,9 +362,15 @@ def execEclipse(self, eclrun_config=None):
)
return await_process_tee(process, sys.stdout, log_file)

LICENSE_FAILURE_SLEEP_SECONDS = 25
LICENSE_FAILURE_SLEEP_FACTOR = 60
LICENSE_RETRY_STAGGER_FACTOR = 25

def runEclipse(self, eclrun_config=None, retry=True):
def runEclipse(self, eclrun_config=None, retries_left=2, backoff_sleep=None):
backoff_sleep = (
self.LICENSE_FAILURE_SLEEP_FACTOR
if backoff_sleep is None
else backoff_sleep
)
return_code = self.execEclipse(eclrun_config=eclrun_config)

OK_file = os.path.join(self.run_path, f"{self.base_name}.OK")
Expand All @@ -377,9 +384,20 @@ def runEclipse(self, eclrun_config=None, retry=True):
try:
self.assertECLEND()
except RuntimeError as err:
if "LICENSE FAILURE" in err.args[0] and retry:
time.sleep(self.LICENSE_FAILURE_SLEEP_SECONDS)
self.runEclipse(eclrun_config, retry=False)
if "LICENSE FAILURE" in err.args[0] and retries_left > 0:
time_to_wait = backoff_sleep + int(
random() * self.LICENSE_RETRY_STAGGER_FACTOR
)
sys.stderr.write(
"ECLIPSE failed due to license failure "
f"retrying in {time_to_wait} seconds"
)
time.sleep(time_to_wait)
self.runEclipse(
eclrun_config,
retries_left=retries_left - 1,
backoff_sleep=int(backoff_sleep * (3 + 3 * random())),
)
return
else:
raise err from None
Expand Down
5 changes: 3 additions & 2 deletions tests/unit_tests/shared/share/test_ecl_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,9 +603,10 @@ def test_ecl100_retries_once_on_license_failure(tmp_path, monkeypatch):
econfig = ecl_config.Ecl100Config()
sim = econfig.sim("2015.2")
erun = ecl_run.EclRun(str(case_path), sim)
erun.LICENSE_FAILURE_SLEEP_SECONDS = 1
erun.LICENSE_FAILURE_SLEEP_FACTOR = 1
erun.LICENSE_RETRY_STAGGER_FACTOR = 1

with pytest.raises(RuntimeError, match="LICENSE FAILURE"):
erun.runEclipse()
max_attempts = 2
max_attempts = 3
assert (tmp_path / "mock_log").read_text() == "Called mock\n" * max_attempts

0 comments on commit 99af83b

Please sign in to comment.