Skip to content

Commit

Permalink
Merge pull request #317 from SUNET/bugfix.confirmed_2_failed_after_pu…
Browse files Browse the repository at this point in the history
…sh_ok

Bugfix.confirmed 2 failed after push ok
  • Loading branch information
indy-independence authored Sep 20, 2023
2 parents c855fcf + 33cacd7 commit af4a55e
Show file tree
Hide file tree
Showing 6 changed files with 54 additions and 11 deletions.
1 change: 1 addition & 0 deletions docker/api/config/api.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ mgmtdomain_reserved_count: 5
mgmtdomain_primary_ip_version: 4
commit_confirmed_mode: 1
commit_confirmed_timeout: 300
commit_confirmed_wait: 1
2 changes: 2 additions & 0 deletions docs/configuration/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ Defines parameters for the API:
(see :ref:`commit_confirm_modes`). Defaults to 1.
- commit_confirmed_timeout: Time to wait before rolling back an unconfirmed commit,
specified in seconds. Defaults to 300.
- commit_confirmed_wait: Time to wait between comitting configuration and checking
that the device is still reachable, specified in seconds. Defaults to 1.

/etc/cnaas-nms/repository.yml
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Expand Down
2 changes: 2 additions & 0 deletions src/cnaas_nms/app_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ class ApiSettings(BaseSettings):
MGMTDOMAIN_PRIMARY_IP_VERSION: int = 4
COMMIT_CONFIRMED_MODE: int = 1
COMMIT_CONFIRMED_TIMEOUT: int = 300
COMMIT_CONFIRMED_WAIT: int = 1
SETTINGS_OVERRIDE: Optional[dict] = None

@validator("MGMTDOMAIN_PRIMARY_IP_VERSION")
Expand Down Expand Up @@ -90,6 +91,7 @@ def construct_api_settings() -> ApiSettings:
MGMTDOMAIN_PRIMARY_IP_VERSION=config.get("mgmtdomain_primary_ip_version", 4),
COMMIT_CONFIRMED_MODE=config.get("commit_confirmed_mode", 1),
COMMIT_CONFIRMED_TIMEOUT=config.get("commit_confirmed_timeout", 300),
COMMIT_CONFIRMED_WAIT=config.get("commit_confirmed_wait", 1),
SETTINGS_OVERRIDE=config.get("settings_override", None),
)
else:
Expand Down
49 changes: 40 additions & 9 deletions src/cnaas_nms/devicehandler/sync_devices.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,9 +557,33 @@ def push_sync_device(
task_args["job_id"] = job_id
task_args["confirm_mode_override"] = confirm_mode
logger.debug("Commit confirm mode for host {}: {} (dry_run: {})".format(task.host.name, confirm_mode, dry_run))
task.run(**task_args)
if confirm_mode != 2:
task.host.close_connection("napalm")
try:
task.run(**task_args)
except Exception as e:
logger.exception("Exception while running task napalm_configure for device {}".format(task.host.name))
raise e
finally:
if confirm_mode != 2:
task.host.close_connection("napalm")
if confirm_mode == 2 and not dry_run:
time.sleep(api_settings.COMMIT_CONFIRMED_WAIT)
try:
task.run(task=napalm_get, getters=["facts"], name="Verify reachability")
except Exception as e:
add_sync_event(task.host.name, "commit_confirm_failed", scheduled_by, job_id)
logger.error(
"Could not reach device {} after commit, rollback in: {}s".format(
task.host.name, api_settings.COMMIT_CONFIRMED_TIMEOUT
)
)
raise e
else:
short_facts = {"fqdn": "unknown"}
try:
short_facts["fqdn"] = task.results[2].result["facts"]["fqdn"]
task.results[2].result["facts"] = short_facts
except Exception:
pass

if task.results[1].diff:
config = task.results[1].host["config"]
Expand Down Expand Up @@ -855,6 +879,7 @@ def sync_devices(
task=push_sync_device,
dry_run=dry_run,
job_id=job_id,
scheduled_by=scheduled_by,
confirm_mode=get_confirm_mode(confirm_mode_override),
)
except Exception as e:
Expand All @@ -881,7 +906,7 @@ def sync_devices(
unchanged_hosts = []
# calculate change impact score
for host, results in nrresult.items():
if host in failed_hosts or len(results) != 3:
if host in failed_hosts or len(results) < 3:
logger.debug("Unable to calculate change score for failed device {}".format(host))
elif results[2].diff:
changed_hosts.append(host)
Expand Down Expand Up @@ -923,6 +948,12 @@ def sync_devices(
remove_sync_events(hostname)
dev.last_seen = datetime.datetime.utcnow()
if not dry_run and get_confirm_mode(confirm_mode_override) != 2:
if failed_hosts:
logger.error(
"One or more devices failed to commit configuration, they will roll back configuration"
" in {}s: {}".format(api_settings.COMMIT_CONFIRMED_TIMEOUT, ", ".join(failed_hosts))
)
time.sleep(api_settings.COMMIT_CONFIRMED_TIMEOUT)
logger.info("Releasing lock for devices from syncto job: {}".format(job_id))
Joblock.release_lock(session, job_id=job_id)

Expand Down Expand Up @@ -958,18 +989,18 @@ def sync_devices(
f"{total_change_score} is higher than auto-push limit {AUTOPUSH_MAX_SCORE}"
)
elif get_confirm_mode(confirm_mode_override) == 2 and not dry_run:
if not changed_hosts:
logger.info("None of the selected host has any changes (diff), skipping commit-confirm")
logger.info("Releasing lock for devices from syncto job: {}".format(job_id))
Joblock.release_lock(session, job_id=job_id)
elif len(failed_hosts) > 0:
if failed_hosts:
logger.error(
"No confirm job scheduled since one or more devices failed in commitmode 2"
", all devices will rollback in {}s".format(api_settings.COMMIT_CONFIRMED_TIMEOUT)
)
time.sleep(api_settings.COMMIT_CONFIRMED_TIMEOUT)
logger.info("Releasing lock for devices from syncto job: {}".format(job_id))
Joblock.release_lock(session, job_id=job_id)
elif not changed_hosts:
logger.info("None of the selected host has any changes (diff), skipping commit-confirm")
logger.info("Releasing lock for devices from syncto job: {}".format(job_id))
Joblock.release_lock(session, job_id=job_id)
else:
scheduler = Scheduler()
next_job_id = scheduler.add_onetime_job(
Expand Down
10 changes: 8 additions & 2 deletions src/cnaas_nms/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def loglevel_to_rooms(levelname: str) -> List[str]:

def parse_redis_event(event):
try:
# [stream, [(messageid, {datadict})]
# [stream, [(messageid, {datadict})]]
if event[0] == "events":
return event[1][0][1]
except Exception: # noqa: S110
Expand All @@ -115,13 +115,19 @@ def emit_redis_event(event):
def thread_websocket_events():
redis: StrictRedis
with redis_session() as redis:
last_event = b"$"
while True:
result = redis.xread({"events": b"$"}, count=10, block=200)
result = redis.xread({"events": last_event}, count=10, block=200)
for item in result:
event = parse_redis_event(item)
if not event:
continue
emit_redis_event(event)
try:
# [stream, [(messageid, {datadict})]]
last_event = item[1][0][0]
except Exception: # noqa: S110
last_event = b"$"
if stop_websocket_threads:
break

Expand Down
1 change: 1 addition & 0 deletions src/cnaas_nms/scheduler/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,7 @@ def add_onetime_job(

kwargs["job_id"] = job_id
kwargs["scheduled_by"] = scheduled_by
kwargs["kwargs"]["scheduled_by"] = scheduled_by
if self.use_mule:
try:
import uwsgi
Expand Down

0 comments on commit af4a55e

Please sign in to comment.