From ffe826b1ea0b13ed67299fc55fd34ce873c1c87d Mon Sep 17 00:00:00 2001 From: Johan Marcusson Date: Fri, 8 Sep 2023 09:17:08 +0200 Subject: [PATCH 01/11] If commit confirmed mode 2 is used and all devices succeed with push, but one or more devices are unreachable after the commit call succeeded, devices that are reachable after will still get confirmed even though they should get rolled back. Add extra ping check to fix this. --- src/cnaas_nms/devicehandler/sync_devices.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/cnaas_nms/devicehandler/sync_devices.py b/src/cnaas_nms/devicehandler/sync_devices.py index 8c4819b8..31f0b71e 100644 --- a/src/cnaas_nms/devicehandler/sync_devices.py +++ b/src/cnaas_nms/devicehandler/sync_devices.py @@ -11,7 +11,7 @@ from nornir.core import Nornir from nornir.core.task import MultiResult, Result from nornir_jinja2.plugins.tasks import template_file -from nornir_napalm.plugins.tasks import napalm_configure, napalm_get +from nornir_napalm.plugins.tasks import napalm_configure, napalm_get, napalm_ping from nornir_utils.plugins.functions import print_result import cnaas_nms.db.helper @@ -560,6 +560,9 @@ def push_sync_device( task.run(**task_args) if confirm_mode != 2: task.host.close_connection("napalm") + if confirm_mode == 2: + time.sleep(1) + task.run(task=napalm_ping, name="Verify reachability") if task.results[1].diff: config = task.results[1].host["config"] From 66f245adfea72e59cf7e1b96fecc908992757dcc Mon Sep 17 00:00:00 2001 From: Johan Marcusson Date: Fri, 8 Sep 2023 09:32:27 +0200 Subject: [PATCH 02/11] use get facts since ping is about pinging another external device --- src/cnaas_nms/devicehandler/sync_devices.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cnaas_nms/devicehandler/sync_devices.py b/src/cnaas_nms/devicehandler/sync_devices.py index 31f0b71e..0da17889 100644 --- a/src/cnaas_nms/devicehandler/sync_devices.py +++ b/src/cnaas_nms/devicehandler/sync_devices.py @@ -11,7 +11,7 @@ from nornir.core import Nornir from nornir.core.task import MultiResult, Result from nornir_jinja2.plugins.tasks import template_file -from nornir_napalm.plugins.tasks import napalm_configure, napalm_get, napalm_ping +from nornir_napalm.plugins.tasks import napalm_configure, napalm_get from nornir_utils.plugins.functions import print_result import cnaas_nms.db.helper @@ -562,7 +562,7 @@ def push_sync_device( task.host.close_connection("napalm") if confirm_mode == 2: time.sleep(1) - task.run(task=napalm_ping, name="Verify reachability") + task.run(task=napalm_get, getters=["facts"], name="Verify reachability") if task.results[1].diff: config = task.results[1].host["config"] From d2a56e9dc994f018fc240bf7ee5e9f4101b93833 Mon Sep 17 00:00:00 2001 From: Johan Marcusson Date: Fri, 8 Sep 2023 10:05:34 +0200 Subject: [PATCH 03/11] Log error device not reachable after commit --- src/cnaas_nms/devicehandler/sync_devices.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/cnaas_nms/devicehandler/sync_devices.py b/src/cnaas_nms/devicehandler/sync_devices.py index 0da17889..291c0024 100644 --- a/src/cnaas_nms/devicehandler/sync_devices.py +++ b/src/cnaas_nms/devicehandler/sync_devices.py @@ -560,9 +560,22 @@ def push_sync_device( task.run(**task_args) if confirm_mode != 2: task.host.close_connection("napalm") - if confirm_mode == 2: + if confirm_mode == 2 and not dry_run: time.sleep(1) task.run(task=napalm_get, getters=["facts"], name="Verify reachability") + if task.results[2].failed: + logger.error( + "Could not reach device {} after commit, rollback in: {}s".format( + task.host.name, api_settings.COMMIT_CONFIRMED_TIMEOUT + ) + ) + else: + short_facts = {"fqdn": "unknown"} + try: + short_facts["fqdn"] = task.results[2].result["facts"]["fqdn"] + task.results[2].result["facts"] = short_facts + except Exception: + pass if task.results[1].diff: config = task.results[1].host["config"] From 1c6c73695a22eb8ab06b254b12e1934f44fce45f Mon Sep 17 00:00:00 2001 From: Johan Marcusson Date: Fri, 8 Sep 2023 10:15:50 +0200 Subject: [PATCH 04/11] task.run fails with exception, so catch that instead of checking fail status --- src/cnaas_nms/devicehandler/sync_devices.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/cnaas_nms/devicehandler/sync_devices.py b/src/cnaas_nms/devicehandler/sync_devices.py index 291c0024..5a8d235d 100644 --- a/src/cnaas_nms/devicehandler/sync_devices.py +++ b/src/cnaas_nms/devicehandler/sync_devices.py @@ -562,13 +562,15 @@ def push_sync_device( task.host.close_connection("napalm") if confirm_mode == 2 and not dry_run: time.sleep(1) - task.run(task=napalm_get, getters=["facts"], name="Verify reachability") - if task.results[2].failed: + try: + task.run(task=napalm_get, getters=["facts"], name="Verify reachability") + except Exception as e: logger.error( "Could not reach device {} after commit, rollback in: {}s".format( task.host.name, api_settings.COMMIT_CONFIRMED_TIMEOUT ) ) + raise e else: short_facts = {"fqdn": "unknown"} try: From 97367a9dd9e149c5220d02dd26dccf5815410b2f Mon Sep 17 00:00:00 2001 From: Johan Marcusson Date: Fri, 8 Sep 2023 10:22:51 +0200 Subject: [PATCH 05/11] Configurable time to wait between comitting configuration and checking that the device is still reachable --- docker/api/config/api.yml | 1 + docs/configuration/index.rst | 2 ++ src/cnaas_nms/app_settings.py | 2 ++ src/cnaas_nms/devicehandler/sync_devices.py | 2 +- 4 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docker/api/config/api.yml b/docker/api/config/api.yml index 32a87fcd..9f52fcf6 100644 --- a/docker/api/config/api.yml +++ b/docker/api/config/api.yml @@ -12,3 +12,4 @@ mgmtdomain_reserved_count: 5 mgmtdomain_primary_ip_version: 4 commit_confirmed_mode: 1 commit_confirmed_timeout: 300 +commit_confirmed_wait: 1 diff --git a/docs/configuration/index.rst b/docs/configuration/index.rst index 6221622b..4ca4dcc4 100644 --- a/docs/configuration/index.rst +++ b/docs/configuration/index.rst @@ -45,6 +45,8 @@ Defines parameters for the API: (see :ref:`commit_confirm_modes`). Defaults to 1. - commit_confirmed_timeout: Time to wait before rolling back an unconfirmed commit, specified in seconds. Defaults to 300. +- commit_confirmed_wait: Time to wait between comitting configuration and checking + that the device is still reachable, specified in seconds. Defaults to 1. /etc/cnaas-nms/repository.yml ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/src/cnaas_nms/app_settings.py b/src/cnaas_nms/app_settings.py index 4d1622b2..cbc77621 100644 --- a/src/cnaas_nms/app_settings.py +++ b/src/cnaas_nms/app_settings.py @@ -52,6 +52,7 @@ class ApiSettings(BaseSettings): MGMTDOMAIN_PRIMARY_IP_VERSION: int = 4 COMMIT_CONFIRMED_MODE: int = 1 COMMIT_CONFIRMED_TIMEOUT: int = 300 + COMMIT_CONFIRMED_WAIT: int = 1 SETTINGS_OVERRIDE: Optional[dict] = None @validator("MGMTDOMAIN_PRIMARY_IP_VERSION") @@ -90,6 +91,7 @@ def construct_api_settings() -> ApiSettings: MGMTDOMAIN_PRIMARY_IP_VERSION=config.get("mgmtdomain_primary_ip_version", 4), COMMIT_CONFIRMED_MODE=config.get("commit_confirmed_mode", 1), COMMIT_CONFIRMED_TIMEOUT=config.get("commit_confirmed_timeout", 300), + COMMIT_CONFIRMED_WAIT=config.get("commit_confirmed_wait", 1), SETTINGS_OVERRIDE=config.get("settings_override", None), ) else: diff --git a/src/cnaas_nms/devicehandler/sync_devices.py b/src/cnaas_nms/devicehandler/sync_devices.py index 5a8d235d..76546d33 100644 --- a/src/cnaas_nms/devicehandler/sync_devices.py +++ b/src/cnaas_nms/devicehandler/sync_devices.py @@ -561,7 +561,7 @@ def push_sync_device( if confirm_mode != 2: task.host.close_connection("napalm") if confirm_mode == 2 and not dry_run: - time.sleep(1) + time.sleep(api_settings.COMMIT_CONFIRMED_WAIT) try: task.run(task=napalm_get, getters=["facts"], name="Verify reachability") except Exception as e: From 897cb2f9bfb5add4e83e0ea1676482be6d13b92d Mon Sep 17 00:00:00 2001 From: Johan Marcusson Date: Fri, 8 Sep 2023 16:04:22 +0200 Subject: [PATCH 06/11] Make sure commit confirm 2 checks for failed hosts before hosts with empty diff, otherwise wrong message is displayed and job lock is released too early. Add sync event immediately when device is unreachable after commit --- src/cnaas_nms/devicehandler/sync_devices.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/cnaas_nms/devicehandler/sync_devices.py b/src/cnaas_nms/devicehandler/sync_devices.py index 76546d33..1a4cd544 100644 --- a/src/cnaas_nms/devicehandler/sync_devices.py +++ b/src/cnaas_nms/devicehandler/sync_devices.py @@ -565,6 +565,7 @@ def push_sync_device( try: task.run(task=napalm_get, getters=["facts"], name="Verify reachability") except Exception as e: + add_sync_event(task.host.name, "commit_confirm_failed", scheduled_by, job_id) logger.error( "Could not reach device {} after commit, rollback in: {}s".format( task.host.name, api_settings.COMMIT_CONFIRMED_TIMEOUT @@ -976,11 +977,7 @@ def sync_devices( f"{total_change_score} is higher than auto-push limit {AUTOPUSH_MAX_SCORE}" ) elif get_confirm_mode(confirm_mode_override) == 2 and not dry_run: - if not changed_hosts: - logger.info("None of the selected host has any changes (diff), skipping commit-confirm") - logger.info("Releasing lock for devices from syncto job: {}".format(job_id)) - Joblock.release_lock(session, job_id=job_id) - elif len(failed_hosts) > 0: + if len(failed_hosts) > 0: logger.error( "No confirm job scheduled since one or more devices failed in commitmode 2" ", all devices will rollback in {}s".format(api_settings.COMMIT_CONFIRMED_TIMEOUT) @@ -988,6 +985,10 @@ def sync_devices( time.sleep(api_settings.COMMIT_CONFIRMED_TIMEOUT) logger.info("Releasing lock for devices from syncto job: {}".format(job_id)) Joblock.release_lock(session, job_id=job_id) + elif not changed_hosts: + logger.info("None of the selected host has any changes (diff), skipping commit-confirm") + logger.info("Releasing lock for devices from syncto job: {}".format(job_id)) + Joblock.release_lock(session, job_id=job_id) else: scheduler = Scheduler() next_job_id = scheduler.add_onetime_job( From 1c6f069e9b6ed4e922472e08976420bbfa2b4a61 Mon Sep 17 00:00:00 2001 From: Johan Marcusson Date: Fri, 8 Sep 2023 16:17:28 +0200 Subject: [PATCH 07/11] Fix for missing log events, before events added between two xread calls would not have been read? --- src/cnaas_nms/run.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/cnaas_nms/run.py b/src/cnaas_nms/run.py index a83e4ede..55d8dbde 100644 --- a/src/cnaas_nms/run.py +++ b/src/cnaas_nms/run.py @@ -93,7 +93,7 @@ def loglevel_to_rooms(levelname: str) -> List[str]: def parse_redis_event(event): try: - # [stream, [(messageid, {datadict})] + # [stream, [(messageid, {datadict})]] if event[0] == "events": return event[1][0][1] except Exception: # noqa: S110 @@ -115,13 +115,19 @@ def emit_redis_event(event): def thread_websocket_events(): redis: StrictRedis with redis_session() as redis: + last_event = b"$" while True: - result = redis.xread({"events": b"$"}, count=10, block=200) + result = redis.xread({"events": last_event}, count=10, block=200) for item in result: event = parse_redis_event(item) if not event: continue emit_redis_event(event) + try: + # [stream, [(messageid, {datadict})]] + last_event = item[1][0][0] + except Exception: # noqa: S110 + last_event = b"$" if stop_websocket_threads: break From 39c98ad8722aa61308be77c05604b2d949243339 Mon Sep 17 00:00:00 2001 From: Johan Marcusson Date: Mon, 11 Sep 2023 16:45:10 +0200 Subject: [PATCH 08/11] workaround for scheduled_by arg not passed to job wrapped functions --- src/cnaas_nms/scheduler/scheduler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cnaas_nms/scheduler/scheduler.py b/src/cnaas_nms/scheduler/scheduler.py index fc9300f5..a671b242 100644 --- a/src/cnaas_nms/scheduler/scheduler.py +++ b/src/cnaas_nms/scheduler/scheduler.py @@ -212,6 +212,7 @@ def add_onetime_job( kwargs["job_id"] = job_id kwargs["scheduled_by"] = scheduled_by + kwargs["kwargs"]["scheduled_by"] = scheduled_by if self.use_mule: try: import uwsgi From 769276b9baa517b858cb3c34c555bb0bba84bea1 Mon Sep 17 00:00:00 2001 From: Johan Marcusson Date: Mon, 11 Sep 2023 16:45:52 +0200 Subject: [PATCH 09/11] better error handling of napalm_configure fails --- src/cnaas_nms/devicehandler/sync_devices.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/cnaas_nms/devicehandler/sync_devices.py b/src/cnaas_nms/devicehandler/sync_devices.py index 1a4cd544..74b917c2 100644 --- a/src/cnaas_nms/devicehandler/sync_devices.py +++ b/src/cnaas_nms/devicehandler/sync_devices.py @@ -557,9 +557,14 @@ def push_sync_device( task_args["job_id"] = job_id task_args["confirm_mode_override"] = confirm_mode logger.debug("Commit confirm mode for host {}: {} (dry_run: {})".format(task.host.name, confirm_mode, dry_run)) - task.run(**task_args) - if confirm_mode != 2: - task.host.close_connection("napalm") + try: + task.run(**task_args) + except Exception as e: + logger.exception("Exception while running task napalm_configure for device {}".format(task.host.name)) + raise e + finally: + if confirm_mode != 2: + task.host.close_connection("napalm") if confirm_mode == 2 and not dry_run: time.sleep(api_settings.COMMIT_CONFIRMED_WAIT) try: @@ -874,6 +879,7 @@ def sync_devices( task=push_sync_device, dry_run=dry_run, job_id=job_id, + scheduled_by=scheduled_by, confirm_mode=get_confirm_mode(confirm_mode_override), ) except Exception as e: @@ -900,7 +906,7 @@ def sync_devices( unchanged_hosts = [] # calculate change impact score for host, results in nrresult.items(): - if host in failed_hosts or len(results) != 3: + if host in failed_hosts or len(results) < 3: logger.debug("Unable to calculate change score for failed device {}".format(host)) elif results[2].diff: changed_hosts.append(host) From 6343563bcabe0ff1deb13ba0486e8ecb967b0d37 Mon Sep 17 00:00:00 2001 From: Johan Marcusson Date: Mon, 11 Sep 2023 17:09:47 +0200 Subject: [PATCH 10/11] wait to release device lock if one or more devices failed configuration with commit confirmed mode 1 --- src/cnaas_nms/devicehandler/sync_devices.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/cnaas_nms/devicehandler/sync_devices.py b/src/cnaas_nms/devicehandler/sync_devices.py index 74b917c2..3a1dbf5a 100644 --- a/src/cnaas_nms/devicehandler/sync_devices.py +++ b/src/cnaas_nms/devicehandler/sync_devices.py @@ -949,6 +949,12 @@ def sync_devices( dev.last_seen = datetime.datetime.utcnow() if not dry_run and get_confirm_mode(confirm_mode_override) != 2: logger.info("Releasing lock for devices from syncto job: {}".format(job_id)) + if failed_hosts: + logger.error( + "One or more devices failed to commit configuration, they will roll back configuration" + " in {}s: {}".format(api_settings.COMMIT_CONFIRMED_TIMEOUT, ", ".join(failed_hosts)) + ) + time.sleep(api_settings.COMMIT_CONFIRMED_TIMEOUT) Joblock.release_lock(session, job_id=job_id) if len(device_list) == 0: @@ -983,7 +989,7 @@ def sync_devices( f"{total_change_score} is higher than auto-push limit {AUTOPUSH_MAX_SCORE}" ) elif get_confirm_mode(confirm_mode_override) == 2 and not dry_run: - if len(failed_hosts) > 0: + if failed_hosts: logger.error( "No confirm job scheduled since one or more devices failed in commitmode 2" ", all devices will rollback in {}s".format(api_settings.COMMIT_CONFIRMED_TIMEOUT) From 33cacd7dee3ff59a9f6d9fc3ed4712d0fc059c88 Mon Sep 17 00:00:00 2001 From: Johan Marcusson Date: Mon, 11 Sep 2023 17:20:57 +0200 Subject: [PATCH 11/11] Fix the order of printing of log message to release job lock --- src/cnaas_nms/devicehandler/sync_devices.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cnaas_nms/devicehandler/sync_devices.py b/src/cnaas_nms/devicehandler/sync_devices.py index 3a1dbf5a..58050d35 100644 --- a/src/cnaas_nms/devicehandler/sync_devices.py +++ b/src/cnaas_nms/devicehandler/sync_devices.py @@ -948,13 +948,13 @@ def sync_devices( remove_sync_events(hostname) dev.last_seen = datetime.datetime.utcnow() if not dry_run and get_confirm_mode(confirm_mode_override) != 2: - logger.info("Releasing lock for devices from syncto job: {}".format(job_id)) if failed_hosts: logger.error( "One or more devices failed to commit configuration, they will roll back configuration" " in {}s: {}".format(api_settings.COMMIT_CONFIRMED_TIMEOUT, ", ".join(failed_hosts)) ) time.sleep(api_settings.COMMIT_CONFIRMED_TIMEOUT) + logger.info("Releasing lock for devices from syncto job: {}".format(job_id)) Joblock.release_lock(session, job_id=job_id) if len(device_list) == 0: