diff --git a/docker/api/config/api.yml b/docker/api/config/api.yml index 32a87fcd..9f52fcf6 100644 --- a/docker/api/config/api.yml +++ b/docker/api/config/api.yml @@ -12,3 +12,4 @@ mgmtdomain_reserved_count: 5 mgmtdomain_primary_ip_version: 4 commit_confirmed_mode: 1 commit_confirmed_timeout: 300 +commit_confirmed_wait: 1 diff --git a/docs/configuration/index.rst b/docs/configuration/index.rst index 6221622b..4ca4dcc4 100644 --- a/docs/configuration/index.rst +++ b/docs/configuration/index.rst @@ -45,6 +45,8 @@ Defines parameters for the API: (see :ref:`commit_confirm_modes`). Defaults to 1. - commit_confirmed_timeout: Time to wait before rolling back an unconfirmed commit, specified in seconds. Defaults to 300. +- commit_confirmed_wait: Time to wait between comitting configuration and checking + that the device is still reachable, specified in seconds. Defaults to 1. /etc/cnaas-nms/repository.yml ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/src/cnaas_nms/app_settings.py b/src/cnaas_nms/app_settings.py index 4d1622b2..cbc77621 100644 --- a/src/cnaas_nms/app_settings.py +++ b/src/cnaas_nms/app_settings.py @@ -52,6 +52,7 @@ class ApiSettings(BaseSettings): MGMTDOMAIN_PRIMARY_IP_VERSION: int = 4 COMMIT_CONFIRMED_MODE: int = 1 COMMIT_CONFIRMED_TIMEOUT: int = 300 + COMMIT_CONFIRMED_WAIT: int = 1 SETTINGS_OVERRIDE: Optional[dict] = None @validator("MGMTDOMAIN_PRIMARY_IP_VERSION") @@ -90,6 +91,7 @@ def construct_api_settings() -> ApiSettings: MGMTDOMAIN_PRIMARY_IP_VERSION=config.get("mgmtdomain_primary_ip_version", 4), COMMIT_CONFIRMED_MODE=config.get("commit_confirmed_mode", 1), COMMIT_CONFIRMED_TIMEOUT=config.get("commit_confirmed_timeout", 300), + COMMIT_CONFIRMED_WAIT=config.get("commit_confirmed_wait", 1), SETTINGS_OVERRIDE=config.get("settings_override", None), ) else: diff --git a/src/cnaas_nms/devicehandler/sync_devices.py b/src/cnaas_nms/devicehandler/sync_devices.py index 8c4819b8..58050d35 100644 --- a/src/cnaas_nms/devicehandler/sync_devices.py +++ b/src/cnaas_nms/devicehandler/sync_devices.py @@ -557,9 +557,33 @@ def push_sync_device( task_args["job_id"] = job_id task_args["confirm_mode_override"] = confirm_mode logger.debug("Commit confirm mode for host {}: {} (dry_run: {})".format(task.host.name, confirm_mode, dry_run)) - task.run(**task_args) - if confirm_mode != 2: - task.host.close_connection("napalm") + try: + task.run(**task_args) + except Exception as e: + logger.exception("Exception while running task napalm_configure for device {}".format(task.host.name)) + raise e + finally: + if confirm_mode != 2: + task.host.close_connection("napalm") + if confirm_mode == 2 and not dry_run: + time.sleep(api_settings.COMMIT_CONFIRMED_WAIT) + try: + task.run(task=napalm_get, getters=["facts"], name="Verify reachability") + except Exception as e: + add_sync_event(task.host.name, "commit_confirm_failed", scheduled_by, job_id) + logger.error( + "Could not reach device {} after commit, rollback in: {}s".format( + task.host.name, api_settings.COMMIT_CONFIRMED_TIMEOUT + ) + ) + raise e + else: + short_facts = {"fqdn": "unknown"} + try: + short_facts["fqdn"] = task.results[2].result["facts"]["fqdn"] + task.results[2].result["facts"] = short_facts + except Exception: + pass if task.results[1].diff: config = task.results[1].host["config"] @@ -855,6 +879,7 @@ def sync_devices( task=push_sync_device, dry_run=dry_run, job_id=job_id, + scheduled_by=scheduled_by, confirm_mode=get_confirm_mode(confirm_mode_override), ) except Exception as e: @@ -881,7 +906,7 @@ def sync_devices( unchanged_hosts = [] # calculate change impact score for host, results in nrresult.items(): - if host in failed_hosts or len(results) != 3: + if host in failed_hosts or len(results) < 3: logger.debug("Unable to calculate change score for failed device {}".format(host)) elif results[2].diff: changed_hosts.append(host) @@ -923,6 +948,12 @@ def sync_devices( remove_sync_events(hostname) dev.last_seen = datetime.datetime.utcnow() if not dry_run and get_confirm_mode(confirm_mode_override) != 2: + if failed_hosts: + logger.error( + "One or more devices failed to commit configuration, they will roll back configuration" + " in {}s: {}".format(api_settings.COMMIT_CONFIRMED_TIMEOUT, ", ".join(failed_hosts)) + ) + time.sleep(api_settings.COMMIT_CONFIRMED_TIMEOUT) logger.info("Releasing lock for devices from syncto job: {}".format(job_id)) Joblock.release_lock(session, job_id=job_id) @@ -958,11 +989,7 @@ def sync_devices( f"{total_change_score} is higher than auto-push limit {AUTOPUSH_MAX_SCORE}" ) elif get_confirm_mode(confirm_mode_override) == 2 and not dry_run: - if not changed_hosts: - logger.info("None of the selected host has any changes (diff), skipping commit-confirm") - logger.info("Releasing lock for devices from syncto job: {}".format(job_id)) - Joblock.release_lock(session, job_id=job_id) - elif len(failed_hosts) > 0: + if failed_hosts: logger.error( "No confirm job scheduled since one or more devices failed in commitmode 2" ", all devices will rollback in {}s".format(api_settings.COMMIT_CONFIRMED_TIMEOUT) @@ -970,6 +997,10 @@ def sync_devices( time.sleep(api_settings.COMMIT_CONFIRMED_TIMEOUT) logger.info("Releasing lock for devices from syncto job: {}".format(job_id)) Joblock.release_lock(session, job_id=job_id) + elif not changed_hosts: + logger.info("None of the selected host has any changes (diff), skipping commit-confirm") + logger.info("Releasing lock for devices from syncto job: {}".format(job_id)) + Joblock.release_lock(session, job_id=job_id) else: scheduler = Scheduler() next_job_id = scheduler.add_onetime_job( diff --git a/src/cnaas_nms/run.py b/src/cnaas_nms/run.py index a83e4ede..55d8dbde 100644 --- a/src/cnaas_nms/run.py +++ b/src/cnaas_nms/run.py @@ -93,7 +93,7 @@ def loglevel_to_rooms(levelname: str) -> List[str]: def parse_redis_event(event): try: - # [stream, [(messageid, {datadict})] + # [stream, [(messageid, {datadict})]] if event[0] == "events": return event[1][0][1] except Exception: # noqa: S110 @@ -115,13 +115,19 @@ def emit_redis_event(event): def thread_websocket_events(): redis: StrictRedis with redis_session() as redis: + last_event = b"$" while True: - result = redis.xread({"events": b"$"}, count=10, block=200) + result = redis.xread({"events": last_event}, count=10, block=200) for item in result: event = parse_redis_event(item) if not event: continue emit_redis_event(event) + try: + # [stream, [(messageid, {datadict})]] + last_event = item[1][0][0] + except Exception: # noqa: S110 + last_event = b"$" if stop_websocket_threads: break diff --git a/src/cnaas_nms/scheduler/scheduler.py b/src/cnaas_nms/scheduler/scheduler.py index fc9300f5..a671b242 100644 --- a/src/cnaas_nms/scheduler/scheduler.py +++ b/src/cnaas_nms/scheduler/scheduler.py @@ -212,6 +212,7 @@ def add_onetime_job( kwargs["job_id"] = job_id kwargs["scheduled_by"] = scheduled_by + kwargs["kwargs"]["scheduled_by"] = scheduled_by if self.use_mule: try: import uwsgi