Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bugfix.confirmed 2 failed after push ok #317

Merged
merged 11 commits into from
Sep 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docker/api/config/api.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ mgmtdomain_reserved_count: 5
mgmtdomain_primary_ip_version: 4
commit_confirmed_mode: 1
commit_confirmed_timeout: 300
commit_confirmed_wait: 1
2 changes: 2 additions & 0 deletions docs/configuration/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ Defines parameters for the API:
(see :ref:`commit_confirm_modes`). Defaults to 1.
- commit_confirmed_timeout: Time to wait before rolling back an unconfirmed commit,
specified in seconds. Defaults to 300.
- commit_confirmed_wait: Time to wait between comitting configuration and checking
that the device is still reachable, specified in seconds. Defaults to 1.

/etc/cnaas-nms/repository.yml
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Expand Down
2 changes: 2 additions & 0 deletions src/cnaas_nms/app_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ class ApiSettings(BaseSettings):
MGMTDOMAIN_PRIMARY_IP_VERSION: int = 4
COMMIT_CONFIRMED_MODE: int = 1
COMMIT_CONFIRMED_TIMEOUT: int = 300
COMMIT_CONFIRMED_WAIT: int = 1
SETTINGS_OVERRIDE: Optional[dict] = None

@validator("MGMTDOMAIN_PRIMARY_IP_VERSION")
Expand Down Expand Up @@ -90,6 +91,7 @@ def construct_api_settings() -> ApiSettings:
MGMTDOMAIN_PRIMARY_IP_VERSION=config.get("mgmtdomain_primary_ip_version", 4),
COMMIT_CONFIRMED_MODE=config.get("commit_confirmed_mode", 1),
COMMIT_CONFIRMED_TIMEOUT=config.get("commit_confirmed_timeout", 300),
COMMIT_CONFIRMED_WAIT=config.get("commit_confirmed_wait", 1),
SETTINGS_OVERRIDE=config.get("settings_override", None),
)
else:
Expand Down
49 changes: 40 additions & 9 deletions src/cnaas_nms/devicehandler/sync_devices.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,9 +557,33 @@ def push_sync_device(
task_args["job_id"] = job_id
task_args["confirm_mode_override"] = confirm_mode
logger.debug("Commit confirm mode for host {}: {} (dry_run: {})".format(task.host.name, confirm_mode, dry_run))
task.run(**task_args)
if confirm_mode != 2:
task.host.close_connection("napalm")
try:
task.run(**task_args)
except Exception as e:
logger.exception("Exception while running task napalm_configure for device {}".format(task.host.name))
raise e
finally:
if confirm_mode != 2:
task.host.close_connection("napalm")
if confirm_mode == 2 and not dry_run:
time.sleep(api_settings.COMMIT_CONFIRMED_WAIT)
try:
task.run(task=napalm_get, getters=["facts"], name="Verify reachability")
except Exception as e:
add_sync_event(task.host.name, "commit_confirm_failed", scheduled_by, job_id)
logger.error(
"Could not reach device {} after commit, rollback in: {}s".format(
task.host.name, api_settings.COMMIT_CONFIRMED_TIMEOUT
)
)
raise e
else:
short_facts = {"fqdn": "unknown"}
try:
short_facts["fqdn"] = task.results[2].result["facts"]["fqdn"]
task.results[2].result["facts"] = short_facts
except Exception:
pass

if task.results[1].diff:
config = task.results[1].host["config"]
Expand Down Expand Up @@ -855,6 +879,7 @@ def sync_devices(
task=push_sync_device,
dry_run=dry_run,
job_id=job_id,
scheduled_by=scheduled_by,
confirm_mode=get_confirm_mode(confirm_mode_override),
)
except Exception as e:
Expand All @@ -881,7 +906,7 @@ def sync_devices(
unchanged_hosts = []
# calculate change impact score
for host, results in nrresult.items():
if host in failed_hosts or len(results) != 3:
if host in failed_hosts or len(results) < 3:
logger.debug("Unable to calculate change score for failed device {}".format(host))
elif results[2].diff:
changed_hosts.append(host)
Expand Down Expand Up @@ -923,6 +948,12 @@ def sync_devices(
remove_sync_events(hostname)
dev.last_seen = datetime.datetime.utcnow()
if not dry_run and get_confirm_mode(confirm_mode_override) != 2:
if failed_hosts:
logger.error(
"One or more devices failed to commit configuration, they will roll back configuration"
" in {}s: {}".format(api_settings.COMMIT_CONFIRMED_TIMEOUT, ", ".join(failed_hosts))
)
time.sleep(api_settings.COMMIT_CONFIRMED_TIMEOUT)
logger.info("Releasing lock for devices from syncto job: {}".format(job_id))
Joblock.release_lock(session, job_id=job_id)

Expand Down Expand Up @@ -958,18 +989,18 @@ def sync_devices(
f"{total_change_score} is higher than auto-push limit {AUTOPUSH_MAX_SCORE}"
)
elif get_confirm_mode(confirm_mode_override) == 2 and not dry_run:
if not changed_hosts:
logger.info("None of the selected host has any changes (diff), skipping commit-confirm")
logger.info("Releasing lock for devices from syncto job: {}".format(job_id))
Joblock.release_lock(session, job_id=job_id)
elif len(failed_hosts) > 0:
if failed_hosts:
logger.error(
"No confirm job scheduled since one or more devices failed in commitmode 2"
", all devices will rollback in {}s".format(api_settings.COMMIT_CONFIRMED_TIMEOUT)
)
time.sleep(api_settings.COMMIT_CONFIRMED_TIMEOUT)
logger.info("Releasing lock for devices from syncto job: {}".format(job_id))
Joblock.release_lock(session, job_id=job_id)
elif not changed_hosts:
logger.info("None of the selected host has any changes (diff), skipping commit-confirm")
logger.info("Releasing lock for devices from syncto job: {}".format(job_id))
Joblock.release_lock(session, job_id=job_id)
else:
scheduler = Scheduler()
next_job_id = scheduler.add_onetime_job(
Expand Down
10 changes: 8 additions & 2 deletions src/cnaas_nms/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def loglevel_to_rooms(levelname: str) -> List[str]:

def parse_redis_event(event):
try:
# [stream, [(messageid, {datadict})]
# [stream, [(messageid, {datadict})]]
if event[0] == "events":
return event[1][0][1]
except Exception: # noqa: S110
Expand All @@ -115,13 +115,19 @@ def emit_redis_event(event):
def thread_websocket_events():
redis: StrictRedis
with redis_session() as redis:
last_event = b"$"
while True:
result = redis.xread({"events": b"$"}, count=10, block=200)
result = redis.xread({"events": last_event}, count=10, block=200)
for item in result:
event = parse_redis_event(item)
if not event:
continue
emit_redis_event(event)
try:
# [stream, [(messageid, {datadict})]]
last_event = item[1][0][0]
except Exception: # noqa: S110
last_event = b"$"
if stop_websocket_threads:
break

Expand Down
1 change: 1 addition & 0 deletions src/cnaas_nms/scheduler/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,7 @@ def add_onetime_job(

kwargs["job_id"] = job_id
kwargs["scheduled_by"] = scheduled_by
kwargs["kwargs"]["scheduled_by"] = scheduled_by
if self.use_mule:
try:
import uwsgi
Expand Down