Skip to content

Commit

Permalink
Add initial attempts to automatically clean up dangling hosts
Browse files Browse the repository at this point in the history
Now, when workflows fail, Broker will attempt to find a handgling host
and check it in if found.
  • Loading branch information
JacobCallahan committed Nov 26, 2024
1 parent f5b68dc commit 8c7ce23
Show file tree
Hide file tree
Showing 4 changed files with 107 additions and 6 deletions.
2 changes: 1 addition & 1 deletion broker/config_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def _get_migrations(self, force_version=None):
migrations = []
for _, name, _ in pkgutil.iter_modules(config_migrations.__path__):
module = importlib.import_module(f"broker.config_migrations.{name}")
if hasattr(module, "run_migrations"):
if hasattr(module, "run_migrations") and "example" not in module.__name__:
if force_version and force_version == file_name_to_ver(name):
migrations.append(module)
break
Expand Down
25 changes: 25 additions & 0 deletions broker/config_migrations/example_migration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""Config migrations for versions older than 0.6.1 to 0.6.1.
Copy this file to a new file in the same directory and modify it to create a new migration.
The new file must be named `vX_Y_Z.py` where X_Y_Z is the version you are migrating to.
e.g. cp example_migration.py v0_6_1.py
"""

from logzero import logger

TO_VERSION = "0.6.1"


def example_migration(config_dict):
"""Migrations should modify the config_dict in place and return it."""
config_dict["example_key"] = "example_value"
return config_dict


def run_migrations(config_dict):
"""Run all migrations."""
logger.info(f"Running config migrations for {TO_VERSION}.")
config_dict = example_migration(config_dict)
config_dict["_version"] = TO_VERSION
return config_dict
22 changes: 22 additions & 0 deletions broker/config_migrations/v0_6_3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"""Config migrations for versions older than 0.6.3 to 0.6.3."""

from logzero import logger

TO_VERSION = "0.6.3"


def add_dangling_behavior(config_dict):
"""Add the dangling_behavior config to AnsibleTower."""
if "AnsibleTower" in config_dict:
if "dangling_behavior" not in config_dict["AnsibleTower"]:
logger.debug("Adding dangling_behavior to AnsibleTower.")
config_dict["AnsibleTower"]["dangling_behavior"] = "checkin"
return config_dict


def run_migrations(config_dict):
"""Run all migrations."""
logger.info(f"Running config migrations for {TO_VERSION}.")
config_dict = add_dangling_behavior(config_dict)
config_dict["_version"] = TO_VERSION
return config_dict
64 changes: 59 additions & 5 deletions broker/providers/ansible_tower.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@
import click
from dynaconf import Validator
from logzero import logger
from rich.prompt import Prompt

from broker import exceptions
from broker.helpers import eval_filter, find_origin
from broker.helpers import eval_filter, find_origin, update_inventory
from broker.settings import settings

try:
Expand All @@ -23,7 +24,7 @@


def convert_pseudonamespaces(attr_dict):
"""Recursively convert PsuedoNamespace objects into dictionaries."""
"""Recursively convert PseudoNamespace objects into dictionaries."""
out_dict = {}
for key, value in attr_dict.items():
if isinstance(value, awxkit.utils.PseudoNamespace):
Expand Down Expand Up @@ -121,6 +122,7 @@ class AnsibleTower(Provider):
| Validator("ANSIBLETOWER.token", must_exist=True)
),
Validator("ANSIBLETOWER.inventory", default=None),
Validator("ANSIBLETOWER.dangling_behavior", default="checkin"),
]

_checkout_options = [
Expand Down Expand Up @@ -164,11 +166,12 @@ def __init__(self, **kwargs):
self.uname = settings.ANSIBLETOWER.get("username")
self.pword = settings.ANSIBLETOWER.get("password")
self.token = settings.ANSIBLETOWER.get("token")
self.dangling_behavior = settings.ANSIBLETOWER.get("dangling_behavior")
self._inventory = kwargs.get("tower_inventory") or settings.ANSIBLETOWER.inventory
# Init the class itself
config = kwargs.get("config")
root = kwargs.get("root")
self._v2, self.username = get_awxkit_and_uname(
self._v2, self.uname = get_awxkit_and_uname(
config=config,
root=root,
url=self.url,
Expand Down Expand Up @@ -374,6 +377,54 @@ def _get_failure_messages(self, workflow):
else:
return failure_messages

def _try_get_dangling_hosts(self, failed_workflow):
"""Get one or more hosts that may have been left behind by a failed workflow."""
hosts = []
for node in failed_workflow.get_related("workflow_nodes").results:
if not (job_fields := node.summary_fields.get("job", {})) or job_fields.get(
"failed"
): # skip jobs with no summary fields and failed jobs
continue
if jobs := self._v2.jobs.get(id=job_fields["id"]).results:
if vm_name := jobs[0].artifacts.get("vm_name"):
hosts.append(vm_name)
return list(set(hosts))

def handle_dangling_hosts(self, job):
"""Attempt to check in dangling hosts associated with the given job."""
dangling_hosts = self._try_get_dangling_hosts(job)
if not dangling_hosts:
logger.debug("No dangling hosts found for the failed job.")
return
dangling_behavior = self.dangling_behavior
for dangling_host in dangling_hosts:
logger.info(f"Found dangling host: {dangling_host}")
if dangling_behavior == "prompt":
choice = Prompt.ask(
"What would you like to do with this host? [c/s/cA/sA]\n",
"Checkin (c), Store (s), Checkin All (cA), Store All (sA)",
choices=["c", "s", "cA", "sA"],
)
if choice == "cA":
dangling_behavior = "checkin"
elif choice == "sA":
dangling_behavior = "store"
else:
choice = None
# handle checkins
if choice == "c" or dangling_behavior == "checkin":
try:
self.release(dangling_host)
logger.debug(f"Successfully checked in dangling host: {dangling_host}")
except exceptions.BrokerError:
logger.warning(f"Failed to check in dangling host: {dangling_host}")
elif choice == "s" or dangling_behavior == "store":
logger.debug(f"Storing dangling host: {dangling_host}")
host = self._v2.hosts.get(name=dangling_host).results[0]
host = self._compile_host_info(host)
host["failed"] = True
update_inventory(add=host)

def _compile_host_info(self, host):
try:
host_facts = host.related.ansible_facts.get()
Expand Down Expand Up @@ -601,20 +652,23 @@ def execute(self, **kwargs): # noqa: PLR0912,PLR0915 - Possible TODO refactor
logger.info(f"Waiting for job: \nAPI: {job_api_url}\nUI: {job_ui_url}")
job.wait_until_completed(timeout=settings.ANSIBLETOWER.workflow_timeout)
if job.status != "successful":
failure_message = self._get_failure_messages(job)
message_data = {
f"{subject.capitalize()} Status": job.status,
"Reason(s)": self._get_failure_messages(job),
"Reason(s)": failure_message,
"URL": job_ui_url,
}
helpers.emit(message_data)
if "was automatically checked-in" not in failure_message:
self.handle_dangling_hosts(job)
raise JobExecutionError(message_data=message_data["Reason(s)"])
if strategy := kwargs.pop("artifacts", None):
return self._merge_artifacts(job, strategy=strategy)
return job

def get_inventory(self, user=None):
"""Compile a list of hosts based on any inventory a user's name is mentioned."""
user = user or self.username
user = user or self.uname
invs = [
inv
for inv in self._v2.inventory.get(page_size=200).results
Expand Down

0 comments on commit 8c7ce23

Please sign in to comment.