Skip to content

Commit

Permalink
Remove race with watchdog during backup, restore and update (#4635)
Browse files Browse the repository at this point in the history
* Remove race with watchdog during backup, restore and update

* Fix pylint issues and test

* Stop after image pull during update

* Add test for max failed attempts for plugin watchdog
  • Loading branch information
mdegat01 authored Oct 20, 2023
1 parent 010043f commit 37c1c89
Show file tree
Hide file tree
Showing 7 changed files with 215 additions and 119 deletions.
39 changes: 23 additions & 16 deletions supervisor/addons/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,33 +288,40 @@ async def update(
)

# Update instance
last_state: AddonState = addon.state
old_image = addon.image
# Cache data to prevent races with other updates to global
store = store.clone()

try:
await addon.instance.update(store.version, store.image)
except DockerError as err:
raise AddonsError() from err

_LOGGER.info("Add-on '%s' successfully updated", slug)
self.data.update(store)
# Stop the addon if running
if (last_state := addon.state) in {AddonState.STARTED, AddonState.STARTUP}:
await addon.stop()

# Cleanup
with suppress(DockerError):
await addon.instance.cleanup(
old_image=old_image, image=store.image, version=store.version
)
try:
_LOGGER.info("Add-on '%s' successfully updated", slug)
self.data.update(store)

# Setup/Fix AppArmor profile
await addon.install_apparmor()
# Cleanup
with suppress(DockerError):
await addon.instance.cleanup(
old_image=old_image, image=store.image, version=store.version
)

# restore state
return (
await addon.start()
if last_state in [AddonState.STARTED, AddonState.STARTUP]
else None
)
# Setup/Fix AppArmor profile
await addon.install_apparmor()

finally:
# restore state. Return awaitable for caller if no exception
out = (
await addon.start()
if last_state in {AddonState.STARTED, AddonState.STARTUP}
else None
)
return out

@Job(
name="addon_manager_rebuild",
Expand Down
112 changes: 56 additions & 56 deletions supervisor/addons/addon.py
Original file line number Diff line number Diff line change
Expand Up @@ -782,10 +782,7 @@ async def begin_backup(self) -> bool:

if self.backup_mode == AddonBackupMode.COLD:
_LOGGER.info("Shutdown add-on %s for cold backup", self.slug)
try:
await self.instance.stop()
except DockerError as err:
raise AddonsError() from err
await self.stop()

elif self.backup_pre is not None:
await self._backup_command(self.backup_pre)
Expand Down Expand Up @@ -933,64 +930,67 @@ def _extract_tarfile():

# Stop it first if its running
if await self.instance.is_running():
with suppress(DockerError):
await self.instance.stop()

# Check version / restore image
version = data[ATTR_VERSION]
if not await self.instance.exists():
_LOGGER.info("Restore/Install of image for addon %s", self.slug)

image_file = Path(temp, "image.tar")
if image_file.is_file():
with suppress(DockerError):
await self.instance.import_image(image_file)
else:
with suppress(DockerError):
await self.instance.install(version, restore_image)
await self.instance.cleanup()
elif self.instance.version != version or self.legacy:
_LOGGER.info("Restore/Update of image for addon %s", self.slug)
with suppress(DockerError):
await self.instance.update(version, restore_image)

# Restore data
def _restore_data():
"""Restore data."""
temp_data = Path(temp, "data")
if temp_data.is_dir():
shutil.copytree(temp_data, self.path_data, symlinks=True)
else:
self.path_data.mkdir()
await self.stop()

_LOGGER.info("Restoring data for addon %s", self.slug)
if self.path_data.is_dir():
await remove_data(self.path_data)
try:
await self.sys_run_in_executor(_restore_data)
except shutil.Error as err:
raise AddonsError(
f"Can't restore origin data: {err}", _LOGGER.error
) from err
# Check version / restore image
version = data[ATTR_VERSION]
if not await self.instance.exists():
_LOGGER.info("Restore/Install of image for addon %s", self.slug)

image_file = Path(temp, "image.tar")
if image_file.is_file():
with suppress(DockerError):
await self.instance.import_image(image_file)
else:
with suppress(DockerError):
await self.instance.install(version, restore_image)
await self.instance.cleanup()
elif self.instance.version != version or self.legacy:
_LOGGER.info("Restore/Update of image for addon %s", self.slug)
with suppress(DockerError):
await self.instance.update(version, restore_image)

# Restore data
def _restore_data():
"""Restore data."""
temp_data = Path(temp, "data")
if temp_data.is_dir():
shutil.copytree(temp_data, self.path_data, symlinks=True)
else:
self.path_data.mkdir()

# Restore AppArmor
profile_file = Path(temp, "apparmor.txt")
if profile_file.exists():
_LOGGER.info("Restoring data for addon %s", self.slug)
if self.path_data.is_dir():
await remove_data(self.path_data)
try:
await self.sys_host.apparmor.load_profile(self.slug, profile_file)
except HostAppArmorError as err:
_LOGGER.error(
"Can't restore AppArmor profile for add-on %s", self.slug
)
raise AddonsError() from err
await self.sys_run_in_executor(_restore_data)
except shutil.Error as err:
raise AddonsError(
f"Can't restore origin data: {err}", _LOGGER.error
) from err

# Is add-on loaded
if not self.loaded:
await self.load()
# Restore AppArmor
profile_file = Path(temp, "apparmor.txt")
if profile_file.exists():
try:
await self.sys_host.apparmor.load_profile(
self.slug, profile_file
)
except HostAppArmorError as err:
_LOGGER.error(
"Can't restore AppArmor profile for add-on %s", self.slug
)
raise AddonsError() from err

# Is add-on loaded
if not self.loaded:
await self.load()

# Run add-on
if data[ATTR_STATE] == AddonState.STARTED:
wait_for_start = await self.start()
finally:
# Run add-on
if data[ATTR_STATE] == AddonState.STARTED:
wait_for_start = await self.start()

_LOGGER.info("Finished restore for add-on %s", self.slug)
return wait_for_start
Expand Down
4 changes: 0 additions & 4 deletions supervisor/docker/addon.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,10 +580,6 @@ async def update(
version, image=image, latest=latest, need_build=self.addon.latest_need_build
)

# Stop container & cleanup
with suppress(DockerError):
await self.stop()

@Job(
name="docker_addon_install",
limit=JobExecutionLimit.GROUP_ONCE,
Expand Down
11 changes: 2 additions & 9 deletions supervisor/plugins/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,11 +105,7 @@ async def watchdog_container(self, event: DockerContainerStateEvent) -> None:
if not (event.name == self.instance.name):
return

if event.state in [
ContainerState.FAILED,
ContainerState.STOPPED,
ContainerState.UNHEALTHY,
]:
if event.state in {ContainerState.FAILED, ContainerState.UNHEALTHY}:
await self._restart_after_problem(event.state)

async def _restart_after_problem(self, state: ContainerState):
Expand All @@ -123,10 +119,7 @@ async def _restart_after_problem(self, state: ContainerState):
state,
)
try:
if state == ContainerState.STOPPED and attempts == 0:
await self.start()
else:
await self.rebuild()
await self.rebuild()
except PluginError as err:
attempts = attempts + 1
_LOGGER.error("Watchdog restart of %s plugin failed!", self.slug)
Expand Down
79 changes: 73 additions & 6 deletions tests/addons/test_addon.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,6 @@ def _fire_test_event(coresys: CoreSys, name: str, state: ContainerState):
)


async def mock_stop() -> None:
"""Mock for stop method."""


def test_options_merge(coresys: CoreSys, install_addon_ssh: Addon) -> None:
"""Test options merge."""
addon = coresys.addons.get(TEST_ADDON_SLUG)
Expand Down Expand Up @@ -148,7 +144,7 @@ async def test_addon_watchdog(coresys: CoreSys, install_addon_ssh: Addon) -> Non

# Rebuild if it failed
current_state.return_value = ContainerState.FAILED
with patch.object(DockerAddon, "stop", return_value=mock_stop()) as stop:
with patch.object(DockerAddon, "stop") as stop:
_fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.FAILED)
await asyncio.sleep(0)
stop.assert_called_once_with(remove_container=True)
Expand Down Expand Up @@ -183,7 +179,7 @@ async def test_watchdog_on_stop(coresys: CoreSys, install_addon_ssh: Addon) -> N
DockerAddon,
"current_state",
return_value=ContainerState.STOPPED,
), patch.object(DockerAddon, "stop", return_value=mock_stop()):
), patch.object(DockerAddon, "stop"):
# Do not restart when addon stopped by user
_fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.RUNNING)
await asyncio.sleep(0)
Expand Down Expand Up @@ -515,6 +511,42 @@ async def test_backup_cold_mode(
assert bool(start_task) is (status == "running")


async def test_backup_cold_mode_with_watchdog(
coresys: CoreSys,
install_addon_ssh: Addon,
container: MagicMock,
tmp_supervisor_data,
path_extern,
):
"""Test backing up an addon in cold mode with watchdog active."""
container.status = "running"
install_addon_ssh.watchdog = True
install_addon_ssh.path_data.mkdir()
await install_addon_ssh.load()

# Simulate stop firing the docker event for stopped container like it normally would
async def mock_stop(*args, **kwargs):
container.status = "stopped"
_fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.STOPPED)

# Patching out the normal end of backup process leaves the container in a stopped state
# Watchdog should still not try to restart it though, it should remain this way
tarfile = SecureTarFile(coresys.config.path_tmp / "test.tar.gz", "w")
with patch.object(Addon, "start") as start, patch.object(
Addon, "restart"
) as restart, patch.object(Addon, "end_backup"), patch.object(
DockerAddon, "stop", new=mock_stop
), patch.object(
AddonModel,
"backup_mode",
new=PropertyMock(return_value=AddonBackupMode.COLD),
):
await install_addon_ssh.backup(tarfile)
await asyncio.sleep(0)
start.assert_not_called()
restart.assert_not_called()


@pytest.mark.parametrize("status", ["running", "stopped"])
async def test_restore(
coresys: CoreSys,
Expand Down Expand Up @@ -561,6 +593,41 @@ async def test_restore_while_running(
container.stop.assert_called_once()


async def test_restore_while_running_with_watchdog(
coresys: CoreSys,
install_addon_ssh: Addon,
container: MagicMock,
tmp_supervisor_data,
path_extern,
):
"""Test restore of a running addon with watchdog interference."""
container.status = "running"
coresys.hardware.disk.get_disk_free_space = lambda x: 5000
install_addon_ssh.path_data.mkdir()
install_addon_ssh.watchdog = True
await install_addon_ssh.load()

# Simulate stop firing the docker event for stopped container like it normally would
async def mock_stop(*args, **kwargs):
container.status = "stopped"
_fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.STOPPED)

# We restore a stopped backup so restore will not restart it
# Watchdog will see it stop and should not attempt reanimation either
tarfile = SecureTarFile(get_fixture_path("backup_local_ssh_stopped.tar.gz"), "r")
with patch.object(Addon, "start") as start, patch.object(
Addon, "restart"
) as restart, patch.object(DockerAddon, "stop", new=mock_stop), patch.object(
CpuArch, "supported", new=PropertyMock(return_value=["aarch64"])
), patch.object(
Ingress, "update_hass_panel"
):
await coresys.addons.restore(TEST_ADDON_SLUG, tarfile)
await asyncio.sleep(0)
start.assert_not_called()
restart.assert_not_called()


async def test_start_when_running(
coresys: CoreSys,
install_addon_ssh: Addon,
Expand Down
37 changes: 37 additions & 0 deletions tests/addons/test_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,3 +404,40 @@ async def mock_update(_, version, image, *args, **kwargs):

assert install_addon_ssh.image == "test_image"
assert install_addon_ssh.version == AwesomeVersion("1.1.1")


async def test_watchdog_runs_during_update(
coresys: CoreSys, install_addon_ssh: Addon, container: MagicMock
):
"""Test watchdog running during a long update."""
container.status = "running"
install_addon_ssh.watchdog = True
coresys.store.data.addons["local_ssh"]["image"] = "test_image"
coresys.store.data.addons["local_ssh"]["version"] = AwesomeVersion("1.1.1")
await install_addon_ssh.load()

# Simulate stop firing the docker event for stopped container like it normally would
async def mock_stop(*args, **kwargs):
container.status = "stopped"
coresys.bus.fire_event(
BusEvent.DOCKER_CONTAINER_STATE_CHANGE,
DockerContainerStateEvent(
name=f"addon_{TEST_ADDON_SLUG}",
state=ContainerState.STOPPED,
id="abc123",
time=1,
),
)

# Mock update to just wait and let other tasks run as if it is long running
async def mock_update(*args, **kwargs):
await asyncio.sleep(0)

# Start should be called exactly once by update itself. Restart should never be called
with patch.object(DockerAddon, "stop", new=mock_stop), patch.object(
DockerAddon, "update", new=mock_update
), patch.object(Addon, "start") as start, patch.object(Addon, "restart") as restart:
await coresys.addons.update("local_ssh")
await asyncio.sleep(0)
start.assert_called_once()
restart.assert_not_called()
Loading

0 comments on commit 37c1c89

Please sign in to comment.