diff --git a/supervisor/addons/__init__.py b/supervisor/addons/__init__.py index 4c615e69bcc..606bc7d97a3 100644 --- a/supervisor/addons/__init__.py +++ b/supervisor/addons/__init__.py @@ -288,33 +288,40 @@ async def update( ) # Update instance - last_state: AddonState = addon.state old_image = addon.image # Cache data to prevent races with other updates to global store = store.clone() + try: await addon.instance.update(store.version, store.image) except DockerError as err: raise AddonsError() from err - _LOGGER.info("Add-on '%s' successfully updated", slug) - self.data.update(store) + # Stop the addon if running + if (last_state := addon.state) in {AddonState.STARTED, AddonState.STARTUP}: + await addon.stop() - # Cleanup - with suppress(DockerError): - await addon.instance.cleanup( - old_image=old_image, image=store.image, version=store.version - ) + try: + _LOGGER.info("Add-on '%s' successfully updated", slug) + self.data.update(store) - # Setup/Fix AppArmor profile - await addon.install_apparmor() + # Cleanup + with suppress(DockerError): + await addon.instance.cleanup( + old_image=old_image, image=store.image, version=store.version + ) - # restore state - return ( - await addon.start() - if last_state in [AddonState.STARTED, AddonState.STARTUP] - else None - ) + # Setup/Fix AppArmor profile + await addon.install_apparmor() + + finally: + # restore state. Return awaitable for caller if no exception + out = ( + await addon.start() + if last_state in {AddonState.STARTED, AddonState.STARTUP} + else None + ) + return out @Job( name="addon_manager_rebuild", diff --git a/supervisor/addons/addon.py b/supervisor/addons/addon.py index 07b0dd82b20..be434d0f3be 100644 --- a/supervisor/addons/addon.py +++ b/supervisor/addons/addon.py @@ -782,10 +782,7 @@ async def begin_backup(self) -> bool: if self.backup_mode == AddonBackupMode.COLD: _LOGGER.info("Shutdown add-on %s for cold backup", self.slug) - try: - await self.instance.stop() - except DockerError as err: - raise AddonsError() from err + await self.stop() elif self.backup_pre is not None: await self._backup_command(self.backup_pre) @@ -933,64 +930,67 @@ def _extract_tarfile(): # Stop it first if its running if await self.instance.is_running(): - with suppress(DockerError): - await self.instance.stop() - - # Check version / restore image - version = data[ATTR_VERSION] - if not await self.instance.exists(): - _LOGGER.info("Restore/Install of image for addon %s", self.slug) - - image_file = Path(temp, "image.tar") - if image_file.is_file(): - with suppress(DockerError): - await self.instance.import_image(image_file) - else: - with suppress(DockerError): - await self.instance.install(version, restore_image) - await self.instance.cleanup() - elif self.instance.version != version or self.legacy: - _LOGGER.info("Restore/Update of image for addon %s", self.slug) - with suppress(DockerError): - await self.instance.update(version, restore_image) - - # Restore data - def _restore_data(): - """Restore data.""" - temp_data = Path(temp, "data") - if temp_data.is_dir(): - shutil.copytree(temp_data, self.path_data, symlinks=True) - else: - self.path_data.mkdir() + await self.stop() - _LOGGER.info("Restoring data for addon %s", self.slug) - if self.path_data.is_dir(): - await remove_data(self.path_data) try: - await self.sys_run_in_executor(_restore_data) - except shutil.Error as err: - raise AddonsError( - f"Can't restore origin data: {err}", _LOGGER.error - ) from err + # Check version / restore image + version = data[ATTR_VERSION] + if not await self.instance.exists(): + _LOGGER.info("Restore/Install of image for addon %s", self.slug) + + image_file = Path(temp, "image.tar") + if image_file.is_file(): + with suppress(DockerError): + await self.instance.import_image(image_file) + else: + with suppress(DockerError): + await self.instance.install(version, restore_image) + await self.instance.cleanup() + elif self.instance.version != version or self.legacy: + _LOGGER.info("Restore/Update of image for addon %s", self.slug) + with suppress(DockerError): + await self.instance.update(version, restore_image) + + # Restore data + def _restore_data(): + """Restore data.""" + temp_data = Path(temp, "data") + if temp_data.is_dir(): + shutil.copytree(temp_data, self.path_data, symlinks=True) + else: + self.path_data.mkdir() - # Restore AppArmor - profile_file = Path(temp, "apparmor.txt") - if profile_file.exists(): + _LOGGER.info("Restoring data for addon %s", self.slug) + if self.path_data.is_dir(): + await remove_data(self.path_data) try: - await self.sys_host.apparmor.load_profile(self.slug, profile_file) - except HostAppArmorError as err: - _LOGGER.error( - "Can't restore AppArmor profile for add-on %s", self.slug - ) - raise AddonsError() from err + await self.sys_run_in_executor(_restore_data) + except shutil.Error as err: + raise AddonsError( + f"Can't restore origin data: {err}", _LOGGER.error + ) from err - # Is add-on loaded - if not self.loaded: - await self.load() + # Restore AppArmor + profile_file = Path(temp, "apparmor.txt") + if profile_file.exists(): + try: + await self.sys_host.apparmor.load_profile( + self.slug, profile_file + ) + except HostAppArmorError as err: + _LOGGER.error( + "Can't restore AppArmor profile for add-on %s", self.slug + ) + raise AddonsError() from err + + # Is add-on loaded + if not self.loaded: + await self.load() - # Run add-on - if data[ATTR_STATE] == AddonState.STARTED: - wait_for_start = await self.start() + finally: + # Run add-on + if data[ATTR_STATE] == AddonState.STARTED: + wait_for_start = await self.start() _LOGGER.info("Finished restore for add-on %s", self.slug) return wait_for_start diff --git a/supervisor/docker/addon.py b/supervisor/docker/addon.py index 009fa549b84..3520a180a0b 100644 --- a/supervisor/docker/addon.py +++ b/supervisor/docker/addon.py @@ -580,10 +580,6 @@ async def update( version, image=image, latest=latest, need_build=self.addon.latest_need_build ) - # Stop container & cleanup - with suppress(DockerError): - await self.stop() - @Job( name="docker_addon_install", limit=JobExecutionLimit.GROUP_ONCE, diff --git a/supervisor/plugins/base.py b/supervisor/plugins/base.py index a32805f8d35..2591747139d 100644 --- a/supervisor/plugins/base.py +++ b/supervisor/plugins/base.py @@ -105,11 +105,7 @@ async def watchdog_container(self, event: DockerContainerStateEvent) -> None: if not (event.name == self.instance.name): return - if event.state in [ - ContainerState.FAILED, - ContainerState.STOPPED, - ContainerState.UNHEALTHY, - ]: + if event.state in {ContainerState.FAILED, ContainerState.UNHEALTHY}: await self._restart_after_problem(event.state) async def _restart_after_problem(self, state: ContainerState): @@ -123,10 +119,7 @@ async def _restart_after_problem(self, state: ContainerState): state, ) try: - if state == ContainerState.STOPPED and attempts == 0: - await self.start() - else: - await self.rebuild() + await self.rebuild() except PluginError as err: attempts = attempts + 1 _LOGGER.error("Watchdog restart of %s plugin failed!", self.slug) diff --git a/tests/addons/test_addon.py b/tests/addons/test_addon.py index 0b9a7ea4fab..647d8a91232 100644 --- a/tests/addons/test_addon.py +++ b/tests/addons/test_addon.py @@ -39,10 +39,6 @@ def _fire_test_event(coresys: CoreSys, name: str, state: ContainerState): ) -async def mock_stop() -> None: - """Mock for stop method.""" - - def test_options_merge(coresys: CoreSys, install_addon_ssh: Addon) -> None: """Test options merge.""" addon = coresys.addons.get(TEST_ADDON_SLUG) @@ -148,7 +144,7 @@ async def test_addon_watchdog(coresys: CoreSys, install_addon_ssh: Addon) -> Non # Rebuild if it failed current_state.return_value = ContainerState.FAILED - with patch.object(DockerAddon, "stop", return_value=mock_stop()) as stop: + with patch.object(DockerAddon, "stop") as stop: _fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.FAILED) await asyncio.sleep(0) stop.assert_called_once_with(remove_container=True) @@ -183,7 +179,7 @@ async def test_watchdog_on_stop(coresys: CoreSys, install_addon_ssh: Addon) -> N DockerAddon, "current_state", return_value=ContainerState.STOPPED, - ), patch.object(DockerAddon, "stop", return_value=mock_stop()): + ), patch.object(DockerAddon, "stop"): # Do not restart when addon stopped by user _fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.RUNNING) await asyncio.sleep(0) @@ -515,6 +511,42 @@ async def test_backup_cold_mode( assert bool(start_task) is (status == "running") +async def test_backup_cold_mode_with_watchdog( + coresys: CoreSys, + install_addon_ssh: Addon, + container: MagicMock, + tmp_supervisor_data, + path_extern, +): + """Test backing up an addon in cold mode with watchdog active.""" + container.status = "running" + install_addon_ssh.watchdog = True + install_addon_ssh.path_data.mkdir() + await install_addon_ssh.load() + + # Simulate stop firing the docker event for stopped container like it normally would + async def mock_stop(*args, **kwargs): + container.status = "stopped" + _fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.STOPPED) + + # Patching out the normal end of backup process leaves the container in a stopped state + # Watchdog should still not try to restart it though, it should remain this way + tarfile = SecureTarFile(coresys.config.path_tmp / "test.tar.gz", "w") + with patch.object(Addon, "start") as start, patch.object( + Addon, "restart" + ) as restart, patch.object(Addon, "end_backup"), patch.object( + DockerAddon, "stop", new=mock_stop + ), patch.object( + AddonModel, + "backup_mode", + new=PropertyMock(return_value=AddonBackupMode.COLD), + ): + await install_addon_ssh.backup(tarfile) + await asyncio.sleep(0) + start.assert_not_called() + restart.assert_not_called() + + @pytest.mark.parametrize("status", ["running", "stopped"]) async def test_restore( coresys: CoreSys, @@ -561,6 +593,41 @@ async def test_restore_while_running( container.stop.assert_called_once() +async def test_restore_while_running_with_watchdog( + coresys: CoreSys, + install_addon_ssh: Addon, + container: MagicMock, + tmp_supervisor_data, + path_extern, +): + """Test restore of a running addon with watchdog interference.""" + container.status = "running" + coresys.hardware.disk.get_disk_free_space = lambda x: 5000 + install_addon_ssh.path_data.mkdir() + install_addon_ssh.watchdog = True + await install_addon_ssh.load() + + # Simulate stop firing the docker event for stopped container like it normally would + async def mock_stop(*args, **kwargs): + container.status = "stopped" + _fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.STOPPED) + + # We restore a stopped backup so restore will not restart it + # Watchdog will see it stop and should not attempt reanimation either + tarfile = SecureTarFile(get_fixture_path("backup_local_ssh_stopped.tar.gz"), "r") + with patch.object(Addon, "start") as start, patch.object( + Addon, "restart" + ) as restart, patch.object(DockerAddon, "stop", new=mock_stop), patch.object( + CpuArch, "supported", new=PropertyMock(return_value=["aarch64"]) + ), patch.object( + Ingress, "update_hass_panel" + ): + await coresys.addons.restore(TEST_ADDON_SLUG, tarfile) + await asyncio.sleep(0) + start.assert_not_called() + restart.assert_not_called() + + async def test_start_when_running( coresys: CoreSys, install_addon_ssh: Addon, diff --git a/tests/addons/test_manager.py b/tests/addons/test_manager.py index 2dbf30849a5..ea55e66f7d3 100644 --- a/tests/addons/test_manager.py +++ b/tests/addons/test_manager.py @@ -404,3 +404,40 @@ async def mock_update(_, version, image, *args, **kwargs): assert install_addon_ssh.image == "test_image" assert install_addon_ssh.version == AwesomeVersion("1.1.1") + + +async def test_watchdog_runs_during_update( + coresys: CoreSys, install_addon_ssh: Addon, container: MagicMock +): + """Test watchdog running during a long update.""" + container.status = "running" + install_addon_ssh.watchdog = True + coresys.store.data.addons["local_ssh"]["image"] = "test_image" + coresys.store.data.addons["local_ssh"]["version"] = AwesomeVersion("1.1.1") + await install_addon_ssh.load() + + # Simulate stop firing the docker event for stopped container like it normally would + async def mock_stop(*args, **kwargs): + container.status = "stopped" + coresys.bus.fire_event( + BusEvent.DOCKER_CONTAINER_STATE_CHANGE, + DockerContainerStateEvent( + name=f"addon_{TEST_ADDON_SLUG}", + state=ContainerState.STOPPED, + id="abc123", + time=1, + ), + ) + + # Mock update to just wait and let other tasks run as if it is long running + async def mock_update(*args, **kwargs): + await asyncio.sleep(0) + + # Start should be called exactly once by update itself. Restart should never be called + with patch.object(DockerAddon, "stop", new=mock_stop), patch.object( + DockerAddon, "update", new=mock_update + ), patch.object(Addon, "start") as start, patch.object(Addon, "restart") as restart: + await coresys.addons.update("local_ssh") + await asyncio.sleep(0) + start.assert_called_once() + restart.assert_not_called() diff --git a/tests/plugins/test_plugin_base.py b/tests/plugins/test_plugin_base.py index 0f2400e569a..1e5a7abff8d 100644 --- a/tests/plugins/test_plugin_base.py +++ b/tests/plugins/test_plugin_base.py @@ -1,6 +1,6 @@ """Test base plugin functionality.""" import asyncio -from unittest.mock import Mock, PropertyMock, patch +from unittest.mock import MagicMock, Mock, PropertyMock, patch from awesomeversion import AwesomeVersion import pytest @@ -98,7 +98,7 @@ async def test_plugin_watchdog(coresys: CoreSys, plugin: PluginBase) -> None: start.assert_not_called() rebuild.reset_mock() - # Plugins are restarted anytime they stop, not just on failure + # Stop should be ignored as it means an update or system shutdown, plugins don't stop otherwise current_state.return_value = ContainerState.STOPPED coresys.bus.fire_event( BusEvent.DOCKER_CONTAINER_STATE_CHANGE, @@ -111,9 +111,8 @@ async def test_plugin_watchdog(coresys: CoreSys, plugin: PluginBase) -> None: ) await asyncio.sleep(0) rebuild.assert_not_called() - start.assert_called_once() + start.assert_not_called() - start.reset_mock() # Do not process event if container state has changed since fired current_state.return_value = ContainerState.HEALTHY coresys.bus.fire_event( @@ -155,41 +154,38 @@ async def test_plugin_watchdog(coresys: CoreSys, plugin: PluginBase) -> None: ], indirect=["plugin"], ) -async def test_plugin_watchdog_rebuild_on_failure( - coresys: CoreSys, capture_exception: Mock, plugin: PluginBase, error: PluginError +async def test_plugin_watchdog_max_failed_attempts( + coresys: CoreSys, + capture_exception: Mock, + plugin: PluginBase, + error: PluginError, + container: MagicMock, + caplog: pytest.LogCaptureFixture, ) -> None: - """Test plugin watchdog rebuilds if start fails.""" - with patch.object(type(plugin.instance), "attach"), patch.object( - type(plugin.instance), "is_running", return_value=True - ): + """Test plugin watchdog gives up after max failed attempts.""" + with patch.object(type(plugin.instance), "attach"): await plugin.load() + container.status = "stopped" + container.attrs = {"State": {"ExitCode": 1}} with patch("supervisor.plugins.base.WATCHDOG_RETRY_SECONDS", 0), patch.object( - type(plugin), "rebuild" - ) as rebuild, patch.object( type(plugin), "start", side_effect=error - ) as start, patch.object( - type(plugin.instance), - "current_state", - side_effect=[ - ContainerState.STOPPED, - ContainerState.STOPPED, - ], - ): - coresys.bus.fire_event( - BusEvent.DOCKER_CONTAINER_STATE_CHANGE, + ) as start: + await plugin.watchdog_container( DockerContainerStateEvent( name=plugin.instance.name, - state=ContainerState.STOPPED, + state=ContainerState.FAILED, id="abc123", time=1, - ), + ) ) - await asyncio.sleep(0.1) - start.assert_called_once() - rebuild.assert_called_once() + assert start.call_count == 5 - capture_exception.assert_called_once_with(error) + capture_exception.assert_called_with(error) + assert ( + f"Watchdog cannot restart {plugin.slug} plugin, failed all 5 attempts" + in caplog.text + ) @pytest.mark.parametrize(