Transparency monitor (#39)

* Initial implementation without actual data evaluation * Initial implementation without actual data evaluation * Merge branch 'albireox-transparency-monitor' of github.com:sdss/lvmgort into albireox-transparency-monitor * Merge branch 'main' into albireox-transparency-monitor * Merge branch 'main' into albireox-transparency-monitor * Calculate and report transparency status and trend but no action is taken yet * Update lock * Refresh transparency data every 60 seconds * Use updated transparency API query * Try-except post-exposure routine * Add command to report transparency status * Fix import of TransparencyQuality and simplify code * Fix reference to column name in transparency monitor * Fix handling of cancelled observation when the exposure is taken * Merge branch 'main' into albireox-transparency-monitor * Merge branch 'main' into albireox-transparency-monitor * Merge branch 'main' into albireox-transparency-monitor * Use median of last 10 minutes for zp and transparency quality * Implement transparency monitor * Merge branch 'main' into albireox-transparency-monitor * Merge branch 'main' into albireox-transparency-monitor * Add retries to NPS commands * Force refresh guider status before guiding or focusing * Log sci transparency during monitoring * Report transparency quality flag name, not value * Add start/stop-transparency commands * Merge branch 'main' into albireox-transparency-monitor * Do not alert of open door * Release dome lock if dome fails to move * Add LOCKED alert to the list of active alerts * Add timeout for non-critical tasks in handle_unsafe * Better handling of failed or cancelled calibrations * Update CHANGELOG.md
sdss · Nov 30, 2024 · 8e845d2 · 8e845d2
1 parent 1a392c2
commit 8e845d2
Show file tree

Hide file tree

Showing 13 changed files with 597 additions and 52 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,7 @@
 ### 🚀 New
 
 * Add `observer schedule-focus-sweep` command to Overwatcher actor to schedule a focus sweep before the next tile.
+* [#39](https://github.com/sdss/lvmgort/pull/39) Implement transparency monitoring.
 
 ### ✨ Improved
 

diff --git a/src/gort/devices/guider.py b/src/gort/devices/guider.py
@@ -58,6 +58,13 @@ def telescope(self):
 
         return self.gort.telescopes[self.name]
 
+    async def update_status(self):
+        """Queries the guider actor status and returns the flattened reply."""
+
+        return (await self.actor.commands.status()).flatten()
+
     async def _status_cb(self, reply: AMQPReply):
         """Listens to guider keywords and updates the internal state."""
 
@@ -177,6 +184,8 @@ async def focus(
             await self.actor.commands.adjust_focus(reply_callback=reply_callback)
             return
 
+        await self.update_status()
+
         if self.status & GuiderStatus.NON_IDLE:
             self.write_to_log(
                 "Guider is not idle. Stopping it before focusing.",
@@ -279,6 +288,8 @@ async def guide(
 
         self.separation = None
 
+        await self.update_status()
+
         if self.status & GuiderStatus.NON_IDLE:
             raise GortGuiderError(
                 "Guider is not IDLE",

diff --git a/src/gort/etc/actor_schema.json b/src/gort/etc/actor_schema.json
@@ -54,6 +54,21 @@
         "stage": { "oneOf": [{ "type": "string" }, { "type": "null" }] },
         "standard_no": { "oneOf": [{ "type": "number" }, { "type": "null" }] }
       }
+    },
+    "transparency": {
+      "type": "object",
+      "properties": {
+        "telescope": { "type": "string" },
+        "zero_point": { "oneOf": [{ "type": "number" }, { "type": "null" }] },
+        "quality": {
+          "type": "string",
+          "enum": ["BAD", "POOR", "GOOD", "UNKNOWN"]
+        },
+        "trend": {
+          "type": "string",
+          "enum": ["IMPROVING", "WORSENING", "FLAT", "UNKNOWN"]
+        }
+      }
     }
   },
   "additionalProperties": false

diff --git a/src/gort/observer.py b/src/gort/observer.py
@@ -463,7 +463,7 @@ async def observe_tile(
 
         except GortObserverCancelledError:
             write_log("Observation cancelled.", "warning")
-            failed = True
+            failed = len(exposures) == 0
 
         except KeyboardInterrupt:
             write_log("Observation interrupted by user.", "warning")

diff --git a/src/gort/overwatcher/__init__.py b/src/gort/overwatcher/__init__.py
@@ -18,4 +18,5 @@
 from .observer import ObserverOverwatcher
 from .overwatcher import Overwatcher
 from .safety import SafetyOverwatcher
+from .transparency import TransparencyOverwatcher
 from .weather import WeatherOverwatcher
diff --git a/src/gort/overwatcher/actor/commands.py b/src/gort/overwatcher/actor/commands.py
@@ -8,6 +8,8 @@
 
 from __future__ import annotations
 
+import asyncio
+import math
 import time
 
 from typing import TYPE_CHECKING, Any
@@ -229,3 +231,76 @@ async def schedule_focus_sweep(command: OverwatcherCommand):
     command.actor.overwatcher.observer.force_focus = True
 
     return command.finish()
+
+
+# Parent group for the ``transparency`` subcommands registered below.
+@overwatcher_cli.group()
+def transparency():
+    """Transparency commands."""
+
+
+@transparency.command(name="status")
+async def transparency_status(command: OverwatcherCommand):
+    """Reports the transparency status of the science telescope."""
+
+    transparency = command.actor.overwatcher.transparency
+
+    # Values reported when no fresh measurement is available. They are upper
+    # case to match the ``quality``/``trend`` enums declared for the
+    # ``transparency`` keyword in ``actor_schema.json``.
+    mean_zp: float | None = None
+    quality: str = "UNKNOWN"
+    trend: str = "UNKNOWN"
+
+    # Data that has not been refreshed in the last 120 seconds is stale.
+    if transparency.last_updated < time.time() - 120:
+        command.warning("Transparency data is stale.")
+    else:
+        zp = transparency.zero_point["sci"]
+        # A NaN zero point is reported as null.
+        mean_zp = None if math.isnan(zp) else round(zp, 2)
+        quality = transparency.get_quality_string("sci")
+        trend = transparency.get_trend_string("sci")
+
+    # NOTE(review): the schema names this property ``zero_point`` while the
+    # command emits ``mean_zp``; confirm which one consumers expect.
+    return command.finish(
+        transparency={
+            "telescope": "sci",
+            "mean_zp": mean_zp,
+            "quality": quality,
+            "trend": trend,
+        }
+    )
+
+
+@transparency.command()
+async def start_monitoring(command: OverwatcherCommand):
+    """Starts monitoring the transparency."""
+
+    monitor = command.actor.overwatcher.transparency
+
+    # NOTE(review): observer.post_exposure awaits start_monitoring() under a
+    # timeout, which suggests it may only return once the monitor ends. If so,
+    # the message and the loop below run after the fact; confirm.
+    if not monitor.is_monitoring():
+        await monitor.start_monitoring()
+        command.info("Starting transparency monitoring.")
+
+    # Poll once per second so a stop is noticed promptly, and report the
+    # transparency status every 30 polls.
+    seconds_since_report: float = 0
+    while monitor.is_monitoring():
+        await asyncio.sleep(1)
+        seconds_since_report += 1
+
+        if seconds_since_report < 30:
+            continue
+
+        await command.child_command("transparency status")
+        seconds_since_report = 0
+
+    return command.finish("Transparency monitoring has been stopped.")
+
+
+@transparency.command()
+async def stop_monitoring(command: OverwatcherCommand):
+    """Stops monitoring the transparency."""
+
+    monitor = command.actor.overwatcher.transparency
+
+    # Nothing to stop when the monitor is not running; finish either way.
+    if monitor.is_monitoring():
+        await monitor.stop_monitoring()
+
+    return command.finish()
diff --git a/src/gort/overwatcher/alerts.py b/src/gort/overwatcher/alerts.py
@@ -55,6 +55,7 @@ class ActiveAlert(enum.Flag):
     DOOR = enum.auto()
     CAMERA_TEMPERATURE = enum.auto()
     O2 = enum.auto()
+    LOCKED = enum.auto()
     UNKNOWN = enum.auto()
 
 
@@ -139,12 +140,6 @@ def is_safe(self) -> tuple[bool, ActiveAlert]:
             self.log.warning("Alerts data not available. is_safe() returns False.")
             return False, ActiveAlert.UNKNOWN
 
-        # If we have issued a previous unsafe alert, the main task will close the dome
-        # and put a lock for 30 minutes to prevent the dome from opening/closing too
-        # frequently if the weather is unstable.
-        if self.locked_until > 0 and time() < self.locked_until:
-            return False, ActiveAlert(0)
-
         is_safe: bool = True
         active_alerts = ActiveAlert(0)
 
@@ -168,7 +163,6 @@ def is_safe(self) -> tuple[bool, ActiveAlert]:
         # These alerts are not critical but we log them.
         # TODO: maybe we do want to do something about these alerts.
         if self.state.door_alert:
-            self.log.warning("Door alert detected.")
             active_alerts |= ActiveAlert.DOOR
         if self.state.camera_temperature_alert:
             self.log.warning("Camera temperature alert detected.")
@@ -177,6 +171,12 @@ def is_safe(self) -> tuple[bool, ActiveAlert]:
             self.log.warning("O2 alert detected.")
             active_alerts |= ActiveAlert.O2
 
+        # If we have issued a previous unsafe alert, the main task will close the dome
+        # and put a lock for 30 minutes to prevent the dome from opening/closing too
+        # frequently if the weather is unstable.
+        if self.locked_until > 0 and time() < self.locked_until:
+            return False, active_alerts | ActiveAlert.LOCKED
+
         if is_safe:
             self.locked_until = 0
 

diff --git a/src/gort/overwatcher/calibration.py b/src/gort/overwatcher/calibration.py
@@ -475,24 +475,26 @@ async def task(self):
                         self.module.run_calibration(next_calibration)
                     )
                     await self.module._calibration_task
+
                 except asyncio.CancelledError:
-                    await notify(
-                        f"Calibration {name} has been cancelled.",
-                        level="warning",
-                    )
-                    next_calibration.record_state(
-                        CalibrationState.CANCELLED,
-                        fail_reason="calibration cancelled by Overwatcher or user.",
-                    )
+                    if not next_calibration.is_finished():
+                        await notify(
+                            f"Calibration {name} has been cancelled.",
+                            level="warning",
+                        )
+                        next_calibration.record_state(CalibrationState.CANCELLED)
+
                 except Exception as ee:
-                    await notify(
-                        f"Error running calibration {name}: {ee}",
-                        level="error",
-                    )
-                    next_calibration.record_state(
-                        CalibrationState.FAILED,
-                        fail_reason=str(ee),
-                    )
+                    if not next_calibration.is_finished():
+                        await notify(
+                            f"Error running calibration {name}: {ee}",
+                            level="error",
+                        )
+                        next_calibration.record_state(
+                            CalibrationState.FAILED,
+                            fail_reason=str(ee),
+                        )
+
                 finally:
                     if next_calibration.is_finished():
                         dome_closed = await self.module.overwatcher.dome.is_closing()
@@ -711,6 +713,8 @@ async def cancel(self):
             await notify(f"Cancelling calibration {name}.", level="warning")
             self._calibration_task = await cancel_task(self._calibration_task)
 
+            running_calibration.record_state(CalibrationState.CANCELLED)
+
             # Ensure we close the dome. This is allowed even
             # if the overwatcher is disabled.
             if running_calibration.model.close_dome_after:

diff --git a/src/gort/overwatcher/helpers/dome.py b/src/gort/overwatcher/helpers/dome.py
@@ -180,6 +180,11 @@ async def _move(
                     "it may be partially or fully open.",
                     level="critical",
                 )
+
+                # Release the lock here. force_disable() may require closing the dome.
+                if self._move_lock.locked():
+                    self._move_lock.release()
+
                 await self.overwatcher.force_disable()
                 raise
 

diff --git a/src/gort/overwatcher/observer.py b/src/gort/overwatcher/observer.py
@@ -19,6 +19,7 @@
 from gort.exposure import Exposure
 from gort.overwatcher import OverwatcherModule
 from gort.overwatcher.core import OverwatcherModuleTask
+from gort.overwatcher.transparency import TransparencyQuality
 from gort.tile import Tile
 from gort.tools import cancel_task, run_in_executor
 
@@ -236,8 +237,6 @@ async def observe_loop_task(self):
         n_tile_positions = 0
 
         while True:
-            exp: Exposure | bool = False
-
             try:
                 # Wait in case the troubleshooter is doing something.
                 await self.overwatcher.troubleshooter.wait_until_ready(300)
@@ -254,6 +253,8 @@ async def observe_loop_task(self):
                 await self.check_focus(force=n_tile_positions == 0 or self.force_focus)
 
                 for dpos in tile.dither_positions:
+                    exp: Exposure | bool = False
+
                     await self.overwatcher.troubleshooter.wait_until_ready(300)
 
                     if not self.check_twilight():
@@ -293,6 +294,14 @@ async def observe_loop_task(self):
                     if result and len(exps) > 0:
                         exp = exps[0]
 
+                    try:
+                        await self.post_exposure(exp)
+                    except Exception as err:
+                        await self.notify(
+                            f"Failed to run post-exposure routine: {err}",
+                            level="error",
+                        )
+
                     if self.is_cancelling:
                         break
 
@@ -402,3 +411,62 @@ async def pre_observe_checks(self):
             await self.gort.specs.reset()
 
         return True
+
+    async def post_exposure(self, exp: Exposure | bool):
+        """Runs the checks that follow each exposure.
+
+        Logs the science telescope transparency and, when it is flagged as
+        bad, starts the transparency monitor and waits up to one hour for it
+        to end. Observations resume if the transparency is then good;
+        otherwise the overwatcher is shut down and disabled.
+
+        Raises ``GortError`` if ``exp`` is ``False`` (no exposure returned).
+
+        """
+
+        if exp is False:
+            raise GortError("No exposure was returned.")
+
+        # Output transparency data for the last exposure.
+        monitor = self.overwatcher.transparency
+        monitor.write_to_log(["sci"])
+
+        # Nothing else to do if the loop is being cancelled
+        # or the transparency is not flagged as bad.
+        if self._cancelling:
+            return
+
+        if not (monitor.quality["sci"] & TransparencyQuality.BAD):
+            return
+
+        await self.notify(
+            "Transparency is bad. Stopping observations and starting "
+            "the transparency monitor.",
+        )
+
+        # If we reach twilight this will cause the overwatcher
+        # to immediately stop observations.
+        self.exposure_completes = 0
+
+        try:
+            await asyncio.wait_for(monitor.start_monitoring(), timeout=3600)
+        except asyncio.TimeoutError:
+            await self.notify("Transparency monitor timed out.", level="warning")
+            await self.overwatcher.shutdown(
+                reason="Transparency has been bad for over one hour.",
+                disable_overwatcher=True,
+            )
+            return
+
+        # The transparency monitor has ended without timing out.
+
+        # Something stopped the observing loop and with it the monitor.
+        # Do nothing and return. The main task will handle the rest.
+        if self._cancelling:
+            return
+
+        # The transparency is good and the monitor has ended.
+        if monitor.quality["sci"] & TransparencyQuality.GOOD:
+            await self.notify("Transparency is good. Resuming observations.")
+            return
+
+        # The monitor stopped while the transparency is still not good.
+        await self.notify(
+            "Transparency is still bad but the monitor stopped. "
+            "Triggering shutdown.",
+        )
+        await self.overwatcher.shutdown(
+            reason="Transparency monitor failed.",
+            disable_overwatcher=True,
+        )