From 8f1c99a7dd5a98d35dd1e6341df858b872ab7e55 Mon Sep 17 00:00:00 2001
From: Thomas Carmet <8408330+tcarmet@users.noreply.github.com>
Date: Tue, 4 Jun 2024 11:22:08 -0700
Subject: [PATCH] PTFE-1798 handling errors of no such index (#617)

If the redis instance happens to reboot during the runtime of the runner
manager, redis will need for the `Migrator().run()` function to run to
re-create the indexes. Otherwise we will face an interruption of service
with the error:

```shell
redis.exceptions.ResponseError: runner-manager:runner_manager.models.runner_group.RunnerGroup:index: no such index
```

The following actions were taken in the code:
- Reduce the index interval from one hour to 15 minutes.
- Add more context to the indexing function, with this information.
- When failing to contact the redis instance, on the healtcheck,
schedule a index run for a more dynamic self-healing approach.
---
 .../base/runner-manager/redis/statefulset.yaml      |  2 +-
 runner_manager/jobs/startup.py                      | 13 ++++++++++++-
 runner_manager/models/settings.py                   |  2 +-
 runner_manager/routers/_health.py                   | 10 ++++++++--
 tests/api/test_health.py                            | 10 ++++++++++
 5 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/manifests/base/runner-manager/redis/statefulset.yaml b/manifests/base/runner-manager/redis/statefulset.yaml
index 7b83b9ec..ab6b9742 100644
--- a/manifests/base/runner-manager/redis/statefulset.yaml
+++ b/manifests/base/runner-manager/redis/statefulset.yaml
@@ -19,7 +19,7 @@ spec:
               name: redis
           volumeMounts:
             - name: redis-data
-              mountPath: /var/lib/redis-stack
+              mountPath: /data
           resources:
             requests:
               memory: 256Mi
diff --git a/runner_manager/jobs/startup.py b/runner_manager/jobs/startup.py
index 40a7d349..f75a815a 100644
--- a/runner_manager/jobs/startup.py
+++ b/runner_manager/jobs/startup.py
@@ -54,7 +54,7 @@ def bootstrap_scheduler(
     for job in jobs:
         # Cancel any existing healthcheck jobs
         job_type = job.meta.get("type")
-        if job_type == "healthcheck" or job_type == "migrator" or job_type == "leaks":
+        if job_type == "healthcheck" or job_type == "indexing" or job_type == "leaks":
             log.info(f"Canceling {job_type} job: {job.id}")
             scheduler.cancel(job)
 
@@ -106,7 +106,18 @@ def bootstrap_scheduler(
 
 
 def indexing():
+    """For RedisSearch to work, we need to run the Migrator to create the indexes.
+
+    This job is required when:
+    - Upon the first creation of the redis instance.
+    - A new schema is introduced.
+    - The Redis instance is rebooted.
+    - Changes are made to the RedisSearch schema.
+    """
+
+    log.info("Running indexing job...")
     Migrator().run()
+    log.info("Indexing job complete.")
 
 
 def startup(settings: Settings = get_settings()):
diff --git a/runner_manager/models/settings.py b/runner_manager/models/settings.py
index 30297849..5f96eb13 100644
--- a/runner_manager/models/settings.py
+++ b/runner_manager/models/settings.py
@@ -34,7 +34,7 @@ class Settings(BaseSettings):
     timeout_runner: timedelta = timedelta(minutes=15)
     time_to_live: Optional[timedelta] = timedelta(hours=12)
     healthcheck_interval: timedelta = timedelta(minutes=15)
-    indexing_interval: timedelta = timedelta(hours=1)
+    indexing_interval: timedelta = timedelta(minutes=15)
     github_base_url: Optional[AnyHttpUrl] = Field(default="https://api.github.com")
     github_webhook_secret: Optional[SecretStr] = None
     github_token: Optional[SecretStr] = None
diff --git a/runner_manager/routers/_health.py b/runner_manager/routers/_health.py
index 916ded57..37dde533 100644
--- a/runner_manager/routers/_health.py
+++ b/runner_manager/routers/_health.py
@@ -2,8 +2,10 @@
 
 from fastapi import APIRouter, Depends, Response
 from redis import Redis
+from rq import Queue, Retry
 
-from runner_manager.dependencies import get_redis
+from runner_manager.dependencies import get_queue, get_redis
+from runner_manager.jobs.startup import indexing
 
 router = APIRouter(prefix="/_health")
 
@@ -11,13 +13,17 @@
 
 
 @router.get("/", status_code=200)
-def healthcheck(r: Redis = Depends(get_redis)):
+def healthcheck(r: Redis = Depends(get_redis), queue: Queue = Depends(get_queue)):
     """Healthcheck endpoint that answers to GET requests on /_health"""
 
     try:
         r.ping()
     except Exception as exp:
         log.error("Redis healthcheck failed: %s", exp)
+        # In the case where redis is rebooting
+        # when the service will be back up,
+        # it will need to create indexes for search to work
+        queue.enqueue(indexing, retry=Retry(max=3, interval=[30, 60, 120]))
         return Response(status_code=500)
 
     return Response(status_code=200)
diff --git a/tests/api/test_health.py b/tests/api/test_health.py
index 33b6f555..d719c4b6 100644
--- a/tests/api/test_health.py
+++ b/tests/api/test_health.py
@@ -16,3 +16,13 @@ def test_healthcheck_redis_unavailable(client, fastapp):
     fastapp.dependency_overrides[get_redis] = get_redis
     response = client.get("/_health/")
     assert response.status_code == 200
+
+
+def test_index_job_schedule(client, queue, monkeypatch):
+    """When the healthcheck fails to ping redis, it should schedule an indexing job"""
+    # patch redis ping to raise an exception
+    monkeypatch.setattr(Redis, "ping", lambda self: 1 / 0)
+    finished_jobs = queue.finished_job_registry.count
+    response = client.get("/_health/")
+    assert response.status_code == 500
+    assert queue.finished_job_registry.count == finished_jobs + 1