From 8f1c99a7dd5a98d35dd1e6341df858b872ab7e55 Mon Sep 17 00:00:00 2001 From: Thomas Carmet <8408330+tcarmet@users.noreply.github.com> Date: Tue, 4 Jun 2024 11:22:08 -0700 Subject: [PATCH] PTFE-1798 handling errors of no such index (#617) If the redis instance happens to reboot during the runtime of the runner manager, redis will need for the `Migrator().run()` function to run to re-create the indexes. Otherwise we will face an interruption of service with the error: ```shell redis.exceptions.ResponseError: runner-manager:runner_manager.models.runner_group.RunnerGroup:index: no such index ``` The following actions were taken in the code: - Reduce the index interval from one hour to 15 minutes. - Add more context to the indexing function, with this information. - When failing to contact the redis instance, on the healtcheck, schedule a index run for a more dynamic self-healing approach. --- .../base/runner-manager/redis/statefulset.yaml | 2 +- runner_manager/jobs/startup.py | 13 ++++++++++++- runner_manager/models/settings.py | 2 +- runner_manager/routers/_health.py | 10 ++++++++-- tests/api/test_health.py | 10 ++++++++++ 5 files changed, 32 insertions(+), 5 deletions(-) diff --git a/manifests/base/runner-manager/redis/statefulset.yaml b/manifests/base/runner-manager/redis/statefulset.yaml index 7b83b9ec..ab6b9742 100644 --- a/manifests/base/runner-manager/redis/statefulset.yaml +++ b/manifests/base/runner-manager/redis/statefulset.yaml @@ -19,7 +19,7 @@ spec: name: redis volumeMounts: - name: redis-data - mountPath: /var/lib/redis-stack + mountPath: /data resources: requests: memory: 256Mi diff --git a/runner_manager/jobs/startup.py b/runner_manager/jobs/startup.py index 40a7d349..f75a815a 100644 --- a/runner_manager/jobs/startup.py +++ b/runner_manager/jobs/startup.py @@ -54,7 +54,7 @@ def bootstrap_scheduler( for job in jobs: # Cancel any existing healthcheck jobs job_type = job.meta.get("type") - if job_type == "healthcheck" or job_type == "migrator" or job_type == "leaks": + if job_type == "healthcheck" or job_type == "indexing" or job_type == "leaks": log.info(f"Canceling {job_type} job: {job.id}") scheduler.cancel(job) @@ -106,7 +106,18 @@ def bootstrap_scheduler( def indexing(): + """For RedisSearch to work, we need to run the Migrator to create the indexes. + + This job is required when: + - Upon the first creation of the redis instance. + - A new schema is introduced. + - The Redis instance is rebooted. + - Changes are made to the RedisSearch schema. + """ + + log.info("Running indexing job...") Migrator().run() + log.info("Indexing job complete.") def startup(settings: Settings = get_settings()): diff --git a/runner_manager/models/settings.py b/runner_manager/models/settings.py index 30297849..5f96eb13 100644 --- a/runner_manager/models/settings.py +++ b/runner_manager/models/settings.py @@ -34,7 +34,7 @@ class Settings(BaseSettings): timeout_runner: timedelta = timedelta(minutes=15) time_to_live: Optional[timedelta] = timedelta(hours=12) healthcheck_interval: timedelta = timedelta(minutes=15) - indexing_interval: timedelta = timedelta(hours=1) + indexing_interval: timedelta = timedelta(minutes=15) github_base_url: Optional[AnyHttpUrl] = Field(default="https://api.github.com") github_webhook_secret: Optional[SecretStr] = None github_token: Optional[SecretStr] = None diff --git a/runner_manager/routers/_health.py b/runner_manager/routers/_health.py index 916ded57..37dde533 100644 --- a/runner_manager/routers/_health.py +++ b/runner_manager/routers/_health.py @@ -2,8 +2,10 @@ from fastapi import APIRouter, Depends, Response from redis import Redis +from rq import Queue, Retry -from runner_manager.dependencies import get_redis +from runner_manager.dependencies import get_queue, get_redis +from runner_manager.jobs.startup import indexing router = APIRouter(prefix="/_health") @@ -11,13 +13,17 @@ @router.get("/", status_code=200) -def healthcheck(r: Redis = Depends(get_redis)): +def healthcheck(r: Redis = Depends(get_redis), queue: Queue = Depends(get_queue)): """Healthcheck endpoint that answers to GET requests on /_health""" try: r.ping() except Exception as exp: log.error("Redis healthcheck failed: %s", exp) + # In the case where redis is rebooting + # when the service will be back up, + # it will need to create indexes for search to work + queue.enqueue(indexing, retry=Retry(max=3, interval=[30, 60, 120])) return Response(status_code=500) return Response(status_code=200) diff --git a/tests/api/test_health.py b/tests/api/test_health.py index 33b6f555..d719c4b6 100644 --- a/tests/api/test_health.py +++ b/tests/api/test_health.py @@ -16,3 +16,13 @@ def test_healthcheck_redis_unavailable(client, fastapp): fastapp.dependency_overrides[get_redis] = get_redis response = client.get("/_health/") assert response.status_code == 200 + + +def test_index_job_schedule(client, queue, monkeypatch): + """When the healthcheck fails to ping redis, it should schedule an indexing job""" + # patch redis ping to raise an exception + monkeypatch.setattr(Redis, "ping", lambda self: 1 / 0) + finished_jobs = queue.finished_job_registry.count + response = client.get("/_health/") + assert response.status_code == 500 + assert queue.finished_job_registry.count == finished_jobs + 1