Skip to content

Commit

Permalink
PTFE-1798 handling errors of no such index (#617)
Browse files Browse the repository at this point in the history
If the Redis instance happens to reboot while the runner manager is
running, the `Migrator().run()` function needs to run again to
re-create the indexes. Otherwise we will face an interruption of service
with the error:

```shell
redis.exceptions.ResponseError: runner-manager:runner_manager.models.runner_group.RunnerGroup:index: no such index
```

The following actions were taken in the code:
- Reduce the index interval from one hour to 15 minutes.
- Add more context to the indexing function, with this information.
- When the healthcheck fails to contact the Redis instance,
schedule an indexing run for a more dynamic self-healing approach.
  • Loading branch information
tcarmet authored Jun 4, 2024
1 parent 81dd407 commit 8f1c99a
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 5 deletions.
2 changes: 1 addition & 1 deletion manifests/base/runner-manager/redis/statefulset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ spec:
name: redis
volumeMounts:
- name: redis-data
mountPath: /var/lib/redis-stack
mountPath: /data
resources:
requests:
memory: 256Mi
Expand Down
13 changes: 12 additions & 1 deletion runner_manager/jobs/startup.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def bootstrap_scheduler(
for job in jobs:
# Cancel any existing healthcheck jobs
job_type = job.meta.get("type")
if job_type == "healthcheck" or job_type == "migrator" or job_type == "leaks":
if job_type == "healthcheck" or job_type == "indexing" or job_type == "leaks":
log.info(f"Canceling {job_type} job: {job.id}")
scheduler.cancel(job)

Expand Down Expand Up @@ -106,7 +106,18 @@ def bootstrap_scheduler(


def indexing():
    """Create (or re-create) the RedisSearch indexes by running the Migrator.

    RedisSearch needs its indexes to work, so this job has to run:
    - when the Redis instance is created for the first time;
    - when a new schema is introduced;
    - after the Redis instance has been rebooted;
    - whenever the RedisSearch schema is changed.
    """
    log.info("Running indexing job...")
    migrator = Migrator()
    migrator.run()
    log.info("Indexing job complete.")


def startup(settings: Settings = get_settings()):
Expand Down
2 changes: 1 addition & 1 deletion runner_manager/models/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class Settings(BaseSettings):
timeout_runner: timedelta = timedelta(minutes=15)
time_to_live: Optional[timedelta] = timedelta(hours=12)
healthcheck_interval: timedelta = timedelta(minutes=15)
indexing_interval: timedelta = timedelta(hours=1)
indexing_interval: timedelta = timedelta(minutes=15)
github_base_url: Optional[AnyHttpUrl] = Field(default="https://api.github.com")
github_webhook_secret: Optional[SecretStr] = None
github_token: Optional[SecretStr] = None
Expand Down
10 changes: 8 additions & 2 deletions runner_manager/routers/_health.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,28 @@

from fastapi import APIRouter, Depends, Response
from redis import Redis
from rq import Queue, Retry

from runner_manager.dependencies import get_redis
from runner_manager.dependencies import get_queue, get_redis
from runner_manager.jobs.startup import indexing

router = APIRouter(prefix="/_health")

log = logging.getLogger(__name__)


@router.get("/", status_code=200)
def healthcheck(r: Redis = Depends(get_redis)):
def healthcheck(r: Redis = Depends(get_redis), queue: Queue = Depends(get_queue)):
    """Healthcheck endpoint that answers to GET requests on /_health.

    Pings Redis and reports the outcome through the HTTP status code. When
    the ping fails, an indexing job is scheduled so that the search indexes
    get re-created once Redis is reachable again.

    Args:
        r: Redis client used for the ping.
        queue: RQ queue on which the indexing job is enqueued.

    Returns:
        Response: status 200 when the ping succeeds, 500 otherwise.
    """
    try:
        r.ping()
    except Exception as exp:
        log.error("Redis healthcheck failed: %s", exp)
        # In the case where redis is rebooting, when the service is back up
        # it will need to create indexes for search to work.
        try:
            queue.enqueue(indexing, retry=Retry(max=3, interval=[30, 60, 120]))
        except Exception as enqueue_exp:
            # RQ stores its jobs in Redis, so scheduling can fail while Redis
            # is unreachable (presumably the same instance that was just
            # pinged -- confirm against get_queue). Keep answering with a
            # clean 500 instead of letting the error escape the handler.
            log.error("Failed to schedule indexing job: %s", enqueue_exp)
        return Response(status_code=500)

    return Response(status_code=200)
10 changes: 10 additions & 0 deletions tests/api/test_health.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,13 @@ def test_healthcheck_redis_unavailable(client, fastapp):
fastapp.dependency_overrides[get_redis] = get_redis
response = client.get("/_health/")
assert response.status_code == 200


def test_index_job_schedule(client, queue, monkeypatch):
    """A failed redis ping in the healthcheck must schedule an indexing job."""

    def _failing_ping(self):
        # Make Redis.ping raise so the healthcheck takes its error path.
        return 1 / 0

    monkeypatch.setattr(Redis, "ping", _failing_ping)
    count_before = queue.finished_job_registry.count
    response = client.get("/_health/")
    assert response.status_code == 500
    assert queue.finished_job_registry.count == count_before + 1

0 comments on commit 8f1c99a

Please sign in to comment.