diff --git a/Makefile b/Makefile index 144f9288..4e271a4d 100644 --- a/Makefile +++ b/Makefile @@ -93,7 +93,7 @@ server-migrate: DJANGO_READ_DOT_ENV_FILE=True $(PYTHON) $(PYDIR)/manage.py migrate -v 3 serve: - DJANGO_READ_DOT_ENV_FILE=True $(PYTHON) $(PYDIR)/manage.py runserver 127.0.0.1:8001 + DJANGO_READ_DOT_ENV_FILE=True $(PYTHON) $(PYDIR)/manage.py runserver 0.0.0.0:8001 server-static: mkdir -p ./yupana/static/client diff --git a/docker-compose.yml b/docker-compose.yml index d1c4d9d5..32f57824 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -33,4 +33,5 @@ services: - '9090:9090' volumes: - './scripts/config/prometheus.yml:/etc/prometheus/prometheus.yml' + - './scripts/config/prometheus_rules.yml:/etc/prometheus/prometheus_rules.yml' image: prom/prometheus \ No newline at end of file diff --git a/scripts/config/prometheus.yml b/scripts/config/prometheus.yml index 732d910b..ea4c1ffc 100644 --- a/scripts/config/prometheus.yml +++ b/scripts/config/prometheus.yml @@ -20,4 +20,7 @@ scrape_configs: # for docker versions under 18.03 onwards you can use host.docker.internal # to point at your localhost from the container # for version 17.06 - 18.03 we use docker.for.mac.localhost - - targets: ['docker.for.mac.localhost:8001'] # Uses /metrics by default + - targets: ['docker.for.mac.localhost:8001','172.17.0.1:8001'] # Uses /metrics by default + +rule_files: + - "prometheus_rules.yml" \ No newline at end of file diff --git a/scripts/config/prometheus_rules.yml b/scripts/config/prometheus_rules.yml new file mode 100644 index 00000000..86044a2d --- /dev/null +++ b/scripts/config/prometheus_rules.yml @@ -0,0 +1,10 @@ +groups: + - name: alerting_rules + rules: + - alert: YupanaProcessorDead + expr: rate(yupana_processor_dead_total[5m]) > 0 + labels: + severity: major + annotations: + summary: "Yupana processor dead" + description: "Yupana processor dead." 
diff --git a/yupana/api/status/view.py b/yupana/api/status/view.py index c29d4018..6b25cde0 100644 --- a/yupana/api/status/view.py +++ b/yupana/api/status/view.py @@ -21,6 +21,7 @@ from processor.processor_utils import (format_message, list_name_of_active_threads, list_name_of_processors) +from prometheus_client import Counter from rest_framework import permissions, status as http_status from rest_framework.decorators import api_view, permission_classes from rest_framework.response import Response @@ -29,6 +30,8 @@ from api.status.serializer import StatusSerializer LOG = logging.getLogger(__name__) +PROCESSOR_DEAD_EXCEPTION = Counter('yupana_processor_dead', + 'Total number of times a yupana processor thread dies') @api_view(['GET', 'HEAD']) @@ -88,6 +91,7 @@ def status(request): active_threads_names = list_name_of_active_threads() if not all(item in active_threads_names for item in total_processors_names): dead_processors = set(total_processors_names).difference(active_threads_names) + PROCESSOR_DEAD_EXCEPTION.inc() LOG.error(format_message('SERVICE STATUS', 'Dead processors - %s' % dead_processors)) return Response('ERROR: Processor thread exited', status=http_status.HTTP_500_INTERNAL_SERVER_ERROR)