Skip to content

Commit

Permalink
Kubernetes integration and graceful shutdown (#960)
Browse files Browse the repository at this point in the history
* Refactor server shutdown method

* Move shutdown notification to BroadcastService

* Add framework for graceful shutdown

* Wait for connections to drain during graceful shutdown

* Add signal handler to allow cancellation of drain task

* Kick idle players during graceful shutdown

* Abort connections if they don't log in within a certain timeout

* Prevent new games from being created during graceful shutdown

* Prevent players from joining matchmaker during graceful shutdown

* Add Kubernetes readiness endpoint to control server

* Add health server and refactor control server

* Wait for all games to end during graceful shutdown

* Close games that are in lobby during graceful shutdown

* Notify players when the graceful shutdown period starts

* Add integration test for graceful shutdown

* Add example Kubernetes config file

* Misc refactor

* Add info module for collecting static info
  • Loading branch information
Askaholic authored Oct 22, 2023
1 parent ed5fb65 commit 883dda8
Show file tree
Hide file tree
Showing 34 changed files with 1,122 additions and 260 deletions.
93 changes: 61 additions & 32 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,53 +10,65 @@
import asyncio
import logging
import os
import platform
import signal
import sys
import time
from datetime import datetime
from functools import wraps

import humanize
from docopt import docopt
from prometheus_client import start_http_server

import server
from server import info
from server.config import config
from server.control import ControlServer
from server.game_service import GameService
from server.health import HealthServer
from server.ice_servers.nts import TwilioNTS
from server.player_service import PlayerService
from server.profiler import Profiler
from server.protocol import QDataStreamProtocol, SimpleJsonProtocol


def log_signal(func):
    """
    Decorator that logs every signal before handing it to a signal handler.

    `func` is a handler taking `(sig, frame)`, the signature used with
    `signal.signal`. The returned handler takes the same arguments, logs the
    received signal through the module-level `logger`, then calls `func` and
    passes its return value through unchanged.
    """
    @wraps(func)
    def logging_handler(sig, frame):
        # Wrap the raw signal number in the `signal.Signals` enum for logging
        logger.info("Received signal %s", signal.Signals(sig))
        outcome = func(sig, frame)
        return outcome

    return logging_handler


async def main():
global startup_time, shutdown_time

version = os.environ.get("VERSION") or "dev"
python_version = platform.python_version()

logger.info(
"Lobby %s (Python %s) on %s",
version,
python_version,
sys.platform
"Lobby %s (Python %s) on %s named %s",
info.VERSION,
info.PYTHON_VERSION,
sys.platform,
info.CONTAINER_NAME,
)

if config.ENABLE_METRICS:
logger.info("Using prometheus on port: %i", config.METRICS_PORT)
start_http_server(config.METRICS_PORT)

loop = asyncio.get_running_loop()
done = loop.create_future()

logger.info("Event loop: %s", loop)

def signal_handler(sig: int, _frame):
logger.info(
"Received signal %s, shutting down",
signal.Signals(sig)
)
@log_signal
def done_handler(sig: int, frame):
if not done.done():
done.set_result(0)

# Make sure we can shutdown gracefully
signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, done_handler)
signal.signal(signal.SIGINT, done_handler)

database = server.db.FAFDatabase(
host=config.DB_SERVER,
Expand Down Expand Up @@ -91,19 +103,21 @@ def signal_handler(sig: int, _frame):
config.register_callback("PROFILING_DURATION", profiler.refresh)
config.register_callback("PROFILING_INTERVAL", profiler.refresh)

await instance.start_services()

ctrl_server = await server.run_control_server(player_service, game_service)
health_server = HealthServer(instance)
await health_server.run_from_config()
config.register_callback(
"HEALTH_SERVER_PORT",
health_server.run_from_config
)

async def restart_control_server():
nonlocal ctrl_server
control_server = ControlServer(instance)
await control_server.run_from_config()
config.register_callback(
"CONTROL_SERVER_PORT",
control_server.run_from_config
)

await ctrl_server.shutdown()
ctrl_server = await server.run_control_server(
player_service,
game_service
)
config.register_callback("CONTROL_SERVER_PORT", restart_control_server)
await instance.start_services()

PROTO_CLASSES = {
QDataStreamProtocol.__name__: QDataStreamProtocol,
Expand Down Expand Up @@ -135,8 +149,8 @@ async def restart_control_server():
)

server.metrics.info.info({
"version": version,
"python_version": python_version,
"version": info.VERSION,
"python_version": info.PYTHON_VERSION,
"start_time": datetime.utcnow().strftime("%m-%d %H:%M"),
"game_uid": str(game_service.game_id_counter)
})
Expand All @@ -150,12 +164,27 @@ async def restart_control_server():
shutdown_time = time.perf_counter()

# Cleanup
await instance.shutdown()
await ctrl_server.shutdown()
await instance.graceful_shutdown()

drain_task = asyncio.create_task(instance.drain())

# Close DB connections
@log_signal
def drain_handler(sig: int, frame):
if not drain_task.done():
drain_task.cancel()

# Allow us to force shut down by skipping the drain
signal.signal(signal.SIGTERM, drain_handler)
signal.signal(signal.SIGINT, drain_handler)

await drain_task
await instance.shutdown()
await control_server.shutdown()
await database.close()

# Health server should be the last thing to shut down
await health_server.shutdown()

return exit_code


Expand Down Expand Up @@ -191,7 +220,7 @@ async def restart_control_server():
stop_time = time.perf_counter()
logger.info(
"Total server uptime: %s",
humanize.naturaldelta(stop_time - startup_time)
humanize.precisedelta(stop_time - startup_time)
)

if shutdown_time is not None:
Expand Down
83 changes: 83 additions & 0 deletions minikube-example.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Service exposing the lobby server's client ports.
# Selects pods labelled `app: faf-lobby` and publishes them as a NodePort
# service.
apiVersion: v1
kind: Service
metadata:
name: faf-lobby
labels:
app: faf-lobby
spec:
type: NodePort
selector:
app: faf-lobby
ports:
# NOTE(review): the port names presumably correspond to the
# QDataStreamProtocol and SimpleJsonProtocol listeners - confirm against the
# server's listener configuration.
- port: 8001
name: qstream
- port: 8002
name: simplejson
---
# Deployment running a single replica of the lobby server container.
apiVersion: apps/v1
kind: Deployment
metadata:
name: faf-lobby
spec:
replicas: 1
selector:
matchLabels:
app: faf-lobby
template:
metadata:
labels:
app: faf-lobby
spec:
# NOTE(review): 310 is presumably chosen to be slightly longer than
# SHUTDOWN_GRACE_PERIOD (300) in the ConfigMap of this file, so the server can
# finish draining before the pod is force-killed - confirm.
terminationGracePeriodSeconds: 310
containers:
- name: faf-python-server
image: faf-python-server:graceful
# `Never` means the image is not pulled from a registry; it must already be
# present on the node.
imagePullPolicy: Never
# Readiness is checked with HTTP GET /ready on the container port named
# `health` (2000 below), first after 4 seconds and then every second.
readinessProbe:
httpGet:
path: /ready
port: health
initialDelaySeconds: 4
periodSeconds: 1
ports:
- containerPort: 4000
name: control
- containerPort: 2000
name: health
- containerPort: 8001
name: qstream
- containerPort: 8002
name: simplejson
env:
- name: CONFIGURATION_FILE
value: /config/config.yaml
# CONTAINER_NAME is set to the pod's own `metadata.name`.
- name: CONTAINER_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
# The `config` volume is mounted read-only at /config, which is where
# CONFIGURATION_FILE points.
volumeMounts:
- name: config
mountPath: /config
readOnly: true
volumes:
# The `config` volume projects the `config.yaml` key of the
# `minikube-dev-config` ConfigMap as the file `config.yaml`.
- name: config
configMap:
name: minikube-dev-config
items:
- key: config.yaml
path: config.yaml
---
# ConfigMap holding the server configuration file used by the example
# deployment. Its single key, `config.yaml`, is a literal block containing the
# server settings.
# NOTE(review): DB_SERVER and MQ_SERVER point at `host.minikube.internal`,
# presumably so the database and message queue run on the minikube host
# machine - confirm.
apiVersion: v1
kind: ConfigMap
metadata:
name: minikube-dev-config
data:
config.yaml: |
LOG_LEVEL: TRACE
USE_POLICY_SERVER: false
QUEUE_POP_TIME_MAX: 30
SHUTDOWN_GRACE_PERIOD: 300
SHUTDOWN_KICK_IDLE_PLAYERS: true
DB_SERVER: host.minikube.internal
MQ_SERVER: host.minikube.internal
Loading

0 comments on commit 883dda8

Please sign in to comment.