-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* add blacklist logic * update readme * delete exited nodes * add ban logic for crashloopbackoff * add ban logic for crashloopbackoff * fix BLACKLIST_RESTART_TTL_SECONDS
- Loading branch information
1 parent
cc1e3b7
commit 9ce9d9f
Showing
10 changed files
with
248 additions
and
57 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,17 @@ | ||
services: | ||
proxy: | ||
container_name: proxy | ||
build: | ||
context: src | ||
env_file: .env | ||
ports: | ||
- 3000:3000 | ||
redis: | ||
image: redis:7.4.0-alpine3.20 | ||
container_name: redis | ||
environment: | ||
- REDIS_PASSWORD=testpassword | ||
- REDIS_USER=testuser | ||
- REDIS_USER_PASSWORD=testuserpassword | ||
ports: | ||
- 6379:6379 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,78 +1,95 @@ | ||
import os | ||
import time | ||
import logging | ||
import json | ||
from jinja2 import Environment, FileSystemLoader | ||
from models.vast import VastController | ||
from models.blacklist import Blacklist | ||
from settings import Settings | ||
from typing import Optional | ||
|
||
logging.basicConfig(level=logging.DEBUG if Settings.log_debug else logging.INFO, format="%(asctime)s:%(levelname)s:%(name)s:%(message)s", datefmt="%Y-%m-%dT%H:%M:%S") | ||
logging.info({"template_name": Settings.template_name, "template_image": Settings.template_image, "pod_name": Settings.pod_name}) | ||
|
||
def getenv(envname: str, default: str | None = None): | ||
v = os.getenv(envname) | ||
if v: | ||
return v | ||
if not default: | ||
raise Exception(f"Env var {envname} is not defined") | ||
return default | ||
|
||
|
||
pod_name = getenv("POD_NAME") | ||
vast_api_key = getenv("VAST_API_KEY") | ||
template_name = getenv("VAST_TEMPLATE_NAME") | ||
template_image = getenv("VAST_TEMPLATE_IMAGE") | ||
nginx_config_path = getenv("NGINX_CONFIG_PATH", "/etc/nginx/http.d/default.conf") | ||
docker_login = getenv("DOCKER_LOGIN") | ||
vast_search_query = json.loads(getenv("VAST_SEARCH_QUERY")) | ||
nginx_listen_port = getenv("NGINX_LISTEN_PORT", "3000") | ||
nginx_max_body_size = getenv("NGINX_MAX_BODY_SIZE", "10M") | ||
log_debug = getenv("DEBUG", "false") | ||
|
||
logging.basicConfig(level=logging.DEBUG if log_debug.lower() == "true" else logging.INFO, format="%(asctime)s:%(levelname)s:%(name)s:%(message)s", datefmt="%Y-%m-%dT%H:%M:%S") | ||
logging.info({"template_name": template_name, "template_image": template_image, "pod_name": pod_name}) | ||
|
||
vast = VastController(api_key=vast_api_key) | ||
vast = VastController(api_key=Settings.vast_api_key) | ||
blacklist: Optional[Blacklist] = None | ||
if Settings.blacklist_enabled: | ||
blacklist = Blacklist(redis_url=Settings.blacklist_redis, ban_ttl=Settings.blacklist_ban_ttl_seconds) | ||
|
||
# get template by name | ||
logging.info(f"Searching template with name '{template_name}' ...") | ||
template = vast.getTemplateByName(template_name) | ||
logging.info(f"Searching template with name '{Settings.template_name}' ...") | ||
template = vast.getTemplateByName(Settings.template_name) | ||
if not template: | ||
raise Exception(f"Template with name '{template_name}' not found") | ||
raise Exception(f"Template with name '{Settings.template_name}' not found") | ||
logging.info(template) | ||
|
||
# check if instance already exists | ||
instance_label = f"k8s_pod={pod_name}" | ||
instance_label = f"k8s_pod={Settings.pod_name}" | ||
logging.info(f"Searching instance with label '{instance_label}' ...") | ||
instance = vast.getInstanceByLabel(instance_label) | ||
if instance: | ||
logging.info(instance) | ||
if instance.image != template_image or instance.template_hash_id != template.id or instance.status == "offline": | ||
logging.info("Destroying instance...") | ||
destroy_reason = "" | ||
if instance.image != Settings.template_image: | ||
destroy_reason = "instance has wrong image" | ||
elif instance.template_hash_id != template.id: | ||
destroy_reason = "instance has wrong template" | ||
elif instance.status in ["offline", "exited"]: | ||
destroy_reason = f"instance is {instance.status}" | ||
elif blacklist and blacklist.isBanned(instance.hostId): | ||
destroy_reason = "instance host is banned" | ||
elif blacklist and blacklist.getAndIncreaseInstanceRestarts(instance.id) > Settings.blacklist_restart_threshold: | ||
logging.info("Too many pod restarts, instance host will be banned for some time...") | ||
blacklist.add(instance.hostId, reason="restarts") | ||
destroy_reason = "too many pod restarts" | ||
if destroy_reason: | ||
logging.info(f"Destroying instance, {destroy_reason}...") | ||
instance.destroy() | ||
if blacklist: | ||
blacklist.cleanInstanceKeys(instance.id) | ||
instance = None | ||
else: | ||
logging.info("Instance image and template are up-to-date") | ||
|
||
# create instance if not exists or not up-to-date | ||
# TODO: find a way to get docker_login from the template | ||
if not instance: | ||
vast.createInstance(template=template, label=instance_label, docker_login=docker_login, search_query=vast_search_query, image=template_image) | ||
vast.createInstance( | ||
template=template, | ||
label=instance_label, | ||
docker_login=Settings.docker_login, | ||
search_query=Settings.vast_search_query, | ||
image=Settings.template_image, | ||
blacklist_host_ids=blacklist.list() if blacklist else [], | ||
) | ||
|
||
# wait for instance become online | ||
wait_start_time = 0 | ||
while True: | ||
instance = vast.getInstanceByLabel(instance_label) | ||
if not instance: | ||
raise Exception("Instance not found") | ||
if not wait_start_time: | ||
wait_start_time = blacklist.getInstanceStartTime(instance.id) if blacklist else int(time.time()) | ||
logging.info(instance) | ||
if instance and instance.status == "running" and instance.port > 0: | ||
if blacklist: | ||
blacklist.delInstanceStartTime(instance.id) | ||
break | ||
logging.info("Waiting for instance become online ...") | ||
wait_time = int(time.time()) - wait_start_time | ||
if blacklist and wait_time > Settings.blacklist_ban_after_seconds: | ||
logging.info("Too long waiting, instance host will be banned for some time...") | ||
blacklist.add(instance.hostId) | ||
instance.destroy() | ||
raise Exception(f"Host {instance.hostId} marked as banned due to long startup waiting") | ||
logging.info(f"Waiting for instance become online ({wait_time} seconds) ...") | ||
time.sleep(10) | ||
logging.info("Instance is ready to accept connections") | ||
|
||
logging.info("Instance is ready to accept connections") | ||
|
||
# generate nginx config | ||
environment = Environment(loader=FileSystemLoader("./"), autoescape=True) | ||
nginx_template = environment.get_template("nginx.conf.j2") | ||
nginx_conf = nginx_template.render({"ip": instance.ip, "port": instance.port, "listen_port": nginx_listen_port, "max_body_size": nginx_max_body_size}) | ||
nginx_conf = nginx_template.render({"ip": instance.ip, "port": instance.port, "listen_port": Settings.nginx_listen_port, "max_body_size": Settings.nginx_max_body_size}) | ||
logging.info(f"Final nginx config:\n{nginx_conf}") | ||
with open(nginx_config_path, "w") as nginx_conf_file: | ||
with open(Settings.nginx_config_path, "w") as nginx_conf_file: | ||
nginx_conf_file.write(nginx_conf) | ||
logging.info("Nginx config saved") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
import redis | ||
import logging | ||
import time | ||
|
||
|
||
class Blacklist:
    """
    Redis-backed bookkeeping for banning hosts.

    Three kinds of records are stored, each under its own key prefix:
    ban records ("ban_<key>"), instance start times ("wait_<instance_id>")
    and instance restart counters ("restarts_<instance_id>").
    Every record is written with a TTL, so stale entries expire on their own.
    """

    # Quoted so that defining the class does not evaluate the redis attribute.
    _redis_conn: "redis.Redis"
    _ban_ttl: int
    _logger: logging.Logger

    _ban_key_prefix: str = "ban_"

    _wait_time_key_prefix: str = "wait_"
    _wait_time_record_ttl: int = 24 * 60 * 60  # 1day

    _restarts_key_prefix: str = "restarts_"
    _restarts_record_ttl: int = 60 * 60  # 1hour

    def __init__(self, redis_url: str, ban_ttl: int = 3600, restarts_ttl: int = 3600):
        """
        Initializes the Blacklist class.
        :param redis_url: URL of the Redis instance.
        :param ban_ttl: Time-to-live (TTL) for each ban record in seconds.
        :param restarts_ttl: Time-to-live (TTL) for each restart counter in seconds.
        """
        self._ban_ttl = ban_ttl
        self._restarts_record_ttl = restarts_ttl
        self._redis_conn = redis.StrictRedis.from_url(url=redis_url)
        self._logger = logging.getLogger("blacklist")
        self._logger.info(f"initialized with ttl={self._ban_ttl}")

    def _getBanKey(self, key: str) -> str:
        """
        Builds the Redis key under which the ban record for ``key`` is stored.
        :param key: The banned identifier.
        :return: The prefixed Redis key.
        """
        return f"{self._ban_key_prefix}{key}"

    def isBanned(self, key: str) -> bool:
        """
        Checks if the given key is in the blacklist.
        :param key: The string to check.
        :return: True if the key is banned, False otherwise.
        """
        return self._redis_conn.exists(self._getBanKey(key)) == 1

    def add(self, key: str, reason: str = "slow_startup"):
        """
        Adds the given key to the blacklist with the configured ban TTL.
        :param key: The string to ban.
        :param reason: Stored as the value of the ban record.
        """
        self._redis_conn.setex(self._getBanKey(key), self._ban_ttl, reason)
        self._logger.info(f"added '{key}' ttl={self._ban_ttl}")

    def list(self) -> list[str]:
        """
        Retrieves all currently banned keys (IDs) from the blacklist.
        :return: A list of currently banned keys, without the key prefix and without duplicates.
        """
        pattern = f"{self._ban_key_prefix}*"
        prefix_len = len(self._ban_key_prefix)
        # SCAN may report the same key more than once during a full iteration;
        # a dict keeps first-seen order while dropping the repeats.
        banned: dict[str, None] = {}
        for raw_key in self._redis_conn.scan_iter(match=pattern, count=100):
            # Keys arrive as bytes unless the client decodes responses itself.
            key = raw_key.decode("utf-8") if isinstance(raw_key, bytes) else raw_key
            banned[key[prefix_len:]] = None
        return [*banned]

    def getInstanceStartTime(self, instance_id: str) -> int:
        """
        Returns the stored start timestamp (Unix seconds) for the given instance.
        If no timestamp is found, the current time is stored as the start time and returned.
        :param instance_id: Vast instance id.
        :return: Start time as a Unix timestamp in seconds.
        """
        key = f"{self._wait_time_key_prefix}{instance_id}"
        start_time_str = self._redis_conn.get(key)

        if start_time_str is None:
            start_time = int(time.time())
            self._redis_conn.setex(key, self._wait_time_record_ttl, str(start_time))
        else:
            start_time = int(start_time_str)

        self._logger.info(f"got '{instance_id}' start time - {start_time}")
        return start_time

    def delInstanceStartTime(self, instance_id: str):
        """
        Deletes the stored start time for the given instance.
        :param instance_id: Vast instance id.
        """
        self._redis_conn.delete(f"{self._wait_time_key_prefix}{instance_id}")
        self._logger.info(f"deleted '{instance_id}' wait time")

    def getAndIncreaseInstanceRestarts(self, instance_id: str) -> int:
        """
        Increments the restart counter of the given instance and refreshes its TTL.
        :param instance_id: Vast instance id.
        :return: The counter value after the increment (1 on the first call).
        """
        key = f"{self._restarts_key_prefix}{instance_id}"
        # INCR is atomic on the server, unlike a separate GET followed by SETEX.
        # Queueing EXPIRE in the same pipeline keeps the counter from being left
        # behind without a TTL.
        pipe = self._redis_conn.pipeline()
        pipe.incr(key)
        pipe.expire(key, self._restarts_record_ttl)
        restarts = int(pipe.execute()[0])
        self._logger.info(f"got '{instance_id}' restarts counter - {restarts}")
        return restarts

    def cleanInstanceKeys(self, instance_id: str):
        """
        Deletes the restart counter and the start time stored for the given instance.
        :param instance_id: Vast instance id.
        """
        # A single DEL with both keys saves a round trip.
        self._redis_conn.delete(
            f"{self._restarts_key_prefix}{instance_id}",
            f"{self._wait_time_key_prefix}{instance_id}",
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,3 +8,4 @@ setuptools==72.1.0 | |
urllib3==2.2.2 | ||
vastai==0.2.5 | ||
wheel==0.44.0 | ||
redis==5.0.8 |
Oops, something went wrong.