Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Graceful handling and notification of errors #191

Merged
merged 10 commits into from
Aug 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions main_2024.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,13 @@ def main() -> int:
manager.start_workers()

while True:
for manager in worker_managers:
result = manager.check_and_restart_dead_workers()
if not result:
frame = inspect.currentframe()
main_logger.error("Failed to restart workers", frame)
return -1

try:
geolocation_data = geolocation_to_main_queue.queue.get_nowait()
except queue.Empty:
Expand Down
2 changes: 1 addition & 1 deletion modules/common
63 changes: 63 additions & 0 deletions utilities/workers/worker_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,18 @@ def get_worker_target(self) -> "(...) -> object": # type: ignore
"""
return self.__target

def get_input_queues(self) -> "list[queue_proxy_wrapper.QueueProxyWrapper]":
"""
Returns the input queues.
"""
return self.__input_queues

def get_target_name(self) -> str:
"""
Returns the name of the target.
"""
return self.__target.__name__


class WorkerManager:
"""
Expand Down Expand Up @@ -146,19 +158,25 @@ def create(
return True, WorkerManager(
cls.__create_key,
workers,
worker_properties,
local_logger,
)

def __init__(
self,
class_private_create_key: object,
workers: "list[mp.Process]",
worker_properties: WorkerProperties,
local_logger: logger.Logger,
) -> None:
"""
Private constructor, use create() method.
"""
assert class_private_create_key is WorkerManager.__create_key, "Use create() method"

self.__workers = workers
self.__worker_properties = worker_properties
self.__local_logger = local_logger

@staticmethod
def __create_single_worker(target: "(...) -> object", args: "tuple", local_logger: logger.Logger) -> "tuple[bool, mp.Process | None]": # type: ignore
Expand Down Expand Up @@ -195,3 +213,48 @@ def join_workers(self) -> None:
"""
for worker in self.__workers:
worker.join()

def check_and_restart_dead_workers(self) -> bool:
"""
Check and restart dead workers.

Returns whether the dead workers were able to be restarted.
"""
new_workers = []
for worker in self.__workers:
if worker.is_alive():
new_workers.append(worker)
continue

# Log dead worker
frame = inspect.currentframe()
target_and_worker_name = f"{self.__worker_properties.get_target_name()} {worker.name}"
self.__local_logger.warning(
f"Worker died, restarting {target_and_worker_name}",
frame,
)

# Create a new worker
result, new_worker = WorkerManager.__create_single_worker(
self.__worker_properties.get_worker_target(),
self.__worker_properties.get_worker_arguments(),
self.__local_logger,
)
if not result:
frame = inspect.currentframe()
self.__local_logger.error(f"Failed to restart {target_and_worker_name}", frame)
return False

# Append the new worker
new_workers.append(new_worker)

self.__workers = new_workers

# Draining the preceding queue ensures that the preceding queue data wasn't what
# caused the worker to fail. Draining the succeeding queues is not needed
# because a worker that died would not have put bad data into the queue.
input_queues = self.__worker_properties.get_input_queues()
for queue in input_queues:
queue.drain_queue()

return True
Loading