From 1c37ad5dc95f2364d43b96ac9be852f2c72079c9 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Tue, 28 Mar 2023 11:00:45 +0200 Subject: [PATCH 01/34] basics of the procesor server --- ocrd/ocrd/cli/__init__.py | 2 + ocrd/ocrd/cli/processor_server.py | 48 ++++++ ocrd_network/ocrd_network/__init__.py | 1 + ocrd_network/ocrd_network/deployer.py | 6 + ocrd_network/ocrd_network/processor_server.py | 151 ++++++++++++++++++ 5 files changed, 208 insertions(+) create mode 100644 ocrd/ocrd/cli/processor_server.py create mode 100644 ocrd_network/ocrd_network/processor_server.py diff --git a/ocrd/ocrd/cli/__init__.py b/ocrd/ocrd/cli/__init__.py index d645daddf3..5d00706404 100644 --- a/ocrd/ocrd/cli/__init__.py +++ b/ocrd/ocrd/cli/__init__.py @@ -33,6 +33,7 @@ def get_help(self, ctx): from .log import log_cli from .processing_server import processing_server_cli from .processing_worker import processing_worker_cli +from .processor_server import processor_server_cli @click.group() @@ -53,3 +54,4 @@ def cli(**kwargs): # pylint: disable=unused-argument cli.add_command(resmgr_cli) cli.add_command(processing_server_cli) cli.add_command(processing_worker_cli) +cli.add_command(processor_server_cli) diff --git a/ocrd/ocrd/cli/processor_server.py b/ocrd/ocrd/cli/processor_server.py new file mode 100644 index 0000000000..b8a812b1e5 --- /dev/null +++ b/ocrd/ocrd/cli/processor_server.py @@ -0,0 +1,48 @@ +""" +OCR-D CLI: start the processor server + +.. click:: ocrd.cli.processor_server:processor_server_cli + :prog: ocrd processor-server + :nested: full +""" +import click +import logging +from ocrd_utils import initLogging +from ocrd_network import ( + DatabaseParamType, + ProcessingServerParamType, + ProcessorServer, +) + + +@click.command('processor-server') +@click.argument('processor_name', required=True, type=click.STRING) +@click.option('-d', '--address', + help='The URL of the processor server, format: host:port', + type=ProcessingServerParamType(), + required=True) +@click.option('-d', '--database', + default="mongodb://localhost:27018", + help='The URL of the MongoDB, format: mongodb://host:port', + type=DatabaseParamType()) +def processor_server_cli(processor_name: str, address: str, database: str): + """ + Start ocr-d processor as a server + """ + initLogging() + # TODO: Remove before the release + logging.getLogger('ocrd.network').setLevel(logging.DEBUG) + + # Note, the address is already validated with the type field + host, port = address.split(':') + + try: + processor_server = ProcessorServer( + processor_name=processor_name, + mongodb_addr=database, + processor_class=None, # For readability purposes assigned here + ) + processor_server.run_server(host=host, port=port, access_log=False) + + except Exception as e: + raise Exception("Processor server has failed with error") from e diff --git a/ocrd_network/ocrd_network/__init__.py b/ocrd_network/ocrd_network/__init__.py index 6cd95dc3cf..e751ecbefb 100644 --- a/ocrd_network/ocrd_network/__init__.py +++ b/ocrd_network/ocrd_network/__init__.py @@ -24,6 +24,7 @@ # the network package. The reason, Mets Server is tightly coupled with the `OcrdWorkspace`. 
from .processing_server import ProcessingServer from .processing_worker import ProcessingWorker +from .processor_server import ProcessorServer from .param_validators import ( DatabaseParamType, ProcessingServerParamType, diff --git a/ocrd_network/ocrd_network/deployer.py b/ocrd_network/ocrd_network/deployer.py index 27ac323fad..75d7852bcf 100644 --- a/ocrd_network/ocrd_network/deployer.py +++ b/ocrd_network/ocrd_network/deployer.py @@ -77,6 +77,8 @@ def deploy_hosts(self, rabbitmq_url: str, mongodb_url: str) -> None: host.config.keypath ) + # TODO: Call the _deploy_processor_server() here and adapt accordingly + for processor in host.config.processors: self._deploy_processing_worker(processor, host, rabbitmq_url, mongodb_url) @@ -112,6 +114,10 @@ def _deploy_processing_worker(self, processor: WorkerConfig, host: HostData, host.pids_docker.append(pid) sleep(0.1) + def _deploy_processor_server(self, mongodb_url: str) -> None: + # TODO: Method for deploying a processor server + pass + def deploy_rabbitmq(self, image: str, detach: bool, remove: bool, ports_mapping: Union[Dict, None] = None) -> str: """Start docker-container with rabbitmq diff --git a/ocrd_network/ocrd_network/processor_server.py b/ocrd_network/ocrd_network/processor_server.py new file mode 100644 index 0000000000..7b83259305 --- /dev/null +++ b/ocrd_network/ocrd_network/processor_server.py @@ -0,0 +1,151 @@ +from contextlib import redirect_stdout +from io import StringIO +from subprocess import run, PIPE +import uvicorn + +from fastapi import FastAPI, HTTPException, status, BackgroundTasks + +from ocrd import Resolver +# from ocrd.processor.helpers import run_processor_from_api, run_cli_from_api +from ocrd_validators import ParameterValidator +from ocrd_utils import ( + get_ocrd_tool_json, + parse_json_string_with_comments, + set_json_key_value_overrides +) + +from .database import ( + DBProcessorJob, + db_get_processing_job, + initiate_database +) +from .models import ( + PYJobInput, + PYJobOutput, + PYOcrdTool, + StateEnum +) + + +class ProcessorServer(FastAPI): + + def __init__(self, processor_name: str, mongodb_addr: str, processor_class=None): + self.processor_name = processor_name + self.db_url = mongodb_addr + self.ProcessorClass = processor_class + self.ocrd_tool = None + self.version = None + + self.version = self.get_version() + self.ocrd_tool = self.get_ocrd_tool() + + if not self.ocrd_tool: + raise Exception(f"The ocrd_tool is empty or missing") + + tags_metadata = [ + { + 'name': 'Processing', + 'description': 'OCR-D Processor Server' + } + ] + + super().__init__( + title=self.ocrd_tool['executable'], + description=self.ocrd_tool['description'], + version=self.version, + openapi_tags=tags_metadata, + on_startup=[self.startup] + ) + + # Create routes + self.router.add_api_route( + path='/', + endpoint=self.get_processor_info, + methods=['GET'], + tags=['Processing'], + status_code=status.HTTP_200_OK, + summary='Get information about this processor.', + response_model=PYOcrdTool, + response_model_exclude_unset=True, + response_model_exclude_none=True + ) + + self.router.add_api_route( + path='/', + endpoint=self.process, + methods=['POST'], + tags=['Processing'], + status_code=status.HTTP_202_ACCEPTED, + summary='Submit a job to this processor.', + response_model=PYJobOutput, + response_model_exclude_unset=True, + response_model_exclude_none=True + ) + + self.router.add_api_route( + path='/{job_id}', + endpoint=self.get_job, + methods=['GET'], + tags=['Processing'], + status_code=status.HTTP_200_OK, + summary='Get 
information about a job based on its ID', + response_model=PYJobOutput, + response_model_exclude_unset=True, + response_model_exclude_none=True + ) + + async def startup(self): + await initiate_database(db_url=self.db_url) + DBProcessorJob.Settings.name = self.processor_name + + async def get_processor_info(self): + return self.ocrd_tool + + async def process(self, data: PYJobInput, background_tasks: BackgroundTasks): + # TODO: Adapt from #884 + pass + + async def get_job(self, processor_name: str, job_id: str) -> PYJobOutput: + """ Return processing job-information from the database + """ + try: + job = await db_get_processing_job(job_id) + return job.to_job_output() + except ValueError: + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + detail=f"Processing job with id '{job_id}' of processor type '{processor_name}' not existing" + ) + + def get_ocrd_tool(self): + if self.ocrd_tool: + return self.ocrd_tool + if self.ProcessorClass: + str_out = StringIO() + with redirect_stdout(str_out): + self.ProcessorClass(workspace=None, dump_json=True) + ocrd_tool = parse_json_string_with_comments(str_out.getvalue()) + else: + ocrd_tool = get_ocrd_tool_json(self.processor_name) + return ocrd_tool + + def get_version(self) -> str: + if self.version: + return self.version + if self.ProcessorClass: + str_out = StringIO() + with redirect_stdout(str_out): + self.ProcessorClass(workspace=None, show_version=True) + version_str = str_out.getvalue() + else: + version_str = run( + [self.processor_name, '--version'], + stdout=PIPE, + check=True, + universal_newlines=True + ).stdout + # the version string is in format: Version %s, ocrd/core %s + return version_str + + def run_server(self, host, port): + uvicorn.run(self, host=host, port=port, access_log=False) From 42f16ab5f0e14b71cadd6ec8f2f37616f8adc5c1 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Tue, 28 Mar 2023 12:08:44 +0200 Subject: [PATCH 02/34] Adapt the fix from #974 --- ocrd_network/ocrd_network/processing_server.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ocrd_network/ocrd_network/processing_server.py b/ocrd_network/ocrd_network/processing_server.py index 988480ee3f..3e01c63e14 100644 --- a/ocrd_network/ocrd_network/processing_server.py +++ b/ocrd_network/ocrd_network/processing_server.py @@ -228,18 +228,17 @@ async def push_processor_job(self, processor_name: str, data: PYJobInput) -> PYJ if processor_name not in self.processor_list: try: - # Only checks if the process queue exists, if not raises ValueError + # Only checks if the process queue exists, if not raises ChannelClosedByBroker self.rmq_publisher.create_queue(processor_name, passive=True) except ChannelClosedByBroker as error: self.log.warning(f"Process queue with id '{processor_name}' not existing: {error}") + # Reconnect publisher - not efficient, but works + # TODO: Revisit when reconnection strategy is implemented + self.connect_publisher(enable_acks=True) raise HTTPException( status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=f"Process queue with id '{processor_name}' not existing" ) - finally: - # Reconnect publisher - not efficient, but works - # TODO: Revisit when reconnection strategy is implemented - self.connect_publisher(enable_acks=True) # validate additional parameters if data.parameters: From ba92c877f06e671e5adf845ee2d8d178bc2bae3e Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Tue, 28 Mar 2023 12:27:25 +0200 Subject: [PATCH 03/34] check defaults, pass shallow copy --- .../ocrd_network/processing_server.py | 
21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/ocrd_network/ocrd_network/processing_server.py b/ocrd_network/ocrd_network/processing_server.py index 3e01c63e14..582c1134cc 100644 --- a/ocrd_network/ocrd_network/processing_server.py +++ b/ocrd_network/ocrd_network/processing_server.py @@ -240,17 +240,16 @@ async def push_processor_job(self, processor_name: str, data: PYJobInput) -> PYJ detail=f"Process queue with id '{processor_name}' not existing" ) - # validate additional parameters - if data.parameters: - ocrd_tool = get_ocrd_tool_json(processor_name) - if not ocrd_tool: - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Processor '{processor_name}' not available. Empty or missing ocrd_tool" - ) - report = ParameterValidator(ocrd_tool).validate(data.parameters) - if not report.is_valid: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=report.errors) + # validate parameters + ocrd_tool = get_ocrd_tool_json(processor_name) + if not ocrd_tool: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Processor '{processor_name}' not available. Empty or missing ocrd_tool" + ) + report = ParameterValidator(ocrd_tool).validate(dict(data.parameters)) + if not report.is_valid: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=report.errors) if bool(data.path_to_mets) == bool(data.workspace_id): raise HTTPException( From 0644b9e187f311fccbfadd3598732d68ab4e32dc Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Tue, 28 Mar 2023 17:38:22 +0200 Subject: [PATCH 04/34] complete processor server internals --- ocrd/ocrd/cli/processor_server.py | 2 +- ocrd_network/ocrd_network/deployer.py | 8 +- ocrd_network/ocrd_network/processor_server.py | 218 +++++++++++++++++- 3 files changed, 215 insertions(+), 13 deletions(-) diff --git a/ocrd/ocrd/cli/processor_server.py b/ocrd/ocrd/cli/processor_server.py index b8a812b1e5..2665fdc972 100644 --- a/ocrd/ocrd/cli/processor_server.py +++ b/ocrd/ocrd/cli/processor_server.py @@ -38,8 +38,8 @@ def processor_server_cli(processor_name: str, address: str, database: str): try: processor_server = ProcessorServer( - processor_name=processor_name, mongodb_addr=database, + processor_name=processor_name, processor_class=None, # For readability purposes assigned here ) processor_server.run_server(host=host, port=port, access_log=False) diff --git a/ocrd_network/ocrd_network/deployer.py b/ocrd_network/ocrd_network/deployer.py index 75d7852bcf..16b89c66b8 100644 --- a/ocrd_network/ocrd_network/deployer.py +++ b/ocrd_network/ocrd_network/deployer.py @@ -90,7 +90,7 @@ def deploy_hosts(self, rabbitmq_url: str, mongodb_url: str) -> None: def _deploy_processing_worker(self, processor: WorkerConfig, host: HostData, rabbitmq_url: str, mongodb_url: str) -> None: - self.log.debug(f"deploy '{processor.deploy_type}' processor: '{processor}' on '{host.config.address}'") + self.log.debug(f"deploy '{processor.deploy_type}' worker: '{processor}' on '{host.config.address}'") for _ in range(processor.count): if processor.deploy_type == DeployType.native: @@ -114,7 +114,11 @@ def _deploy_processing_worker(self, processor: WorkerConfig, host: HostData, host.pids_docker.append(pid) sleep(0.1) - def _deploy_processor_server(self, mongodb_url: str) -> None: + def _deploy_processor_server(self, processor: WorkerConfig, host: HostData, mongodb_url: str) -> None: + self.log.debug(f"deploy '{processor.deploy_type}' processor server: '{processor}' on 
'{host.config.address}'") + + + # TODO: Method for deploying a processor server pass diff --git a/ocrd_network/ocrd_network/processor_server.py b/ocrd_network/ocrd_network/processor_server.py index 7b83259305..03f6cadb7d 100644 --- a/ocrd_network/ocrd_network/processor_server.py +++ b/ocrd_network/ocrd_network/processor_server.py @@ -1,22 +1,29 @@ from contextlib import redirect_stdout +from datetime import datetime from io import StringIO +import json +import logging +from os import environ, getpid from subprocess import run, PIPE +from typing import List import uvicorn from fastapi import FastAPI, HTTPException, status, BackgroundTasks +from ocrd.processor.helpers import run_cli, run_processor from ocrd import Resolver -# from ocrd.processor.helpers import run_processor_from_api, run_cli_from_api from ocrd_validators import ParameterValidator from ocrd_utils import ( + getLogger, get_ocrd_tool_json, - parse_json_string_with_comments, - set_json_key_value_overrides + parse_json_string_with_comments ) from .database import ( DBProcessorJob, db_get_processing_job, + db_get_workspace, + db_update_processing_job, initiate_database ) from .models import ( @@ -25,13 +32,36 @@ PYOcrdTool, StateEnum ) +from .utils import calculate_execution_time, generate_id + +# TODO: Check this again when the logging is refactored +try: + # This env variable must be set before importing from Keras + environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + from tensorflow.keras.utils import disable_interactive_logging + # Enabled interactive logging throws an exception + # due to a call of sys.stdout.flush() + disable_interactive_logging() +except Exception: + # Nothing should be handled here if TF is not available + pass class ProcessorServer(FastAPI): + def __init__(self, mongodb_addr: str, processor_name: str = "", processor_class=None): + if not (processor_name or processor_class): + raise ValueError('Either "processor_name" or "processor_class" must be provided') + + self.log = getLogger(__name__) + # TODO: Provide more flexibility for configuring file logging (i.e. 
via ENV variables) + file_handler = logging.FileHandler(f'/tmp/server_{processor_name}_{getpid()}.log', mode='a') + logging_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + file_handler.setFormatter(logging.Formatter(logging_format)) + file_handler.setLevel(logging.DEBUG) + self.log.addHandler(file_handler) - def __init__(self, processor_name: str, mongodb_addr: str, processor_class=None): - self.processor_name = processor_name self.db_url = mongodb_addr + self.processor_name = processor_name self.ProcessorClass = processor_class self.ocrd_tool = None self.version = None @@ -42,6 +72,9 @@ def __init__(self, processor_name: str, mongodb_addr: str, processor_class=None) if not self.ocrd_tool: raise Exception(f"The ocrd_tool is empty or missing") + if not self.processor_name: + self.processor_name = self.ocrd_tool['executable'] + tags_metadata = [ { 'name': 'Processing', @@ -50,7 +83,7 @@ def __init__(self, processor_name: str, mongodb_addr: str, processor_class=None) ] super().__init__( - title=self.ocrd_tool['executable'], + title=self.processor_name, description=self.ocrd_tool['description'], version=self.version, openapi_tags=tags_metadata, @@ -99,11 +132,73 @@ async def startup(self): DBProcessorJob.Settings.name = self.processor_name async def get_processor_info(self): + if not self.ocrd_tool: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f'Empty or missing ocrd_tool' + ) return self.ocrd_tool - async def process(self, data: PYJobInput, background_tasks: BackgroundTasks): - # TODO: Adapt from #884 - pass + # Note: The Processing server pushes to a queue, while + # the Processor Server creates (pushes to) a background task + async def push_processor_job(self, data: PYJobInput, background_tasks: BackgroundTasks): + if not self.ocrd_tool: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f'Empty or missing ocrd_tool' + ) + report = ParameterValidator(self.ocrd_tool).validate(dict(data.parameters)) + if not report.is_valid: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=report.errors) + + if bool(data.path_to_mets) == bool(data.workspace_id): + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + detail="Either 'path' or 'workspace_id' must be provided, but not both" + ) + + # This check is done to return early in case + # the workspace_id is provided but not existing in the DB + elif data.workspace_id: + try: + await db_get_workspace(data.workspace_id) + except ValueError: + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + detail=f"Workspace with id '{data.workspace_id}' not existing" + ) + + job = DBProcessorJob( + **data.dict(exclude_unset=True, exclude_none=True), + job_id=generate_id(), + processor_name=self.processor_name, + state=StateEnum.queued + ) + await job.insert() + + if self.ProcessorClass: + # Run the processor in the background + background_tasks.add_task( + self.run_processor_from_server, + job_id=job.id, + workspace_id=data.workspace_id, + page_id=data.page_id, + parameter=data.parameters, + input_file_grps=data.input_file_grps, + output_file_grps=data.output_file_grps, + ) + else: + # Run the CLI in the background + background_tasks.add_task( + self.run_cli_from_server, + job_id=job.id, + workspace_id=data.workspace_id, + page_id=data.page_id, + input_file_grps=data.input_file_grps, + output_file_grps=data.output_file_grps, + parameter=data.parameters + ) + return job.to_job_output() async def get_job(self, 
processor_name: str, job_id: str) -> PYJobOutput: """ Return processing job-information from the database @@ -144,8 +239,111 @@ def get_version(self) -> str: check=True, universal_newlines=True ).stdout - # the version string is in format: Version %s, ocrd/core %s return version_str def run_server(self, host, port): uvicorn.run(self, host=host, port=port, access_log=False) + + async def run_cli_from_server( + self, + job_id: str, + processor_name: str, + workspace_id: str, + input_file_grps: List[str], + output_file_grps: List[str], + page_id: str, + parameters: dict + ): + log = getLogger('ocrd.processor.helpers.run_cli_from_api') + + # Turn input/output file groups into a comma separated string + input_file_grps_str = ','.join(input_file_grps) + output_file_grps_str = ','.join(output_file_grps) + + workspace_db = await db_get_workspace(workspace_id) + path_to_mets = workspace_db.workspace_mets_path + workspace = Resolver().workspace_from_url(path_to_mets) + + start_time = datetime.now() + await db_update_processing_job( + job_id=job_id, + state=StateEnum.running, + start_time=start_time + ) + # Execute the processor + return_code = run_cli( + executable=processor_name, + workspace=workspace, + page_id=page_id, + input_file_grp=input_file_grps_str, + output_file_grp=output_file_grps_str, + parameter=json.dumps(parameters), + mets_url=workspace.mets_target + ) + end_time = datetime.now() + # Execution duration in ms + execution_duration = calculate_execution_time(start_time, end_time) + + if return_code != 0: + job_state = StateEnum.failed + log.error(f'{self.processor_name} exited with non-zero return value {return_code}.') + else: + job_state = StateEnum.success + + await db_update_processing_job( + job_id=job_id, + state=job_state, + end_time=end_time, + exec_time=f'{execution_duration} ms' + ) + + async def run_processor_from_server( + self, + job_id: str, + workspace_id: str, + input_file_grps: List[str], + output_file_grps: List[str], + page_id: str, + parameters: dict, + ): + log = getLogger('ocrd.processor.helpers.run_processor_from_api') + + # Turn input/output file groups into a comma separated string + input_file_grps_str = ','.join(input_file_grps) + output_file_grps_str = ','.join(output_file_grps) + + workspace_db = await db_get_workspace(workspace_id) + path_to_mets = workspace_db.workspace_mets_path + workspace = Resolver().workspace_from_url(path_to_mets) + + is_success = True + start_time = datetime.now() + await db_update_processing_job( + job_id=job_id, + state=StateEnum.running, + start_time=start_time + ) + try: + run_processor( + processorClass=self.ProcessorClass, + workspace=workspace, + page_id=page_id, + parameter=parameters, + input_file_grp=input_file_grps_str, + output_file_grp=output_file_grps_str, + instance_caching=True + ) + except Exception as e: + is_success = False + log.exception(e) + + end_time = datetime.now() + # Execution duration in ms + execution_duration = calculate_execution_time(start_time, end_time) + job_state = StateEnum.success if is_success else StateEnum.failed + await db_update_processing_job( + job_id=job_id, + state=job_state, + end_time=end_time, + exec_time=f'{execution_duration} ms' + ) From cdb73d5b53620fad609b2960a5c78f59e369c2be Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 29 Mar 2023 01:13:15 +0200 Subject: [PATCH 05/34] implement most of the logic for everything --- ocrd/ocrd/cli/processing_worker.py | 13 +- ocrd/ocrd/cli/processor_server.py | 20 +- ocrd/ocrd/decorators/__init__.py | 215 ++++++++++-------- 
ocrd/ocrd/decorators/ocrd_cli_options.py | 4 +- ocrd_network/ocrd_network/deployer.py | 134 ++++++++--- .../ocrd_network/deployment_config.py | 16 ++ ocrd_network/ocrd_network/deployment_utils.py | 9 +- ocrd_network/ocrd_network/models/job.py | 3 + .../ocrd_network/processing_server.py | 166 +++++++++++--- .../ocrd_network/processing_worker.py | 3 +- ocrd_network/ocrd_network/processor_server.py | 38 ++-- .../processing_server_config.schema.yml | 34 ++- 12 files changed, 467 insertions(+), 188 deletions(-) diff --git a/ocrd/ocrd/cli/processing_worker.py b/ocrd/ocrd/cli/processing_worker.py index e9311e061f..3e568b268a 100644 --- a/ocrd/ocrd/cli/processing_worker.py +++ b/ocrd/ocrd/cli/processing_worker.py @@ -20,15 +20,22 @@ @click.command('processing-worker') @click.argument('processor_name', required=True, type=click.STRING) +@click.option('--agent_type', + help='The type of this network agent', + default="worker", + type=click.STRING, + required=True) @click.option('-q', '--queue', default="amqp://admin:admin@localhost:5672/", help='The URL of the Queue Server, format: amqp://username:password@host:port/vhost', - type=QueueServerParamType()) + type=QueueServerParamType(), + required=True) @click.option('-d', '--database', default="mongodb://localhost:27018", help='The URL of the MongoDB, format: mongodb://host:port', - type=DatabaseParamType()) -def processing_worker_cli(processor_name: str, queue: str, database: str): + type=DatabaseParamType(), + required=True) +def processing_worker_cli(processor_name: str, agent_type: str, queue: str, database: str): """ Start a processing worker (a specific ocr-d processor) """ diff --git a/ocrd/ocrd/cli/processor_server.py b/ocrd/ocrd/cli/processor_server.py index 2665fdc972..cbff6a8f91 100644 --- a/ocrd/ocrd/cli/processor_server.py +++ b/ocrd/ocrd/cli/processor_server.py @@ -17,15 +17,21 @@ @click.command('processor-server') @click.argument('processor_name', required=True, type=click.STRING) -@click.option('-d', '--address', +@click.option('--agent_type', + help='The type of this network agent', + default="server", + type=click.STRING, + required=True) +@click.option('--agent_address', help='The URL of the processor server, format: host:port', type=ProcessingServerParamType(), required=True) @click.option('-d', '--database', default="mongodb://localhost:27018", help='The URL of the MongoDB, format: mongodb://host:port', - type=DatabaseParamType()) -def processor_server_cli(processor_name: str, address: str, database: str): + type=DatabaseParamType(), + required=True) +def processor_server_cli(processor_name: str, agent_type: str, agent_address: str, database: str): """ Start ocr-d processor as a server """ @@ -33,16 +39,14 @@ def processor_server_cli(processor_name: str, address: str, database: str): # TODO: Remove before the release logging.getLogger('ocrd.network').setLevel(logging.DEBUG) - # Note, the address is already validated with the type field - host, port = address.split(':') - try: + # TODO: Better validate that inside the ProcessorServer itself + host, port = agent_address.split(':') processor_server = ProcessorServer( mongodb_addr=database, processor_name=processor_name, processor_class=None, # For readability purposes assigned here ) - processor_server.run_server(host=host, port=port, access_log=False) - + processor_server.run_server(host=host, port=int(port)) except Exception as e: raise Exception("Processor server has failed with error") from e diff --git a/ocrd/ocrd/decorators/__init__.py b/ocrd/ocrd/decorators/__init__.py index 
2cffe12fe3..1aaeedaf19 100644 --- a/ocrd/ocrd/decorators/__init__.py +++ b/ocrd/ocrd/decorators/__init__.py @@ -15,7 +15,7 @@ from ocrd_utils import getLogger, initLogging, parse_json_string_with_comments from ocrd_validators import WorkspaceValidator -from ocrd_network import ProcessingWorker +from ocrd_network import ProcessingWorker, ProcessorServer from ..resolver import Resolver from ..processor.base import run_processor @@ -38,8 +38,12 @@ def ocrd_cli_wrap_processor( overwrite=False, show_resource=None, list_resources=False, + # ocrd_network params start # + agent_type=None, + agent_address=None, queue=None, database=None, + # ocrd_network params end # **kwargs ): if not sys.argv[1:]: @@ -56,96 +60,123 @@ def ocrd_cli_wrap_processor( list_resources=list_resources ) sys.exit() - # If either of these two is provided but not both - if bool(queue) != bool(database): - raise Exception("Options --queue and --database require each other.") - # If both of these are provided - start the processing worker instead of the processor - processorClass - if queue and database: - initLogging() - # TODO: Remove before the release - # We are importing the logging here because it's not the ocrd logging but python one - import logging - logging.getLogger('ocrd.network').setLevel(logging.DEBUG) - # Get the ocrd_tool dictionary - processor = processorClass(workspace=None, dump_json=True) - ocrd_tool = processor.ocrd_tool + # Used for checking/starting network agents for the WebAPI architecture + # Has no side effects if neither of the 4 ocrd_network parameters are passed + check_and_run_network_agent(processorClass, agent_type, agent_address, database, queue) - try: - processing_worker = ProcessingWorker( - rabbitmq_addr=queue, - mongodb_addr=database, - processor_name=ocrd_tool['executable'], - ocrd_tool=ocrd_tool, - processor_class=processorClass, - ) - # The RMQConsumer is initialized and a connection to the RabbitMQ is performed - processing_worker.connect_consumer() - # Start consuming from the queue with name `processor_name` - processing_worker.start_consuming() - except Exception as e: - raise Exception("Processing worker has failed with error") from e - else: - initLogging() - LOG = getLogger('ocrd_cli_wrap_processor') - # LOG.info('kwargs=%s' % kwargs) - # Merge parameter overrides and parameters - if 'parameter_override' in kwargs: - set_json_key_value_overrides(kwargs['parameter'], *kwargs['parameter_override']) - # TODO OCR-D/core#274 - # Assert -I / -O - # if not kwargs['input_file_grp']: - # raise ValueError('-I/--input-file-grp is required') - # if not kwargs['output_file_grp']: - # raise ValueError('-O/--output-file-grp is required') - resolver = Resolver() - working_dir, mets, _ = resolver.resolve_mets_arguments(working_dir, mets, None) - workspace = resolver.workspace_from_url(mets, working_dir) - page_id = kwargs.get('page_id') - # XXX not possible while processors do not adhere to # https://github.com/OCR-D/core/issues/505 - # if overwrite - # if 'output_file_grp' not in kwargs or not kwargs['output_file_grp']: - # raise Exception("--overwrite requires --output-file-grp") - # LOG.info("Removing files because of --overwrite") - # for grp in kwargs['output_file_grp'].split(','): - # if page_id: - # for one_page_id in kwargs['page_id'].split(','): - # LOG.debug("Removing files in output file group %s with page ID %s", grp, one_page_id) - # for file in workspace.mets.find_files(pageId=one_page_id, fileGrp=grp): - # workspace.remove_file(file, force=True, keep_file=False, 
page_recursive=True) - # else: - # LOG.debug("Removing all files in output file group %s ", grp) - # # TODO: can be reduced to `page_same_group=True` as soon as core#505 has landed (in all processors) - # workspace.remove_file_group(grp, recursive=True, force=True, keep_files=False, page_recursive=True, page_same_group=False) - # workspace.save_mets() - # XXX While https://github.com/OCR-D/core/issues/505 is open, set 'overwrite_mode' globally on the workspace - if overwrite: - workspace.overwrite_mode = True - report = WorkspaceValidator.check_file_grp(workspace, kwargs['input_file_grp'], '' if overwrite else kwargs['output_file_grp'], page_id) - if not report.is_valid: - raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors)) - # Set up profiling behavior from environment variables/flags - if not profile and 'OCRD_PROFILE' in environ: - if 'CPU' in environ['OCRD_PROFILE']: - profile = True - if not profile_file and 'OCRD_PROFILE_FILE' in environ: - profile_file = environ['OCRD_PROFILE_FILE'] - if profile or profile_file: - import cProfile - import pstats - import io - import atexit - print("Profiling...") - pr = cProfile.Profile() - pr.enable() - def exit(): - pr.disable() - print("Profiling completed") - if profile_file: - with open(profile_file, 'wb') as f: - pr.dump_stats(profile_file) - s = io.StringIO() - pstats.Stats(pr, stream=s).sort_stats("cumulative").print_stats() - print(s.getvalue()) - atexit.register(exit) - run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs) + initLogging() + LOG = getLogger('ocrd_cli_wrap_processor') + # LOG.info('kwargs=%s' % kwargs) + # Merge parameter overrides and parameters + if 'parameter_override' in kwargs: + set_json_key_value_overrides(kwargs['parameter'], *kwargs['parameter_override']) + # TODO OCR-D/core#274 + # Assert -I / -O + # if not kwargs['input_file_grp']: + # raise ValueError('-I/--input-file-grp is required') + # if not kwargs['output_file_grp']: + # raise ValueError('-O/--output-file-grp is required') + resolver = Resolver() + working_dir, mets, _ = resolver.resolve_mets_arguments(working_dir, mets, None) + workspace = resolver.workspace_from_url(mets, working_dir) + page_id = kwargs.get('page_id') + # XXX not possible while processors do not adhere to # https://github.com/OCR-D/core/issues/505 + # if overwrite + # if 'output_file_grp' not in kwargs or not kwargs['output_file_grp']: + # raise Exception("--overwrite requires --output-file-grp") + # LOG.info("Removing files because of --overwrite") + # for grp in kwargs['output_file_grp'].split(','): + # if page_id: + # for one_page_id in kwargs['page_id'].split(','): + # LOG.debug("Removing files in output file group %s with page ID %s", grp, one_page_id) + # for file in workspace.mets.find_files(pageId=one_page_id, fileGrp=grp): + # workspace.remove_file(file, force=True, keep_file=False, page_recursive=True) + # else: + # LOG.debug("Removing all files in output file group %s ", grp) + # # TODO: can be reduced to `page_same_group=True` as soon as core#505 has landed (in all processors) + # workspace.remove_file_group(grp, recursive=True, force=True, keep_files=False, page_recursive=True, page_same_group=False) + # workspace.save_mets() + # XXX While https://github.com/OCR-D/core/issues/505 is open, set 'overwrite_mode' globally on the workspace + if overwrite: + workspace.overwrite_mode = True + report = WorkspaceValidator.check_file_grp(workspace, kwargs['input_file_grp'], '' if overwrite else kwargs['output_file_grp'], page_id) 
+ if not report.is_valid: + raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors)) + # Set up profiling behavior from environment variables/flags + if not profile and 'OCRD_PROFILE' in environ: + if 'CPU' in environ['OCRD_PROFILE']: + profile = True + if not profile_file and 'OCRD_PROFILE_FILE' in environ: + profile_file = environ['OCRD_PROFILE_FILE'] + if profile or profile_file: + import cProfile + import pstats + import io + import atexit + print("Profiling...") + pr = cProfile.Profile() + pr.enable() + def exit(): + pr.disable() + print("Profiling completed") + if profile_file: + with open(profile_file, 'wb') as f: + pr.dump_stats(profile_file) + s = io.StringIO() + pstats.Stats(pr, stream=s).sort_stats("cumulative").print_stats() + print(s.getvalue()) + atexit.register(exit) + run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs) + + +def check_and_run_network_agent(ProcessorClass, agent_type: str, agent_address: str, database: str, queue: str): + if not agent_type and (agent_address or database or queue): + raise ValueError("Options '--database', '--queue', and 'agent_address' are valid only with '--agent_type'") + if agent_type: + if not database: + raise ValueError("Options '--agent_type' and '--database' are mutually inclusive") + allowed_agent_types = ['server', 'worker'] + if agent_type not in allowed_agent_types: + agents_str = ', '.join(allowed_agent_types) + raise ValueError(f"Wrong agent type parameter. Allowed agent types: {agents_str}") + if agent_type == 'server': + if not agent_address: + raise ValueError("Options '--agent_type=server' and '--agent_address' are mutually inclusive") + if queue: + raise ValueError("Options '--agent_type=server' and '--queue' are mutually exclusive") + if agent_type == 'worker': + if not queue: + raise ValueError("Options '--agent_type=worker' and '--queue' are mutually inclusive") + if agent_address: + raise ValueError("Options '--agent_type=worker' and '--agent_address' are mutually exclusive") + + processor = ProcessorClass(workspace=None, dump_json=True) + if agent_type == 'worker': + try: + # TODO: Passing processor_name and ocrd_tool is reduntant + processing_worker = ProcessingWorker( + rabbitmq_addr=queue, + mongodb_addr=database, + processor_name=processor.ocrd_tool['executable'], + ocrd_tool=processor.ocrd_tool, + processor_class=ProcessorClass, + ) + # The RMQConsumer is initialized and a connection to the RabbitMQ is performed + processing_worker.connect_consumer() + # Start consuming from the queue with name `processor_name` + processing_worker.start_consuming() + except Exception as e: + raise Exception("Processing worker has failed with error") from e + if agent_type == 'server': + try: + # TODO: Better validate that inside the ProcessorServer itself + host, port = agent_address.split(':') + processor_server = ProcessorServer( + mongodb_addr=database, + processor_name=processor.ocrd_tool['executable'], + processor_class=ProcessorClass, + ) + processor_server.run_server(host=host, port=int(port)) + except Exception as e: + raise Exception("Processor server has failed with error") from e diff --git a/ocrd/ocrd/decorators/ocrd_cli_options.py b/ocrd/ocrd/decorators/ocrd_cli_options.py index 2ba4bf8ae1..42bed275bb 100644 --- a/ocrd/ocrd/decorators/ocrd_cli_options.py +++ b/ocrd/ocrd/decorators/ocrd_cli_options.py @@ -1,3 +1,4 @@ +import click from click import option, Path from .parameter_option import parameter_option, parameter_override_option from .loglevel_option import 
loglevel_option @@ -33,6 +34,8 @@ def cli(mets_url): parameter_option, parameter_override_option, loglevel_option, + option('--agent_type', type=click.STRING), + option('--agent_address', type=click.STRING), option('--queue', type=QueueServerParamType()), option('--database', type=DatabaseParamType()), option('-C', '--show-resource'), @@ -45,4 +48,3 @@ def cli(mets_url): for param in params: param(f) return f - diff --git a/ocrd_network/ocrd_network/deployer.py b/ocrd_network/ocrd_network/deployer.py index 16b89c66b8..c0b8c39edb 100644 --- a/ocrd_network/ocrd_network/deployer.py +++ b/ocrd_network/ocrd_network/deployer.py @@ -11,6 +11,7 @@ from typing import Dict, Union from paramiko import SSHClient from re import search as re_search +from os import getpid from time import sleep @@ -77,11 +78,30 @@ def deploy_hosts(self, rabbitmq_url: str, mongodb_url: str) -> None: host.config.keypath ) - # TODO: Call the _deploy_processor_server() here and adapt accordingly - for processor in host.config.processors: self._deploy_processing_worker(processor, host, rabbitmq_url, mongodb_url) + # TODO: This is not optimal - the entire method should be refactored! + if (any(s.deploy_type == DeployType.native for s in host.config.servers) + and not host.ssh_client): + host.ssh_client = create_ssh_client( + host.config.address, + host.config.username, + host.config.password, + host.config.keypath + ) + if (any(s.deploy_type == DeployType.docker for s in host.config.servers) + and not host.docker_client): + host.docker_client = create_docker_client( + host.config.address, + host.config.username, + host.config.password, + host.config.keypath + ) + + for server in host.config.servers: + self._deploy_processor_server(server, host, mongodb_url) + if host.ssh_client: host.ssh_client.close() if host.docker_client: @@ -89,8 +109,7 @@ def deploy_hosts(self, rabbitmq_url: str, mongodb_url: str) -> None: def _deploy_processing_worker(self, processor: WorkerConfig, host: HostData, rabbitmq_url: str, mongodb_url: str) -> None: - - self.log.debug(f"deploy '{processor.deploy_type}' worker: '{processor}' on '{host.config.address}'") + self.log.debug(f"deploy '{processor.deploy_type}' processing worker: '{processor.name}' on '{host.config.address}'") for _ in range(processor.count): if processor.deploy_type == DeployType.native: @@ -114,13 +133,28 @@ def _deploy_processing_worker(self, processor: WorkerConfig, host: HostData, host.pids_docker.append(pid) sleep(0.1) - def _deploy_processor_server(self, processor: WorkerConfig, host: HostData, mongodb_url: str) -> None: - self.log.debug(f"deploy '{processor.deploy_type}' processor server: '{processor}' on '{host.config.address}'") - - - - # TODO: Method for deploying a processor server - pass + # TODO: Revisit this to remove code duplications of deploy_* methods + def _deploy_processor_server(self, server: ProcessorServerConfig, host: HostData, mongodb_url: str) -> None: + self.log.debug(f"deploy '{server.deploy_type}' processor server: '{server.name}' on '{host.config.address}'") + if server.deploy_type == DeployType.native: + assert host.ssh_client + pid = self.start_native_processor_server( + client=host.ssh_client, + processor_name=server.name, + agent_address=f'{host.config.address}:{server.port}', + database_url=mongodb_url, + ) + host.processor_server_pids_native.append(pid) + + if server.name in host.processor_server_ports: + if host.processor_server_ports[server.name]: + host.processor_server_ports[server.name] = 
host.processor_server_ports[server.name].append(server.port) + else: + host.processor_server_ports[server.name] = [server.port] + else: + host.processor_server_ports[server.name] = [server.port] + else: + raise Exception("Deploying docker processor server is not supported yet!") def deploy_rabbitmq(self, image: str, detach: bool, remove: bool, ports_mapping: Union[Dict, None] = None) -> str: @@ -261,18 +295,43 @@ def kill_hosts(self) -> None: host.docker_client = create_docker_client(host.config.address, host.config.username, host.config.password, host.config.keypath) # Kill deployed OCR-D processor instances on this Processing worker host - self.kill_processing_worker(host) - - def kill_processing_worker(self, host: HostData) -> None: - for pid in host.pids_native: - self.log.debug(f"Trying to kill/stop native processor: with PID: '{pid}'") - host.ssh_client.exec_command(f'kill {pid}') - host.pids_native = [] - - for pid in host.pids_docker: - self.log.debug(f"Trying to kill/stop docker container with PID: '{pid}'") - host.docker_client.containers.get(pid).stop() - host.pids_docker = [] + self.kill_processing_workers(host) + + # Kill deployed Processor Server instances on this host + self.kill_processor_servers(host) + + # TODO: Optimize the code duplication from start_* and kill_* methods + def kill_processing_workers(self, host: HostData) -> None: + amount = len(host.pids_native) + if amount: + self.log.info(f"Trying to kill/stop {amount} native processing workers:") + for pid in host.pids_native: + self.log.info(f"Native with PID: '{pid}'") + host.ssh_client.exec_command(f'kill {pid}') + host.pids_native = [] + amount = len(host.pids_docker) + if amount: + self.log.info(f"Trying to kill/stop {amount} docker processing workers:") + for pid in host.pids_docker: + self.log.info(f"Docker with PID: '{pid}'") + host.docker_client.containers.get(pid).stop() + host.pids_docker = [] + + def kill_processor_servers(self, host: HostData) -> None: + amount = len(host.processor_server_pids_native) + if amount: + self.log.info(f"Trying to kill/stop {amount} native processor servers:") + for pid in host.processor_server_pids_native: + self.log.info(f"Native with PID: '{pid}'") + host.ssh_client.exec_command(f'kill {pid}') + host.processor_server_pids_native = [] + amount = len(host.processor_server_pids_docker) + if amount: + self.log.info(f"Trying to kill/stop {amount} docker processor servers:") + for pid in host.processor_server_pids_docker: + self.log.info(f"Docker with PID: '{pid}'") + host.docker_client.containers.get(pid).stop() + host.processor_server_pids_docker = [] def start_native_processor(self, client: SSHClient, processor_name: str, queue_url: str, database_url: str) -> str: @@ -287,17 +346,17 @@ def start_native_processor(self, client: SSHClient, processor_name: str, queue_u Returns: str: pid of running process """ - self.log.info(f'Starting native processor: {processor_name}') + self.log.info(f'Starting native processing worker: {processor_name}') channel = client.invoke_shell() stdin, stdout = channel.makefile('wb'), channel.makefile('rb') - cmd = f'{processor_name} --database {database_url} --queue {queue_url}' + cmd = f'{processor_name} --agent_type worker --database {database_url} --queue {queue_url}' # the only way (I could find) to make it work to start a process in the background and # return early is this construction. The pid of the last started background process is # printed with `echo $!` but it is printed inbetween other output. 
Because of that I added # `xyz` before and after the code to easily be able to filter out the pid via regex when # returning from the function logpath = '/tmp/ocrd-processing-server-startup.log' - stdin.write(f"echo starting processor with '{cmd}' >> '{logpath}'\n") + stdin.write(f"echo starting processing worker with '{cmd}' >> '{logpath}'\n") stdin.write(f'{cmd} >> {logpath} 2>&1 &\n') stdin.write('echo xyz$!xyz \n exit \n') output = stdout.read().decode('utf-8') @@ -307,8 +366,31 @@ def start_native_processor(self, client: SSHClient, processor_name: str, queue_u def start_docker_processor(self, client: CustomDockerClient, processor_name: str, queue_url: str, database_url: str) -> str: + + # TODO: Raise an exception here as well? + # raise Exception("Deploying docker processing worker is not supported yet!") + self.log.info(f'Starting docker container processor: {processor_name}') # TODO: add real command here to start processing server in docker here res = client.containers.run('debian', 'sleep 500s', detach=True, remove=True) assert res and res.id, f'Running processor: {processor_name} in docker-container failed' return res.id + + # TODO: Just a copy of the above start_native_processor() method. + # Far from being great... But should be good as a starting point + def start_native_processor_server(self, client: SSHClient, processor_name: str, agent_address: str, database_url: str) -> str: + self.log.info(f"Starting native processor server: {processor_name} on {agent_address}") + channel = client.invoke_shell() + stdin, stdout = channel.makefile('wb'), channel.makefile('rb') + cmd = f'{processor_name} --agent_type server --agent_address {agent_address} --database {database_url}' + port = agent_address.split(':')[1] + logpath = f'/tmp/server_{processor_name}_{port}_{getpid()}.log' + # TODO: This entire stdin/stdout thing is broken with servers! + stdin.write(f"echo starting processor server with '{cmd}' >> '{logpath}'\n") + stdin.write(f'{cmd} >> {logpath} 2>&1 &\n') + stdin.write('echo xyz$!xyz \n exit \n') + output = stdout.read().decode('utf-8') + stdout.close() + stdin.close() + return re_search(r'xyz([0-9]+)xyz', output).group(1) # type: ignore + pass diff --git a/ocrd_network/ocrd_network/deployment_config.py b/ocrd_network/ocrd_network/deployment_config.py index 48b123d1af..de5465a915 100644 --- a/ocrd_network/ocrd_network/deployment_config.py +++ b/ocrd_network/ocrd_network/deployment_config.py @@ -8,6 +8,7 @@ 'HostConfig', 'WorkerConfig', 'MongoConfig', + 'ProcessorServerConfig', 'QueueConfig', ] @@ -49,6 +50,12 @@ def __init__(self, config: dict) -> None: self.processors.append( WorkerConfig(worker['name'], worker['number_of_instance'], deploy_type) ) + self.servers = [] + for server in config['servers']: + deploy_type = DeployType.from_str(server['deploy_type']) + self.servers.append( + ProcessorServerConfig(server['name'], deploy_type, server['port']) + ) class WorkerConfig: @@ -61,6 +68,15 @@ def __init__(self, name: str, count: int, deploy_type: DeployType) -> None: self.deploy_type = deploy_type +# TODO: Not a big fan of the way these configs work... 
+# Implemented this way to fit the general logic of previous impl +class ProcessorServerConfig: + def __init__(self, name: str, deploy_type: DeployType, port: int): + self.name = name + self.deploy_type = deploy_type + self.port = port + + class MongoConfig: """ Class to hold information for Mongodb-Docker container """ diff --git a/ocrd_network/ocrd_network/deployment_utils.py b/ocrd_network/ocrd_network/deployment_utils.py index 6b943127b4..8f6c5f2724 100644 --- a/ocrd_network/ocrd_network/deployment_utils.py +++ b/ocrd_network/ocrd_network/deployment_utils.py @@ -16,8 +16,7 @@ 'create_ssh_client', 'CustomDockerClient', 'DeployType', - 'HostData', - 'is_bashlib_processor' + 'HostData' ] @@ -38,7 +37,6 @@ def create_docker_client(address: str, username: str, password: Union[str, None] return CustomDockerClient(username, address, password=password, keypath=keypath) - class HostData: """class to store runtime information for a host """ @@ -48,6 +46,11 @@ def __init__(self, config: HostConfig) -> None: self.docker_client: Union[CustomDockerClient, None] = None self.pids_native: List[str] = [] self.pids_docker: List[str] = [] + # TODO: Revisit this, currently just mimicking the old impl + self.processor_server_pids_native: List[str] = [] + self.processor_server_pids_docker: List[str] = [] + # Key: processor_name, Value: list of ports + self.processor_server_ports: dict = {} @staticmethod def from_config(config: List[HostConfig]) -> List[HostData]: diff --git a/ocrd_network/ocrd_network/models/job.py b/ocrd_network/ocrd_network/models/job.py index 3c0857c370..1ccc82fe9b 100644 --- a/ocrd_network/ocrd_network/models/job.py +++ b/ocrd_network/ocrd_network/models/job.py @@ -25,6 +25,9 @@ class PYJobInput(BaseModel): parameters: dict = {} # Always set to empty dict when None, otherwise it fails ocr-d-validation result_queue_name: Optional[str] = None callback_url: Optional[str] = None + # Used to toggle between sending requests to 'worker and 'server', + # i.e., Processing Worker and Processor Server, respectively + agent_type: Optional[str] = 'worker' class Config: schema_extra = { diff --git a/ocrd_network/ocrd_network/processing_server.py b/ocrd_network/ocrd_network/processing_server.py index 582c1134cc..43fa5776f9 100644 --- a/ocrd_network/ocrd_network/processing_server.py +++ b/ocrd_network/ocrd_network/processing_server.py @@ -1,4 +1,5 @@ -from typing import Dict +import requests +from typing import Dict, List import uvicorn from fastapi import FastAPI, status, Request, HTTPException @@ -56,8 +57,14 @@ def __init__(self, config_path: str, host: str, port: int) -> None: # Gets assigned when `connect_publisher` is called on the working object self.rmq_publisher = None - # This list holds all processors mentioned in the config file - self._processor_list = None + # TODO: These will change dynamically + # according to the new requirements + # This list holds a set of all processing worker + # names mentioned in the config file + self._processing_workers_list = None + # This list holds a set of all processor server + # names mentioned in the config file + self._processor_servers_list = None # Create routes self.router.add_api_route( @@ -193,15 +200,27 @@ def create_message_queues(self) -> None: self.rmq_publisher.create_queue(queue_name=processor.name) @property - def processor_list(self): - if self._processor_list: - return self._processor_list + def processing_workers_list(self): + if self._processing_workers_list: + return self._processing_workers_list res = set([]) for host in 
self.config.hosts: for processor in host.processors: res.add(processor.name) - self._processor_list = list(res) - return self._processor_list + self._processing_workers_list = list(res) + return self._processing_workers_list + + # TODO: Revisit. This is just mimicking the method above. + @property + def processor_servers_list(self): + if self._processor_servers_list: + return self._processor_servers_list + res = set([]) + for host in self.config.hosts: + for processor_server in host.servers: + res.add(processor_server.name) + self._processor_servers_list = list(res) + return self._processor_server_list @staticmethod def create_processing_message(job: DBProcessorJob) -> OcrdProcessingMessage: @@ -221,12 +240,47 @@ def create_processing_message(job: DBProcessorJob) -> OcrdProcessingMessage: return processing_message async def push_processor_job(self, processor_name: str, data: PYJobInput) -> PYJobOutput: - """ Queue a processor job - """ + if data.agent_type not in ['worker', 'server']: + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + detail=f"Unknown network agent with value: {data.agent_type}" + ) + + job_output = None + if data.agent_type == 'worker': + job_output = await self.push_to_processing_queue(processor_name, data) + if data.agent_type == 'server': + job_output = await self.push_to_processor_server(processor_name, data) + if not job_output: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Failed to create job output" + ) + return job_output + + # TODO: Revisit and remove duplications between push_to_* methods + async def push_to_processing_queue(self, processor_name: str, data: PYJobInput) -> PYJobOutput: + # Validate existence of the Workspace in the DB + if bool(data.path_to_mets) == bool(data.workspace_id): + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + detail="Either 'path' or 'workspace_id' must be provided, but not both" + ) + # This check is done to return early in case + # the workspace_id is provided but not existing in the DB + elif data.workspace_id: + try: + await db_get_workspace(data.workspace_id) + except ValueError: + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + detail=f"Workspace with id '{data.workspace_id}' not existing" + ) + if not self.rmq_publisher: raise Exception('RMQPublisher is not connected') - if processor_name not in self.processor_list: + if processor_name not in self._processing_workers_list: try: # Only checks if the process queue exists, if not raises ChannelClosedByBroker self.rmq_publisher.create_queue(processor_name, passive=True) @@ -240,7 +294,7 @@ async def push_processor_job(self, processor_name: str, data: PYJobInput) -> PYJ detail=f"Process queue with id '{processor_name}' not existing" ) - # validate parameters + # TODO: Getting the tool shall be adapted to the change in #1028 ocrd_tool = get_ocrd_tool_json(processor_name) if not ocrd_tool: raise HTTPException( @@ -251,6 +305,27 @@ async def push_processor_job(self, processor_name: str, data: PYJobInput) -> PYJ if not report.is_valid: raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=report.errors) + job = DBProcessorJob( + **data.dict(exclude_unset=True, exclude_none=True), + job_id=generate_id(), + processor_name=processor_name, + state=StateEnum.queued + ) + await job.insert() + processing_message = self.create_processing_message(job) + encoded_processing_message = OcrdProcessingMessage.encode_yml(processing_message) + + try: + 
self.rmq_publisher.publish_to_queue(processor_name, encoded_processing_message) + except Exception as error: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f'RMQPublisher has failed: {error}' + ) + return job.to_job_output() + + async def push_to_processor_server(self, processor_name: str, data: PYJobInput) -> PYJobOutput: + # Validate existence of the Workspace in the DB if bool(data.path_to_mets) == bool(data.workspace_id): raise HTTPException( status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, @@ -267,29 +342,53 @@ async def push_processor_job(self, processor_name: str, data: PYJobInput) -> PYJ detail=f"Workspace with id '{data.workspace_id}' not existing" ) - job = DBProcessorJob( - **data.dict(exclude_unset=True, exclude_none=True), - job_id=generate_id(), - processor_name=processor_name, - state=StateEnum.queued - ) - await job.insert() - processing_message = self.create_processing_message(job) - encoded_processing_message = OcrdProcessingMessage.encode_yml(processing_message) + processor_server_url = None - try: - self.rmq_publisher.publish_to_queue(processor_name, encoded_processing_message) - except Exception as error: + # Check if a processor server with processor_name was deployed + # TODO: Revisit when the config file classes are refactored (made more abstract). + # This is such a mess now due to the bad abstraction and bad naming conventions! + for host_config in self.config.hosts: + for processor_server in host_config.servers: + if processor_server.name == processor_name: + processor_server_url = f"http://{host_config.address}:{processor_server.port}/" + + if not processor_server_url: raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f'RMQPublisher has failed: {error}' + detail=f"Processor Server of '{processor_name}' is not available" ) - return job.to_job_output() + + # Request the tool json from the Processor Server + response = requests.get(processor_server_url, headers={'Accept': 'application/json'}) + if not response.status_code == 200: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Failed to retrieve '{processor_name}' from: {processor_server_url}" + ) + ocrd_tool = response.json() + if not ocrd_tool: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Failed to retrieve ocrd tool json of '{processor_name}' from: {processor_server_url}" + ) + report = ParameterValidator(ocrd_tool).validate(dict(data.parameters)) + if not report.is_valid: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=report.errors) + + # Post a processing job to the Processor Server + response = requests.post(processor_server_url, headers={'Accept': 'application/json'}, json=data) + if not response.status_code == 202: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Failed to post '{processor_name}' job to: {processor_server_url}" + ) + job_output = response.json + return job_output async def get_processor_info(self, processor_name) -> Dict: """ Return a processor's ocrd-tool.json """ - if processor_name not in self.processor_list: + if processor_name not in self._processing_workers_list: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail='Processor not available' @@ -308,7 +407,16 @@ async def get_job(self, processor_name: str, job_id: str) -> PYJobOutput: detail=f"Processing job with id '{job_id}' of processor type '{processor_name}' not existing" ) - async def list_processors(self) 
-> str: + async def list_processors(self) -> List[str]: """ Return a list of all available processors """ - return self.processor_list + processor_names_list = [] + + # TODO: 1) Revisit this. Currently, it adds labels in + # front of the names for differentiation purposes + # TODO: 2) This could be optimized by holding a dynamic list + for worker_name in self._processing_workers_list: + processor_names_list.append(f'worker {worker_name}') + for server_name in self._processor_servers_list: + processor_names_list.append(f'server {server_name}') + return processor_names_list diff --git a/ocrd_network/ocrd_network/processing_worker.py b/ocrd_network/ocrd_network/processing_worker.py index 8b3681f18f..ef385bffa6 100644 --- a/ocrd_network/ocrd_network/processing_worker.py +++ b/ocrd_network/ocrd_network/processing_worker.py @@ -19,7 +19,7 @@ import pika.adapters.blocking_connection from ocrd import Resolver -from ocrd_utils import getLogger +from ocrd_utils import getLogger, initLogging from ocrd.processor.helpers import run_cli, run_processor from .database import ( @@ -55,6 +55,7 @@ class ProcessingWorker: def __init__(self, rabbitmq_addr, mongodb_addr, processor_name, ocrd_tool: dict, processor_class=None) -> None: + initLogging() self.log = getLogger(__name__) # TODO: Provide more flexibility for configuring file logging (i.e. via ENV variables) file_handler = logging.FileHandler(f'/tmp/worker_{processor_name}_{getpid()}.log', mode='a') diff --git a/ocrd_network/ocrd_network/processor_server.py b/ocrd_network/ocrd_network/processor_server.py index 03f6cadb7d..50d94972c6 100644 --- a/ocrd_network/ocrd_network/processor_server.py +++ b/ocrd_network/ocrd_network/processor_server.py @@ -1,6 +1,4 @@ -from contextlib import redirect_stdout from datetime import datetime -from io import StringIO import json import logging from os import environ, getpid @@ -14,9 +12,9 @@ from ocrd import Resolver from ocrd_validators import ParameterValidator from ocrd_utils import ( + initLogging, getLogger, - get_ocrd_tool_json, - parse_json_string_with_comments + get_ocrd_tool_json ) from .database import ( @@ -51,14 +49,8 @@ class ProcessorServer(FastAPI): def __init__(self, mongodb_addr: str, processor_name: str = "", processor_class=None): if not (processor_name or processor_class): raise ValueError('Either "processor_name" or "processor_class" must be provided') - + initLogging() self.log = getLogger(__name__) - # TODO: Provide more flexibility for configuring file logging (i.e. 
via ENV variables) - file_handler = logging.FileHandler(f'/tmp/server_{processor_name}_{getpid()}.log', mode='a') - logging_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' - file_handler.setFormatter(logging.Formatter(logging_format)) - file_handler.setLevel(logging.DEBUG) - self.log.addHandler(file_handler) self.db_url = mongodb_addr self.processor_name = processor_name @@ -105,7 +97,7 @@ def __init__(self, mongodb_addr: str, processor_name: str = "", processor_class= self.router.add_api_route( path='/', - endpoint=self.process, + endpoint=self.create_processor_job_task, methods=['POST'], tags=['Processing'], status_code=status.HTTP_202_ACCEPTED, @@ -141,7 +133,7 @@ async def get_processor_info(self): # Note: The Processing server pushes to a queue, while # the Processor Server creates (pushes to) a background task - async def push_processor_job(self, data: PYJobInput, background_tasks: BackgroundTasks): + async def create_processor_job_task(self, data: PYJobInput, background_tasks: BackgroundTasks): if not self.ocrd_tool: raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, @@ -216,10 +208,7 @@ def get_ocrd_tool(self): if self.ocrd_tool: return self.ocrd_tool if self.ProcessorClass: - str_out = StringIO() - with redirect_stdout(str_out): - self.ProcessorClass(workspace=None, dump_json=True) - ocrd_tool = parse_json_string_with_comments(str_out.getvalue()) + ocrd_tool = self.ProcessorClass(workspace=None, version=True).ocrd_tool else: ocrd_tool = get_ocrd_tool_json(self.processor_name) return ocrd_tool @@ -228,10 +217,7 @@ def get_version(self) -> str: if self.version: return self.version if self.ProcessorClass: - str_out = StringIO() - with redirect_stdout(str_out): - self.ProcessorClass(workspace=None, show_version=True) - version_str = str_out.getvalue() + version_str = self.ProcessorClass(workspace=None, version=True).version else: version_str = run( [self.processor_name, '--version'], @@ -241,8 +227,14 @@ def get_version(self) -> str: ).stdout return version_str - def run_server(self, host, port): - uvicorn.run(self, host=host, port=port, access_log=False) + def run_server(self, host, port, access_log=False): + # TODO: Provide more flexibility for configuring file logging (i.e. 
via ENV variables) + file_handler = logging.FileHandler(f'/tmp/server_{self.processor_name}_{port}_{getpid()}.log', mode='a') + logging_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + file_handler.setFormatter(logging.Formatter(logging_format)) + file_handler.setLevel(logging.DEBUG) + self.log.addHandler(file_handler) + uvicorn.run(self, host=host, port=port, access_log=access_log) async def run_cli_from_server( self, diff --git a/ocrd_validators/ocrd_validators/processing_server_config.schema.yml b/ocrd_validators/ocrd_validators/processing_server_config.schema.yml index d28b63a3d7..b2fe06cb49 100644 --- a/ocrd_validators/ocrd_validators/processing_server_config.schema.yml +++ b/ocrd_validators/ocrd_validators/processing_server_config.schema.yml @@ -58,6 +58,7 @@ properties: - address - username - workers + - servers oneOf: - required: - password @@ -75,7 +76,7 @@ properties: description: Path to private key file type: string workers: - description: List of workers which will be deployed + description: List of processing workers that will be deployed type: array minItems: 1 items: @@ -97,12 +98,41 @@ properties: minimum: 1 default: 1 deploy_type: - description: Should the processor be deployed natively or with Docker + description: Should the processing worker be deployed natively or with Docker type: string enum: - native - docker default: native + servers: + description: List of processor servers that will be deployed + type: array + minItems: 1 + items: + type: object + additionalProperties: false + required: + - name + - port + properties: + name: + description: Name of the processor + type: string + pattern: "^ocrd-.*$" + examples: + - ocrd-cis-ocropy-binarize + - ocrd-olena-binarize + deploy_type: + description: Should the processor server natively or with Docker + type: string + enum: + - native + - docker + default: native + port: + description: The port number to be deployed on the host + $ref: "#/$defs/port" + $defs: address: type: string From 6476ebb256e52a142b2fe5f635c98291fb42ae7b Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 29 Mar 2023 12:39:14 +0200 Subject: [PATCH 06/34] fix some nasty errors --- ocrd/ocrd/decorators/__init__.py | 132 +++++++++--------- .../ocrd_network/processing_server.py | 11 +- ocrd_network/ocrd_network/processor_server.py | 22 +-- 3 files changed, 88 insertions(+), 77 deletions(-) diff --git a/ocrd/ocrd/decorators/__init__.py b/ocrd/ocrd/decorators/__init__.py index 1aaeedaf19..a2f2d3cd0a 100644 --- a/ocrd/ocrd/decorators/__init__.py +++ b/ocrd/ocrd/decorators/__init__.py @@ -61,73 +61,73 @@ def ocrd_cli_wrap_processor( ) sys.exit() - # Used for checking/starting network agents for the WebAPI architecture - # Has no side effects if neither of the 4 ocrd_network parameters are passed - check_and_run_network_agent(processorClass, agent_type, agent_address, database, queue) - initLogging() - LOG = getLogger('ocrd_cli_wrap_processor') - # LOG.info('kwargs=%s' % kwargs) - # Merge parameter overrides and parameters - if 'parameter_override' in kwargs: - set_json_key_value_overrides(kwargs['parameter'], *kwargs['parameter_override']) - # TODO OCR-D/core#274 - # Assert -I / -O - # if not kwargs['input_file_grp']: - # raise ValueError('-I/--input-file-grp is required') - # if not kwargs['output_file_grp']: - # raise ValueError('-O/--output-file-grp is required') - resolver = Resolver() - working_dir, mets, _ = resolver.resolve_mets_arguments(working_dir, mets, None) - workspace = resolver.workspace_from_url(mets, working_dir) - page_id 
= kwargs.get('page_id') - # XXX not possible while processors do not adhere to # https://github.com/OCR-D/core/issues/505 - # if overwrite - # if 'output_file_grp' not in kwargs or not kwargs['output_file_grp']: - # raise Exception("--overwrite requires --output-file-grp") - # LOG.info("Removing files because of --overwrite") - # for grp in kwargs['output_file_grp'].split(','): - # if page_id: - # for one_page_id in kwargs['page_id'].split(','): - # LOG.debug("Removing files in output file group %s with page ID %s", grp, one_page_id) - # for file in workspace.mets.find_files(pageId=one_page_id, fileGrp=grp): - # workspace.remove_file(file, force=True, keep_file=False, page_recursive=True) - # else: - # LOG.debug("Removing all files in output file group %s ", grp) - # # TODO: can be reduced to `page_same_group=True` as soon as core#505 has landed (in all processors) - # workspace.remove_file_group(grp, recursive=True, force=True, keep_files=False, page_recursive=True, page_same_group=False) - # workspace.save_mets() - # XXX While https://github.com/OCR-D/core/issues/505 is open, set 'overwrite_mode' globally on the workspace - if overwrite: - workspace.overwrite_mode = True - report = WorkspaceValidator.check_file_grp(workspace, kwargs['input_file_grp'], '' if overwrite else kwargs['output_file_grp'], page_id) - if not report.is_valid: - raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors)) - # Set up profiling behavior from environment variables/flags - if not profile and 'OCRD_PROFILE' in environ: - if 'CPU' in environ['OCRD_PROFILE']: - profile = True - if not profile_file and 'OCRD_PROFILE_FILE' in environ: - profile_file = environ['OCRD_PROFILE_FILE'] - if profile or profile_file: - import cProfile - import pstats - import io - import atexit - print("Profiling...") - pr = cProfile.Profile() - pr.enable() - def exit(): - pr.disable() - print("Profiling completed") - if profile_file: - with open(profile_file, 'wb') as f: - pr.dump_stats(profile_file) - s = io.StringIO() - pstats.Stats(pr, stream=s).sort_stats("cumulative").print_stats() - print(s.getvalue()) - atexit.register(exit) - run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs) + + if agent_type: + check_and_run_network_agent(processorClass, agent_type, agent_address, database, queue) + else: + LOG = getLogger('ocrd_cli_wrap_processor') + # LOG.info('kwargs=%s' % kwargs) + # Merge parameter overrides and parameters + if 'parameter_override' in kwargs: + set_json_key_value_overrides(kwargs['parameter'], *kwargs['parameter_override']) + # TODO OCR-D/core#274 + # Assert -I / -O + # if not kwargs['input_file_grp']: + # raise ValueError('-I/--input-file-grp is required') + # if not kwargs['output_file_grp']: + # raise ValueError('-O/--output-file-grp is required') + resolver = Resolver() + working_dir, mets, _ = resolver.resolve_mets_arguments(working_dir, mets, None) + workspace = resolver.workspace_from_url(mets, working_dir) + page_id = kwargs.get('page_id') + # XXX not possible while processors do not adhere to # https://github.com/OCR-D/core/issues/505 + # if overwrite + # if 'output_file_grp' not in kwargs or not kwargs['output_file_grp']: + # raise Exception("--overwrite requires --output-file-grp") + # LOG.info("Removing files because of --overwrite") + # for grp in kwargs['output_file_grp'].split(','): + # if page_id: + # for one_page_id in kwargs['page_id'].split(','): + # LOG.debug("Removing files in output file group %s with page ID %s", grp, one_page_id) + # for file 
in workspace.mets.find_files(pageId=one_page_id, fileGrp=grp): + # workspace.remove_file(file, force=True, keep_file=False, page_recursive=True) + # else: + # LOG.debug("Removing all files in output file group %s ", grp) + # # TODO: can be reduced to `page_same_group=True` as soon as core#505 has landed (in all processors) + # workspace.remove_file_group(grp, recursive=True, force=True, keep_files=False, page_recursive=True, page_same_group=False) + # workspace.save_mets() + # XXX While https://github.com/OCR-D/core/issues/505 is open, set 'overwrite_mode' globally on the workspace + if overwrite: + workspace.overwrite_mode = True + report = WorkspaceValidator.check_file_grp(workspace, kwargs['input_file_grp'], '' if overwrite else kwargs['output_file_grp'], page_id) + if not report.is_valid: + raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors)) + # Set up profiling behavior from environment variables/flags + if not profile and 'OCRD_PROFILE' in environ: + if 'CPU' in environ['OCRD_PROFILE']: + profile = True + if not profile_file and 'OCRD_PROFILE_FILE' in environ: + profile_file = environ['OCRD_PROFILE_FILE'] + if profile or profile_file: + import cProfile + import pstats + import io + import atexit + print("Profiling...") + pr = cProfile.Profile() + pr.enable() + def exit(): + pr.disable() + print("Profiling completed") + if profile_file: + with open(profile_file, 'wb') as f: + pr.dump_stats(profile_file) + s = io.StringIO() + pstats.Stats(pr, stream=s).sort_stats("cumulative").print_stats() + print(s.getvalue()) + atexit.register(exit) + run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs) def check_and_run_network_agent(ProcessorClass, agent_type: str, agent_address: str, database: str, queue: str): diff --git a/ocrd_network/ocrd_network/processing_server.py b/ocrd_network/ocrd_network/processing_server.py index 43fa5776f9..d4221dcb19 100644 --- a/ocrd_network/ocrd_network/processing_server.py +++ b/ocrd_network/ocrd_network/processing_server.py @@ -1,3 +1,4 @@ +import json import requests from typing import Dict, List import uvicorn @@ -375,8 +376,16 @@ async def push_to_processor_server(self, processor_name: str, data: PYJobInput) if not report.is_valid: raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=report.errors) + try: + json_data = json.dumps(data.dict(exclude_unset=True, exclude_none=True)) + except Exception as e: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Failed to json dump the PYJobInput, error: {e}" + ) + # Post a processing job to the Processor Server - response = requests.post(processor_server_url, headers={'Accept': 'application/json'}, json=data) + response = requests.post(processor_server_url, headers={'Accept': 'application/json'}, json=json_data) if not response.status_code == 202: raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, diff --git a/ocrd_network/ocrd_network/processor_server.py b/ocrd_network/ocrd_network/processor_server.py index 50d94972c6..70e7529285 100644 --- a/ocrd_network/ocrd_network/processor_server.py +++ b/ocrd_network/ocrd_network/processor_server.py @@ -12,7 +12,6 @@ from ocrd import Resolver from ocrd_validators import ParameterValidator from ocrd_utils import ( - initLogging, getLogger, get_ocrd_tool_json ) @@ -49,7 +48,6 @@ class ProcessorServer(FastAPI): def __init__(self, mongodb_addr: str, processor_name: str = "", processor_class=None): if not (processor_name or processor_class): raise 
ValueError('Either "processor_name" or "processor_class" must be provided') - initLogging() self.log = getLogger(__name__) self.db_url = mongodb_addr @@ -172,10 +170,11 @@ async def create_processor_job_task(self, data: PYJobInput, background_tasks: Ba # Run the processor in the background background_tasks.add_task( self.run_processor_from_server, - job_id=job.id, + job_id=job.job_id, workspace_id=data.workspace_id, + path_to_mets=data.path_to_mets, page_id=data.page_id, - parameter=data.parameters, + parameters=data.parameters, input_file_grps=data.input_file_grps, output_file_grps=data.output_file_grps, ) @@ -183,12 +182,13 @@ async def create_processor_job_task(self, data: PYJobInput, background_tasks: Ba # Run the CLI in the background background_tasks.add_task( self.run_cli_from_server, - job_id=job.id, + job_id=job.job_id, workspace_id=data.workspace_id, + path_to_mets=data.path_to_mets, page_id=data.page_id, input_file_grps=data.input_file_grps, output_file_grps=data.output_file_grps, - parameter=data.parameters + parameters=data.parameters ) return job.to_job_output() @@ -241,6 +241,7 @@ async def run_cli_from_server( job_id: str, processor_name: str, workspace_id: str, + path_to_mets: str, input_file_grps: List[str], output_file_grps: List[str], page_id: str, @@ -252,8 +253,8 @@ async def run_cli_from_server( input_file_grps_str = ','.join(input_file_grps) output_file_grps_str = ','.join(output_file_grps) - workspace_db = await db_get_workspace(workspace_id) - path_to_mets = workspace_db.workspace_mets_path + if not path_to_mets and workspace_id: + path_to_mets = await db_get_workspace(workspace_id).workspace_mets_path workspace = Resolver().workspace_from_url(path_to_mets) start_time = datetime.now() @@ -293,6 +294,7 @@ async def run_processor_from_server( self, job_id: str, workspace_id: str, + path_to_mets: str, input_file_grps: List[str], output_file_grps: List[str], page_id: str, @@ -304,8 +306,8 @@ async def run_processor_from_server( input_file_grps_str = ','.join(input_file_grps) output_file_grps_str = ','.join(output_file_grps) - workspace_db = await db_get_workspace(workspace_id) - path_to_mets = workspace_db.workspace_mets_path + if not path_to_mets and workspace_id: + path_to_mets = await db_get_workspace(workspace_id).workspace_mets_path workspace = Resolver().workspace_from_url(path_to_mets) is_success = True From 0f8367cd21059348a6a47180b26e2731a8e4e809 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 29 Mar 2023 12:46:37 +0200 Subject: [PATCH 07/34] use cleaner approach with sys.exit() --- ocrd/ocrd/decorators/__init__.py | 135 ++++++++++++++++--------------- 1 file changed, 68 insertions(+), 67 deletions(-) diff --git a/ocrd/ocrd/decorators/__init__.py b/ocrd/ocrd/decorators/__init__.py index a2f2d3cd0a..4439305a42 100644 --- a/ocrd/ocrd/decorators/__init__.py +++ b/ocrd/ocrd/decorators/__init__.py @@ -63,71 +63,72 @@ def ocrd_cli_wrap_processor( initLogging() - if agent_type: - check_and_run_network_agent(processorClass, agent_type, agent_address, database, queue) - else: - LOG = getLogger('ocrd_cli_wrap_processor') - # LOG.info('kwargs=%s' % kwargs) - # Merge parameter overrides and parameters - if 'parameter_override' in kwargs: - set_json_key_value_overrides(kwargs['parameter'], *kwargs['parameter_override']) - # TODO OCR-D/core#274 - # Assert -I / -O - # if not kwargs['input_file_grp']: - # raise ValueError('-I/--input-file-grp is required') - # if not kwargs['output_file_grp']: - # raise ValueError('-O/--output-file-grp is required') - resolver = 
Resolver() - working_dir, mets, _ = resolver.resolve_mets_arguments(working_dir, mets, None) - workspace = resolver.workspace_from_url(mets, working_dir) - page_id = kwargs.get('page_id') - # XXX not possible while processors do not adhere to # https://github.com/OCR-D/core/issues/505 - # if overwrite - # if 'output_file_grp' not in kwargs or not kwargs['output_file_grp']: - # raise Exception("--overwrite requires --output-file-grp") - # LOG.info("Removing files because of --overwrite") - # for grp in kwargs['output_file_grp'].split(','): - # if page_id: - # for one_page_id in kwargs['page_id'].split(','): - # LOG.debug("Removing files in output file group %s with page ID %s", grp, one_page_id) - # for file in workspace.mets.find_files(pageId=one_page_id, fileGrp=grp): - # workspace.remove_file(file, force=True, keep_file=False, page_recursive=True) - # else: - # LOG.debug("Removing all files in output file group %s ", grp) - # # TODO: can be reduced to `page_same_group=True` as soon as core#505 has landed (in all processors) - # workspace.remove_file_group(grp, recursive=True, force=True, keep_files=False, page_recursive=True, page_same_group=False) - # workspace.save_mets() - # XXX While https://github.com/OCR-D/core/issues/505 is open, set 'overwrite_mode' globally on the workspace - if overwrite: - workspace.overwrite_mode = True - report = WorkspaceValidator.check_file_grp(workspace, kwargs['input_file_grp'], '' if overwrite else kwargs['output_file_grp'], page_id) - if not report.is_valid: - raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors)) - # Set up profiling behavior from environment variables/flags - if not profile and 'OCRD_PROFILE' in environ: - if 'CPU' in environ['OCRD_PROFILE']: - profile = True - if not profile_file and 'OCRD_PROFILE_FILE' in environ: - profile_file = environ['OCRD_PROFILE_FILE'] - if profile or profile_file: - import cProfile - import pstats - import io - import atexit - print("Profiling...") - pr = cProfile.Profile() - pr.enable() - def exit(): - pr.disable() - print("Profiling completed") - if profile_file: - with open(profile_file, 'wb') as f: - pr.dump_stats(profile_file) - s = io.StringIO() - pstats.Stats(pr, stream=s).sort_stats("cumulative").print_stats() - print(s.getvalue()) - atexit.register(exit) - run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs) + # Used for checking/starting network agents for the WebAPI architecture + # Has no side effects if neither of the 4 ocrd_network parameters are passed + check_and_run_network_agent(processorClass, agent_type, agent_address, database, queue) + + LOG = getLogger('ocrd_cli_wrap_processor') + # LOG.info('kwargs=%s' % kwargs) + # Merge parameter overrides and parameters + if 'parameter_override' in kwargs: + set_json_key_value_overrides(kwargs['parameter'], *kwargs['parameter_override']) + # TODO OCR-D/core#274 + # Assert -I / -O + # if not kwargs['input_file_grp']: + # raise ValueError('-I/--input-file-grp is required') + # if not kwargs['output_file_grp']: + # raise ValueError('-O/--output-file-grp is required') + resolver = Resolver() + working_dir, mets, _ = resolver.resolve_mets_arguments(working_dir, mets, None) + workspace = resolver.workspace_from_url(mets, working_dir) + page_id = kwargs.get('page_id') + # XXX not possible while processors do not adhere to # https://github.com/OCR-D/core/issues/505 + # if overwrite + # if 'output_file_grp' not in kwargs or not kwargs['output_file_grp']: + # raise Exception("--overwrite requires 
--output-file-grp") + # LOG.info("Removing files because of --overwrite") + # for grp in kwargs['output_file_grp'].split(','): + # if page_id: + # for one_page_id in kwargs['page_id'].split(','): + # LOG.debug("Removing files in output file group %s with page ID %s", grp, one_page_id) + # for file in workspace.mets.find_files(pageId=one_page_id, fileGrp=grp): + # workspace.remove_file(file, force=True, keep_file=False, page_recursive=True) + # else: + # LOG.debug("Removing all files in output file group %s ", grp) + # # TODO: can be reduced to `page_same_group=True` as soon as core#505 has landed (in all processors) + # workspace.remove_file_group(grp, recursive=True, force=True, keep_files=False, page_recursive=True, page_same_group=False) + # workspace.save_mets() + # XXX While https://github.com/OCR-D/core/issues/505 is open, set 'overwrite_mode' globally on the workspace + if overwrite: + workspace.overwrite_mode = True + report = WorkspaceValidator.check_file_grp(workspace, kwargs['input_file_grp'], '' if overwrite else kwargs['output_file_grp'], page_id) + if not report.is_valid: + raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors)) + # Set up profiling behavior from environment variables/flags + if not profile and 'OCRD_PROFILE' in environ: + if 'CPU' in environ['OCRD_PROFILE']: + profile = True + if not profile_file and 'OCRD_PROFILE_FILE' in environ: + profile_file = environ['OCRD_PROFILE_FILE'] + if profile or profile_file: + import cProfile + import pstats + import io + import atexit + print("Profiling...") + pr = cProfile.Profile() + pr.enable() + def exit(): + pr.disable() + print("Profiling completed") + if profile_file: + with open(profile_file, 'wb') as f: + pr.dump_stats(profile_file) + s = io.StringIO() + pstats.Stats(pr, stream=s).sort_stats("cumulative").print_stats() + print(s.getvalue()) + atexit.register(exit) + run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs) def check_and_run_network_agent(ProcessorClass, agent_type: str, agent_address: str, database: str, queue: str): @@ -167,7 +168,7 @@ def check_and_run_network_agent(ProcessorClass, agent_type: str, agent_address: # Start consuming from the queue with name `processor_name` processing_worker.start_consuming() except Exception as e: - raise Exception("Processing worker has failed with error") from e + sys.exit(f"Processing worker has failed with error: {e}") if agent_type == 'server': try: # TODO: Better validate that inside the ProcessorServer itself @@ -179,4 +180,4 @@ def check_and_run_network_agent(ProcessorClass, agent_type: str, agent_address: ) processor_server.run_server(host=host, port=int(port)) except Exception as e: - raise Exception("Processor server has failed with error") from e + sys.exit(f"Processor server has failed with error: {e}") From 4151a400e95b5cfbbb438045205bf939367364c1 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 29 Mar 2023 12:51:55 +0200 Subject: [PATCH 08/34] remove initLogging from processing worker --- ocrd_network/ocrd_network/processing_worker.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ocrd_network/ocrd_network/processing_worker.py b/ocrd_network/ocrd_network/processing_worker.py index ef385bffa6..8b3681f18f 100644 --- a/ocrd_network/ocrd_network/processing_worker.py +++ b/ocrd_network/ocrd_network/processing_worker.py @@ -19,7 +19,7 @@ import pika.adapters.blocking_connection from ocrd import Resolver -from ocrd_utils import getLogger, initLogging +from ocrd_utils import getLogger 
from ocrd.processor.helpers import run_cli, run_processor from .database import ( @@ -55,7 +55,6 @@ class ProcessingWorker: def __init__(self, rabbitmq_addr, mongodb_addr, processor_name, ocrd_tool: dict, processor_class=None) -> None: - initLogging() self.log = getLogger(__name__) # TODO: Provide more flexibility for configuring file logging (i.e. via ENV variables) file_handler = logging.FileHandler(f'/tmp/worker_{processor_name}_{getpid()}.log', mode='a') From 36fec4502b0b675887e9ebc43b61dfb1cdc7c9e0 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 29 Mar 2023 13:09:56 +0200 Subject: [PATCH 09/34] log exceptions of the processing server --- ocrd/ocrd/decorators/__init__.py | 8 +++----- ocrd_network/ocrd_network/processing_server.py | 10 +++++++++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/ocrd/ocrd/decorators/__init__.py b/ocrd/ocrd/decorators/__init__.py index 4439305a42..ca5d308e9e 100644 --- a/ocrd/ocrd/decorators/__init__.py +++ b/ocrd/ocrd/decorators/__init__.py @@ -1,10 +1,5 @@ -from os.path import isfile from os import environ import sys -from contextlib import redirect_stdout -from io import StringIO - -import click from ocrd_utils import ( is_local_filename, @@ -152,6 +147,9 @@ def check_and_run_network_agent(ProcessorClass, agent_type: str, agent_address: if agent_address: raise ValueError("Options '--agent_type=worker' and '--agent_address' are mutually exclusive") + import logging + logging.getLogger('ocrd.network').setLevel(logging.DEBUG) + processor = ProcessorClass(workspace=None, dump_json=True) if agent_type == 'worker': try: diff --git a/ocrd_network/ocrd_network/processing_server.py b/ocrd_network/ocrd_network/processing_server.py index d4221dcb19..d064ea496a 100644 --- a/ocrd_network/ocrd_network/processing_server.py +++ b/ocrd_network/ocrd_network/processing_server.py @@ -253,9 +253,10 @@ async def push_processor_job(self, processor_name: str, data: PYJobInput) -> PYJ if data.agent_type == 'server': job_output = await self.push_to_processor_server(processor_name, data) if not job_output: + self.log.exception('Failed to create job output') raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Failed to create job output" + detail='Failed to create job output' ) return job_output @@ -298,12 +299,14 @@ async def push_to_processing_queue(self, processor_name: str, data: PYJobInput) # TODO: Getting the tool shall be adapted to the change in #1028 ocrd_tool = get_ocrd_tool_json(processor_name) if not ocrd_tool: + self.log.exception(f"Processor '{processor_name}' not available. Empty or missing ocrd_tool") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Processor '{processor_name}' not available. 
Empty or missing ocrd_tool" ) report = ParameterValidator(ocrd_tool).validate(dict(data.parameters)) if not report.is_valid: + self.log.exception(f"Invalid parameters for {processor_name}: {report.errors}") raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=report.errors) job = DBProcessorJob( @@ -319,6 +322,7 @@ async def push_to_processing_queue(self, processor_name: str, data: PYJobInput) try: self.rmq_publisher.publish_to_queue(processor_name, encoded_processing_message) except Exception as error: + self.log.exception(f'RMQPublisher has failed: {error}') raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f'RMQPublisher has failed: {error}' @@ -362,12 +366,14 @@ async def push_to_processor_server(self, processor_name: str, data: PYJobInput) # Request the tool json from the Processor Server response = requests.get(processor_server_url, headers={'Accept': 'application/json'}) if not response.status_code == 200: + self.log.exception(f"Failed to retrieve '{processor_name}' from: {processor_server_url}") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to retrieve '{processor_name}' from: {processor_server_url}" ) ocrd_tool = response.json() if not ocrd_tool: + self.log.exception(f"Failed to retrieve ocrd tool json of '{processor_name}' from: {processor_server_url}") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to retrieve ocrd tool json of '{processor_name}' from: {processor_server_url}" @@ -379,6 +385,7 @@ async def push_to_processor_server(self, processor_name: str, data: PYJobInput) try: json_data = json.dumps(data.dict(exclude_unset=True, exclude_none=True)) except Exception as e: + self.log.exception(f"Failed to json dump the PYJobInput, error: {e}") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to json dump the PYJobInput, error: {e}" @@ -387,6 +394,7 @@ async def push_to_processor_server(self, processor_name: str, data: PYJobInput) # Post a processing job to the Processor Server response = requests.post(processor_server_url, headers={'Accept': 'application/json'}, json=json_data) if not response.status_code == 202: + self.log.exception(f"Failed to post '{processor_name}' job to: {processor_server_url}") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to post '{processor_name}' job to: {processor_server_url}" From b2ad725b20f0f4bac3c5285e88df52de6e690a91 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 29 Mar 2023 14:08:15 +0200 Subject: [PATCH 10/34] fix serialization/deserialization --- ocrd_network/ocrd_network/processing_server.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/ocrd_network/ocrd_network/processing_server.py b/ocrd_network/ocrd_network/processing_server.py index d064ea496a..fcf9d7c686 100644 --- a/ocrd_network/ocrd_network/processing_server.py +++ b/ocrd_network/ocrd_network/processing_server.py @@ -364,7 +364,10 @@ async def push_to_processor_server(self, processor_name: str, data: PYJobInput) ) # Request the tool json from the Processor Server - response = requests.get(processor_server_url, headers={'Accept': 'application/json'}) + response = requests.get( + processor_server_url, + headers={'Content-Type': 'application/json'} + ) if not response.status_code == 200: self.log.exception(f"Failed to retrieve '{processor_name}' from: {processor_server_url}") raise HTTPException( @@ -392,14 +395,19 @@ async def 
push_to_processor_server(self, processor_name: str, data: PYJobInput) ) # Post a processing job to the Processor Server - response = requests.post(processor_server_url, headers={'Accept': 'application/json'}, json=json_data) + response = requests.post( + processor_server_url, + headers={'Content-Type': 'application/json'}, + json=json.loads(json_data) + ) if not response.status_code == 202: self.log.exception(f"Failed to post '{processor_name}' job to: {processor_server_url}") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to post '{processor_name}' job to: {processor_server_url}" ) - job_output = response.json + + job_output = response.json() return job_output async def get_processor_info(self, processor_name) -> Dict: From 275d98b4151994002ac8109604310c6a363c7dc9 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 29 Mar 2023 19:13:19 +0200 Subject: [PATCH 11/34] refactoring: first portion --- ocrd/ocrd/cli/processing_worker.py | 5 - ocrd/ocrd/cli/processor_server.py | 5 - ocrd_network/ocrd_network/process_helpers.py | 47 ++++ .../ocrd_network/processing_server.py | 199 +++++--------- .../ocrd_network/processing_worker.py | 131 ++-------- ocrd_network/ocrd_network/processor_server.py | 246 ++++-------------- ocrd_network/ocrd_network/server_utils.py | 67 +++++ ocrd_network/ocrd_network/utils.py | 14 + 8 files changed, 277 insertions(+), 437 deletions(-) create mode 100644 ocrd_network/ocrd_network/process_helpers.py create mode 100644 ocrd_network/ocrd_network/server_utils.py diff --git a/ocrd/ocrd/cli/processing_worker.py b/ocrd/ocrd/cli/processing_worker.py index 3e568b268a..75828732d1 100644 --- a/ocrd/ocrd/cli/processing_worker.py +++ b/ocrd/ocrd/cli/processing_worker.py @@ -20,11 +20,6 @@ @click.command('processing-worker') @click.argument('processor_name', required=True, type=click.STRING) -@click.option('--agent_type', - help='The type of this network agent', - default="worker", - type=click.STRING, - required=True) @click.option('-q', '--queue', default="amqp://admin:admin@localhost:5672/", help='The URL of the Queue Server, format: amqp://username:password@host:port/vhost', diff --git a/ocrd/ocrd/cli/processor_server.py b/ocrd/ocrd/cli/processor_server.py index cbff6a8f91..bd272478a4 100644 --- a/ocrd/ocrd/cli/processor_server.py +++ b/ocrd/ocrd/cli/processor_server.py @@ -17,11 +17,6 @@ @click.command('processor-server') @click.argument('processor_name', required=True, type=click.STRING) -@click.option('--agent_type', - help='The type of this network agent', - default="server", - type=click.STRING, - required=True) @click.option('--agent_address', help='The URL of the processor server, format: host:port', type=ProcessingServerParamType(), diff --git a/ocrd_network/ocrd_network/process_helpers.py b/ocrd_network/ocrd_network/process_helpers.py new file mode 100644 index 0000000000..28cd63a341 --- /dev/null +++ b/ocrd_network/ocrd_network/process_helpers.py @@ -0,0 +1,47 @@ +import json +from typing import List + +from ocrd import Resolver +from ocrd.processor.helpers import run_cli, run_processor + + +# A wrapper for run_processor() and run_cli() +def run_single_execution( + ProcessorClass, + executable: str, + abs_path_to_mets: str, + input_file_grps: List[str], + output_file_grps: List[str], + page_id: str, + parameters: dict, +) -> None: + if not (ProcessorClass or executable): + raise ValueError(f'Missing processor class and executable') + input_file_grps_str = ','.join(input_file_grps) + output_file_grps_str = 
','.join(output_file_grps) + workspace = Resolver().workspace_from_url(abs_path_to_mets) + if ProcessorClass: + try: + run_processor( + processorClass=ProcessorClass, + workspace=workspace, + input_file_grp=input_file_grps_str, + output_file_grp=output_file_grps_str, + page_id=page_id, + parameter=parameters, + instance_caching=True + ) + except Exception as e: + raise RuntimeError(f"Python executable '{executable}' exited with: {e}") + else: + return_code = run_cli( + executable=executable, + workspace=workspace, + mets_url=abs_path_to_mets, + input_file_grp=input_file_grps_str, + output_file_grp=output_file_grps_str, + page_id=page_id, + parameter=json.dumps(parameters) + ) + if return_code != 0: + raise RuntimeError(f"CLI executable '{executable}' exited with: {return_code}") diff --git a/ocrd_network/ocrd_network/processing_server.py b/ocrd_network/ocrd_network/processing_server.py index fcf9d7c686..416da042b3 100644 --- a/ocrd_network/ocrd_network/processing_server.py +++ b/ocrd_network/ocrd_network/processing_server.py @@ -10,21 +10,21 @@ from pika.exceptions import ChannelClosedByBroker from ocrd_utils import getLogger, get_ocrd_tool_json -from ocrd_validators import ParameterValidator -from .database import ( - db_get_processing_job, - db_get_workspace, - initiate_database -) +from .database import initiate_database from .deployer import Deployer from .deployment_config import ProcessingServerConfig -from .rabbitmq_utils import RMQPublisher, OcrdProcessingMessage from .models import ( DBProcessorJob, PYJobInput, PYJobOutput, StateEnum ) +from .rabbitmq_utils import RMQPublisher, OcrdProcessingMessage +from .server_utils import ( + _get_processor_job, + validate_and_resolve_mets_path, + validate_job_input, +) from .utils import generate_created_time, generate_id @@ -90,7 +90,7 @@ def __init__(self, config_path: str, host: str, port: int) -> None: self.router.add_api_route( path='/processor/{processor_name}/{job_id}', - endpoint=self.get_job, + endpoint=self.get_processor_job, methods=['GET'], tags=['processing'], status_code=status.HTTP_200_OK, @@ -240,13 +240,55 @@ def create_processing_message(job: DBProcessorJob) -> OcrdProcessingMessage: ) return processing_message + def check_if_queue_exists(self, processor_name): + try: + # Only checks if the process queue exists, if not raises ChannelClosedByBroker + self.rmq_publisher.create_queue(processor_name, passive=True) + except ChannelClosedByBroker as error: + self.log.warning(f"Process queue with id '{processor_name}' not existing: {error}") + # Reconnect publisher - not efficient, but works + # TODO: Revisit when reconnection strategy is implemented + self.connect_publisher(enable_acks=True) + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + detail=f"Process queue with id '{processor_name}' not existing" + ) + + def query_ocrd_tool_json_from_server(self, processor_name): + processor_server_url = None + # Check if a processor server with processor_name was deployed + # TODO: Revisit when the config file classes are refactored (made more abstract). + # This is such a mess now due to the bad abstraction and bad naming conventions! 
+ for host_config in self.config.hosts: + for processor_server in host_config.servers: + if processor_server.name == processor_name: + processor_server_url = f"http://{host_config.address}:{processor_server.port}/" + if not processor_server_url: + self.log.exception(f"Processor Server of '{processor_name}' is not available") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Processor Server of '{processor_name}' is not available" + ) + # Request the tool json from the Processor Server + response = requests.get( + processor_server_url, + headers={'Content-Type': 'application/json'} + ) + if not response.status_code == 200: + self.log.exception(f"Failed to retrieve '{processor_name}' from: {processor_server_url}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Failed to retrieve '{processor_name}' from: {processor_server_url}" + ) + ocrd_tool = response.json() + return ocrd_tool, processor_server_url + async def push_processor_job(self, processor_name: str, data: PYJobInput) -> PYJobOutput: if data.agent_type not in ['worker', 'server']: raise HTTPException( status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=f"Unknown network agent with value: {data.agent_type}" ) - job_output = None if data.agent_type == 'worker': job_output = await self.push_to_processing_queue(processor_name, data) @@ -261,56 +303,17 @@ async def push_processor_job(self, processor_name: str, data: PYJobInput) -> PYJ return job_output # TODO: Revisit and remove duplications between push_to_* methods - async def push_to_processing_queue(self, processor_name: str, data: PYJobInput) -> PYJobOutput: - # Validate existence of the Workspace in the DB - if bool(data.path_to_mets) == bool(data.workspace_id): - raise HTTPException( - status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - detail="Either 'path' or 'workspace_id' must be provided, but not both" - ) - # This check is done to return early in case - # the workspace_id is provided but not existing in the DB - elif data.workspace_id: - try: - await db_get_workspace(data.workspace_id) - except ValueError: - raise HTTPException( - status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - detail=f"Workspace with id '{data.workspace_id}' not existing" - ) - - if not self.rmq_publisher: - raise Exception('RMQPublisher is not connected') - - if processor_name not in self._processing_workers_list: - try: - # Only checks if the process queue exists, if not raises ChannelClosedByBroker - self.rmq_publisher.create_queue(processor_name, passive=True) - except ChannelClosedByBroker as error: - self.log.warning(f"Process queue with id '{processor_name}' not existing: {error}") - # Reconnect publisher - not efficient, but works - # TODO: Revisit when reconnection strategy is implemented - self.connect_publisher(enable_acks=True) - raise HTTPException( - status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - detail=f"Process queue with id '{processor_name}' not existing" - ) - + async def push_to_processing_queue(self, processor_name: str, job_input: PYJobInput) -> PYJobOutput: # TODO: Getting the tool shall be adapted to the change in #1028 ocrd_tool = get_ocrd_tool_json(processor_name) - if not ocrd_tool: - self.log.exception(f"Processor '{processor_name}' not available. Empty or missing ocrd_tool") - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Processor '{processor_name}' not available. 
Empty or missing ocrd_tool" - ) - report = ParameterValidator(ocrd_tool).validate(dict(data.parameters)) - if not report.is_valid: - self.log.exception(f"Invalid parameters for {processor_name}: {report.errors}") - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=report.errors) - + validate_job_input(self.log, processor_name, ocrd_tool, job_input) + job_input = await validate_and_resolve_mets_path(self.log, job_input, resolve=False) + if not self.rmq_publisher: + raise Exception('RMQPublisher is not connected') + if processor_name not in self.processing_workers_list: + self.check_if_queue_exists(processor_name) job = DBProcessorJob( - **data.dict(exclude_unset=True, exclude_none=True), + **job_input.dict(exclude_unset=True, exclude_none=True), job_id=generate_id(), processor_name=processor_name, state=StateEnum.queued @@ -318,7 +321,6 @@ async def push_to_processing_queue(self, processor_name: str, data: PYJobInput) await job.insert() processing_message = self.create_processing_message(job) encoded_processing_message = OcrdProcessingMessage.encode_yml(processing_message) - try: self.rmq_publisher.publish_to_queue(processor_name, encoded_processing_message) except Exception as error: @@ -329,71 +331,18 @@ async def push_to_processing_queue(self, processor_name: str, data: PYJobInput) ) return job.to_job_output() - async def push_to_processor_server(self, processor_name: str, data: PYJobInput) -> PYJobOutput: - # Validate existence of the Workspace in the DB - if bool(data.path_to_mets) == bool(data.workspace_id): - raise HTTPException( - status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - detail="Either 'path' or 'workspace_id' must be provided, but not both" - ) - # This check is done to return early in case - # the workspace_id is provided but not existing in the DB - elif data.workspace_id: - try: - await db_get_workspace(data.workspace_id) - except ValueError: - raise HTTPException( - status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - detail=f"Workspace with id '{data.workspace_id}' not existing" - ) - - processor_server_url = None - - # Check if a processor server with processor_name was deployed - # TODO: Revisit when the config file classes are refactored (made more abstract). - # This is such a mess now due to the bad abstraction and bad naming conventions! 
- for host_config in self.config.hosts: - for processor_server in host_config.servers: - if processor_server.name == processor_name: - processor_server_url = f"http://{host_config.address}:{processor_server.port}/" - - if not processor_server_url: - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Processor Server of '{processor_name}' is not available" - ) - - # Request the tool json from the Processor Server - response = requests.get( - processor_server_url, - headers={'Content-Type': 'application/json'} - ) - if not response.status_code == 200: - self.log.exception(f"Failed to retrieve '{processor_name}' from: {processor_server_url}") - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Failed to retrieve '{processor_name}' from: {processor_server_url}" - ) - ocrd_tool = response.json() - if not ocrd_tool: - self.log.exception(f"Failed to retrieve ocrd tool json of '{processor_name}' from: {processor_server_url}") - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Failed to retrieve ocrd tool json of '{processor_name}' from: {processor_server_url}" - ) - report = ParameterValidator(ocrd_tool).validate(dict(data.parameters)) - if not report.is_valid: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=report.errors) - + async def push_to_processor_server(self, processor_name: str, job_input: PYJobInput) -> PYJobOutput: + ocrd_tool, processor_server_url = self.query_ocrd_tool_json_from_server(processor_name) + validate_job_input(self.log, processor_name, ocrd_tool, job_input) + job_input = await validate_and_resolve_mets_path(self.log, job_input, resolve=False) try: - json_data = json.dumps(data.dict(exclude_unset=True, exclude_none=True)) + json_data = json.dumps(job_input.dict(exclude_unset=True, exclude_none=True)) except Exception as e: self.log.exception(f"Failed to json dump the PYJobInput, error: {e}") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to json dump the PYJobInput, error: {e}" ) - # Post a processing job to the Processor Server response = requests.post( processor_server_url, @@ -406,10 +355,12 @@ async def push_to_processor_server(self, processor_name: str, data: PYJobInput) status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to post '{processor_name}' job to: {processor_server_url}" ) - job_output = response.json() return job_output + async def get_processor_job(self, processor_name: str, job_id: str) -> PYJobOutput: + return await _get_processor_job(self.log, processor_name, job_id) + async def get_processor_info(self, processor_name) -> Dict: """ Return a processor's ocrd-tool.json """ @@ -420,18 +371,6 @@ async def get_processor_info(self, processor_name) -> Dict: ) return get_ocrd_tool_json(processor_name) - async def get_job(self, processor_name: str, job_id: str) -> PYJobOutput: - """ Return processing job-information from the database - """ - try: - job = await db_get_processing_job(job_id) - return job.to_job_output() - except ValueError: - raise HTTPException( - status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - detail=f"Processing job with id '{job_id}' of processor type '{processor_name}' not existing" - ) - async def list_processors(self) -> List[str]: """ Return a list of all available processors """ @@ -440,8 +379,8 @@ async def list_processors(self) -> List[str]: # TODO: 1) Revisit this. 
Currently, it adds labels in # front of the names for differentiation purposes # TODO: 2) This could be optimized by holding a dynamic list - for worker_name in self._processing_workers_list: + for worker_name in self.processing_workers_list: processor_names_list.append(f'worker {worker_name}') - for server_name in self._processor_servers_list: + for server_name in self.processor_servers_list: processor_names_list.append(f'server {server_name}') return processor_names_list diff --git a/ocrd_network/ocrd_network/processing_worker.py b/ocrd_network/ocrd_network/processing_worker.py index 8b3681f18f..74d21b1f92 100644 --- a/ocrd_network/ocrd_network/processing_worker.py +++ b/ocrd_network/ocrd_network/processing_worker.py @@ -9,18 +9,14 @@ """ from datetime import datetime -import json import logging -from os import environ, getpid +from os import getpid import requests -from typing import Any, List import pika.spec import pika.adapters.blocking_connection -from ocrd import Resolver from ocrd_utils import getLogger -from ocrd.processor.helpers import run_cli, run_processor from .database import ( sync_initiate_database, @@ -28,6 +24,7 @@ sync_db_update_processing_job, ) from .models import StateEnum +from .process_helpers import run_single_execution from .rabbitmq_utils import ( OcrdProcessingMessage, OcrdResultMessage, @@ -36,21 +33,13 @@ ) from .utils import ( calculate_execution_time, + tf_disable_interactive_logs, verify_database_uri, verify_and_parse_mq_uri ) # TODO: Check this again when the logging is refactored -try: - # This env variable must be set before importing from Keras - environ['TF_CPP_MIN_LOG_LEVEL'] = '3' - from tensorflow.keras.utils import disable_interactive_logging - # Enabled interactive logging throws an exception - # due to a call of sys.stdout.flush() - disable_interactive_logging() -except Exception: - # Nothing should be handled here if TF is not available - pass +tf_disable_interactive_logs() class ProcessingWorker: @@ -83,7 +72,7 @@ def __init__(self, rabbitmq_addr, mongodb_addr, processor_name, ocrd_tool: dict, self.processor_name = processor_name # The processor class to be used to instantiate the processor # Think of this as a func pointer to the constructor of the respective OCR-D processor - self.processor_class = processor_class + self.ProcessorClass = processor_class # Gets assigned when `connect_consumer` is called on the worker object # Used to consume OcrdProcessingMessage from the queue with name {processor_name} self.rmq_consumer = None @@ -192,20 +181,21 @@ def process_message(self, processing_message: OcrdProcessingMessage) -> None: # may not contain certain keys. Simply passing None in the OcrdProcessingMessage constructor # breaks the message validator schema which expects String, but not None due to the Optional[] wrapper. 
pm_keys = processing_message.__dict__.keys() + job_id = processing_message.job_id + input_file_grps = processing_message.input_file_grps output_file_grps = processing_message.output_file_grps if 'output_file_grps' in pm_keys else None path_to_mets = processing_message.path_to_mets if 'path_to_mets' in pm_keys else None workspace_id = processing_message.workspace_id if 'workspace_id' in pm_keys else None page_id = processing_message.page_id if 'page_id' in pm_keys else None result_queue_name = processing_message.result_queue_name if 'result_queue_name' in pm_keys else None callback_url = processing_message.callback_url if 'callback_url' in pm_keys else None + parameters = processing_message.parameters if processing_message.parameters else {} if not path_to_mets and workspace_id: path_to_mets = sync_db_get_workspace(workspace_id).workspace_mets_path - workspace = Resolver().workspace_from_url(path_to_mets) - - job_id = processing_message.job_id - + execution_failed = False + self.log.debug(f'Invoking processor: {self.processor_name}') start_time = datetime.now() sync_db_update_processing_job( job_id=job_id, @@ -213,35 +203,30 @@ def process_message(self, processing_message: OcrdProcessingMessage) -> None: path_to_mets=path_to_mets, start_time=start_time ) - if self.processor_class: - self.log.debug(f'Invoking the pythonic processor: {self.processor_name}') - return_status = self.run_processor_from_worker( - processor_class=self.processor_class, - workspace=workspace, - page_id=page_id, - input_file_grps=processing_message.input_file_grps, - output_file_grps=output_file_grps, - parameter=processing_message.parameters - ) - else: - self.log.debug(f'Invoking the cli: {self.processor_name}') - return_status = self.run_cli_from_worker( + try: + run_single_execution( + ProcessorClass=self.ProcessorClass, executable=self.processor_name, - workspace=workspace, - page_id=page_id, - input_file_grps=processing_message.input_file_grps, + abs_path_to_mets=path_to_mets, + input_file_grps=input_file_grps, output_file_grps=output_file_grps, - parameter=processing_message.parameters + page_id=page_id, + parameters=processing_message.parameters ) + except Exception as error: + self.log.debug(f"processor_name: {self.processor_name}, path_to_mets: {path_to_mets}, " + f"input_grps: {input_file_grps}, output_file_grps: {output_file_grps}, " + f"page_id: {page_id}, parameters: {parameters}") + self.log.error(error) + execution_failed = True end_time = datetime.now() - # Execution duration in ms - execution_duration = calculate_execution_time(start_time, end_time) - job_state = StateEnum.success if return_status else StateEnum.failed + exec_duration = calculate_execution_time(start_time, end_time) + job_state = StateEnum.success if not execution_failed else StateEnum.failed sync_db_update_processing_job( job_id=job_id, state=job_state, end_time=end_time, - exec_time=f'{execution_duration} ms' + exec_time=f'{exec_duration} ms' ) if result_queue_name or callback_url: @@ -253,11 +238,9 @@ def process_message(self, processing_message: OcrdProcessingMessage) -> None: workspace_id=workspace_id ) self.log.info(f'Result message: {result_message}') - # If the result_queue field is set, send the result message to a result queue if result_queue_name: self.publish_to_result_queue(result_queue_name, result_message) - # If the callback_url field is set, post the result message to a callback url if callback_url: self.post_to_callback_url(callback_url, result_message) @@ -286,65 +269,3 @@ def post_to_callback_url(self, 
callback_url: str, result_message: OcrdResultMess } response = requests.post(url=callback_url, headers=headers, json=json_data) self.log.info(f'Response from callback_url "{response}"') - - def run_processor_from_worker( - self, - processor_class, - workspace, - page_id: str, - input_file_grps: List[str], - output_file_grps: List[str], - parameter: dict, - ) -> bool: - input_file_grps_str = ','.join(input_file_grps) - output_file_grps_str = ','.join(output_file_grps) - - success = True - try: - run_processor( - processorClass=processor_class, - workspace=workspace, - page_id=page_id, - parameter=parameter, - input_file_grp=input_file_grps_str, - output_file_grp=output_file_grps_str, - instance_caching=True - ) - except Exception as e: - success = False - self.log.exception(e) - - if not success: - self.log.error(f'{processor_class} failed with an exception.') - else: - self.log.debug(f'{processor_class} exited with success.') - return success - - def run_cli_from_worker( - self, - executable: str, - workspace, - page_id: str, - input_file_grps: List[str], - output_file_grps: List[str], - parameter: dict - ) -> bool: - input_file_grps_str = ','.join(input_file_grps) - output_file_grps_str = ','.join(output_file_grps) - - return_code = run_cli( - executable=executable, - workspace=workspace, - page_id=page_id, - input_file_grp=input_file_grps_str, - output_file_grp=output_file_grps_str, - parameter=json.dumps(parameter), - mets_url=workspace.mets_target - ) - - if return_code != 0: - self.log.error(f'{executable} exited with non-zero return value {return_code}.') - return False - else: - self.log.debug(f'{executable} exited with success.') - return True diff --git a/ocrd_network/ocrd_network/processor_server.py b/ocrd_network/ocrd_network/processor_server.py index 70e7529285..9d8cf1af12 100644 --- a/ocrd_network/ocrd_network/processor_server.py +++ b/ocrd_network/ocrd_network/processor_server.py @@ -1,25 +1,14 @@ from datetime import datetime -import json import logging -from os import environ, getpid +from os import getpid from subprocess import run, PIPE -from typing import List import uvicorn from fastapi import FastAPI, HTTPException, status, BackgroundTasks -from ocrd.processor.helpers import run_cli, run_processor -from ocrd import Resolver -from ocrd_validators import ParameterValidator -from ocrd_utils import ( - getLogger, - get_ocrd_tool_json -) - +from ocrd_utils import getLogger, get_ocrd_tool_json from .database import ( DBProcessorJob, - db_get_processing_job, - db_get_workspace, db_update_processing_job, initiate_database ) @@ -29,19 +18,20 @@ PYOcrdTool, StateEnum ) -from .utils import calculate_execution_time, generate_id +from .process_helpers import run_single_execution +from .server_utils import ( + _get_processor_job, + validate_and_resolve_mets_path, + validate_job_input +) +from .utils import ( + calculate_execution_time, + generate_id, + tf_disable_interactive_logs +) # TODO: Check this again when the logging is refactored -try: - # This env variable must be set before importing from Keras - environ['TF_CPP_MIN_LOG_LEVEL'] = '3' - from tensorflow.keras.utils import disable_interactive_logging - # Enabled interactive logging throws an exception - # due to a call of sys.stdout.flush() - disable_interactive_logging() -except Exception: - # Nothing should be handled here if TF is not available - pass +tf_disable_interactive_logs() class ProcessorServer(FastAPI): @@ -107,7 +97,7 @@ def __init__(self, mongodb_addr: str, processor_name: str = "", processor_class= 
self.router.add_api_route( path='/{job_id}', - endpoint=self.get_job, + endpoint=self.get_processor_job, methods=['GET'], tags=['Processing'], status_code=status.HTTP_200_OK, @@ -131,79 +121,54 @@ async def get_processor_info(self): # Note: The Processing server pushes to a queue, while # the Processor Server creates (pushes to) a background task - async def create_processor_job_task(self, data: PYJobInput, background_tasks: BackgroundTasks): - if not self.ocrd_tool: - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f'Empty or missing ocrd_tool' - ) - report = ParameterValidator(self.ocrd_tool).validate(dict(data.parameters)) - if not report.is_valid: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=report.errors) - - if bool(data.path_to_mets) == bool(data.workspace_id): - raise HTTPException( - status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - detail="Either 'path' or 'workspace_id' must be provided, but not both" - ) - - # This check is done to return early in case - # the workspace_id is provided but not existing in the DB - elif data.workspace_id: - try: - await db_get_workspace(data.workspace_id) - except ValueError: - raise HTTPException( - status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - detail=f"Workspace with id '{data.workspace_id}' not existing" - ) + async def create_processor_job_task(self, job_input: PYJobInput, background_tasks: BackgroundTasks): + validate_job_input(self.log, self.processor_name, self.ocrd_tool, job_input) + job_input = await validate_and_resolve_mets_path(self.log, job_input, resolve=True) + job_id = generate_id() job = DBProcessorJob( - **data.dict(exclude_unset=True, exclude_none=True), - job_id=generate_id(), + **job_input.dict(exclude_unset=True, exclude_none=True), + job_id=job_id, processor_name=self.processor_name, state=StateEnum.queued ) await job.insert() - if self.ProcessorClass: - # Run the processor in the background - background_tasks.add_task( - self.run_processor_from_server, - job_id=job.job_id, - workspace_id=data.workspace_id, - path_to_mets=data.path_to_mets, - page_id=data.page_id, - parameters=data.parameters, - input_file_grps=data.input_file_grps, - output_file_grps=data.output_file_grps, - ) - else: - # Run the CLI in the background + execution_failed = False + start_time = datetime.now() + await db_update_processing_job( + job_id=job_id, + state=StateEnum.running, + start_time=start_time + ) + try: background_tasks.add_task( - self.run_cli_from_server, - job_id=job.job_id, - workspace_id=data.workspace_id, - path_to_mets=data.path_to_mets, - page_id=data.page_id, - input_file_grps=data.input_file_grps, - output_file_grps=data.output_file_grps, - parameters=data.parameters + run_single_execution, + ProcessorClass=self.ProcessorClass, + executable=self.processor_name, + abs_path_to_mets=job.path_to_mets, + input_file_grps=job.input_file_grps, + output_file_grps=job.output_file_grps, + page_id=job.page_id, + parameters=job.parameters ) + except Exception as error: + self.log.debug(f"processor_name: {self.processor_name}, path_to_mets: {job.path_to_mets}, " + f"input_grps: {job.input_file_grps}, output_file_grps: {job.output_file_grps}, " + f"page_id: {job.page_id}, parameters: {job.parameters}") + self.log.error(error) + execution_failed = True + end_time = datetime.now() + exec_duration = calculate_execution_time(start_time, end_time) + job_state = StateEnum.success if not execution_failed else StateEnum.failed + await db_update_processing_job( + job_id=job_id, + 
state=job_state, + end_time=end_time, + exec_time=f'{exec_duration} ms' + ) return job.to_job_output() - async def get_job(self, processor_name: str, job_id: str) -> PYJobOutput: - """ Return processing job-information from the database - """ - try: - job = await db_get_processing_job(job_id) - return job.to_job_output() - except ValueError: - raise HTTPException( - status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - detail=f"Processing job with id '{job_id}' of processor type '{processor_name}' not existing" - ) - def get_ocrd_tool(self): if self.ocrd_tool: return self.ocrd_tool @@ -236,108 +201,5 @@ def run_server(self, host, port, access_log=False): self.log.addHandler(file_handler) uvicorn.run(self, host=host, port=port, access_log=access_log) - async def run_cli_from_server( - self, - job_id: str, - processor_name: str, - workspace_id: str, - path_to_mets: str, - input_file_grps: List[str], - output_file_grps: List[str], - page_id: str, - parameters: dict - ): - log = getLogger('ocrd.processor.helpers.run_cli_from_api') - - # Turn input/output file groups into a comma separated string - input_file_grps_str = ','.join(input_file_grps) - output_file_grps_str = ','.join(output_file_grps) - - if not path_to_mets and workspace_id: - path_to_mets = await db_get_workspace(workspace_id).workspace_mets_path - workspace = Resolver().workspace_from_url(path_to_mets) - - start_time = datetime.now() - await db_update_processing_job( - job_id=job_id, - state=StateEnum.running, - start_time=start_time - ) - # Execute the processor - return_code = run_cli( - executable=processor_name, - workspace=workspace, - page_id=page_id, - input_file_grp=input_file_grps_str, - output_file_grp=output_file_grps_str, - parameter=json.dumps(parameters), - mets_url=workspace.mets_target - ) - end_time = datetime.now() - # Execution duration in ms - execution_duration = calculate_execution_time(start_time, end_time) - - if return_code != 0: - job_state = StateEnum.failed - log.error(f'{self.processor_name} exited with non-zero return value {return_code}.') - else: - job_state = StateEnum.success - - await db_update_processing_job( - job_id=job_id, - state=job_state, - end_time=end_time, - exec_time=f'{execution_duration} ms' - ) - - async def run_processor_from_server( - self, - job_id: str, - workspace_id: str, - path_to_mets: str, - input_file_grps: List[str], - output_file_grps: List[str], - page_id: str, - parameters: dict, - ): - log = getLogger('ocrd.processor.helpers.run_processor_from_api') - - # Turn input/output file groups into a comma separated string - input_file_grps_str = ','.join(input_file_grps) - output_file_grps_str = ','.join(output_file_grps) - - if not path_to_mets and workspace_id: - path_to_mets = await db_get_workspace(workspace_id).workspace_mets_path - workspace = Resolver().workspace_from_url(path_to_mets) - - is_success = True - start_time = datetime.now() - await db_update_processing_job( - job_id=job_id, - state=StateEnum.running, - start_time=start_time - ) - try: - run_processor( - processorClass=self.ProcessorClass, - workspace=workspace, - page_id=page_id, - parameter=parameters, - input_file_grp=input_file_grps_str, - output_file_grp=output_file_grps_str, - instance_caching=True - ) - except Exception as e: - is_success = False - log.exception(e) - - end_time = datetime.now() - # Execution duration in ms - execution_duration = calculate_execution_time(start_time, end_time) - job_state = StateEnum.success if is_success else StateEnum.failed - await db_update_processing_job( - 
job_id=job_id, - state=job_state, - end_time=end_time, - exec_time=f'{execution_duration} ms' - ) + async def get_processor_job(self, processor_name: str, job_id: str) -> PYJobOutput: + return await _get_processor_job(self.log, processor_name, job_id) diff --git a/ocrd_network/ocrd_network/server_utils.py b/ocrd_network/ocrd_network/server_utils.py new file mode 100644 index 0000000000..b117cb48bd --- /dev/null +++ b/ocrd_network/ocrd_network/server_utils.py @@ -0,0 +1,67 @@ +from fastapi import FastAPI, HTTPException, status, BackgroundTasks +from ocrd_validators import ParameterValidator +from .database import ( + db_get_processing_job, + db_get_workspace, +) +from .models import PYJobInput, PYJobOutput + + +async def _get_processor_job(logger, processor_name: str, job_id: str) -> PYJobOutput: + """ Return processing job-information from the database + """ + try: + job = await db_get_processing_job(job_id) + return job.to_job_output() + except ValueError as e: + logger.exception(f"Processing job with id '{job_id}' of processor type " + f"'{processor_name}' not existing, error: {e}") + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + detail=f"Processing job with id '{job_id}' of processor type '{processor_name}' not existing" + ) + + +async def validate_and_resolve_mets_path(logger, job_input: PYJobInput, resolve: bool = False) -> PYJobInput: + # This check is done to return early in case the workspace_id is provided + # but the abs mets path cannot be queried from the DB + if not job_input.path_to_mets and job_input.workspace_id: + try: + db_workspace = await db_get_workspace(job_input.workspace_id) + if resolve: + job_input.path_to_mets = db_workspace.workspace_mets_path + except ValueError as e: + logger.exception(f"Workspace with id '{job_input.workspace_id}' not existing: {e}") + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + detail=f"Workspace with id '{job_input.workspace_id}' not existing" + ) + return job_input + + +def validate_job_input(logger, processor_name: str, ocrd_tool: dict, job_input: PYJobInput) -> None: + if bool(job_input.path_to_mets) == bool(job_input.workspace_id): + logger.exception("Either 'path' or 'workspace_id' must be provided, but not both") + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + detail="Either 'path' or 'workspace_id' must be provided, but not both" + ) + if not ocrd_tool: + logger.exception(f"Processor '{processor_name}' not available. Empty or missing ocrd_tool") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Processor '{processor_name}' not available. 
Empty or missing ocrd_tool" + ) + try: + report = ParameterValidator(ocrd_tool).validate(dict(job_input.parameters)) + except Exception as e: + logger.exception(f'Failed to validate processing job against the ocrd_tool: {e}') + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + detail=f'Failed to validate processing job against the ocrd_tool' + ) + else: + if not report.is_valid: + logger.exception(f'Failed to validate processing job ' + f'against the ocrd_tool, errors: {report.errors}') + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=report.errors) diff --git a/ocrd_network/ocrd_network/utils.py b/ocrd_network/ocrd_network/utils.py index 759a31597a..568544c67f 100644 --- a/ocrd_network/ocrd_network/utils.py +++ b/ocrd_network/ocrd_network/utils.py @@ -1,6 +1,7 @@ from datetime import datetime from functools import wraps from re import match as re_match +from os import environ from pika import URLParameters from pymongo import uri_parser as mongo_uri_parser from uuid import uuid4 @@ -27,6 +28,19 @@ def calculate_execution_time(start: datetime, end: datetime) -> int: return int((end - start).total_seconds() * 1000) +def tf_disable_interactive_logs(): + try: + # This env variable must be set before importing from Keras + environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + from tensorflow.keras.utils import disable_interactive_logging + # Enabled interactive logging throws an exception + # due to a call of sys.stdout.flush() + disable_interactive_logging() + except Exception: + # Nothing should be handled here if TF is not available + pass + + def generate_created_time() -> int: return int(datetime.utcnow().timestamp()) From 498ed40f0c7cddd2692809176710f408eef82483 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Sat, 1 Apr 2023 02:08:54 +0200 Subject: [PATCH 12/34] refactoring: second portion --- ocrd_network/ocrd_network/deployer.py | 569 ++++++++++-------- .../ocrd_network/deployment_config.py | 103 ---- ocrd_network/ocrd_network/deployment_utils.py | 92 +-- .../ocrd_network/processing_server.py | 164 +++-- ocrd_network/ocrd_network/runtime_data.py | 122 ++++ ocrd_network/ocrd_network/utils.py | 27 +- 6 files changed, 585 insertions(+), 492 deletions(-) delete mode 100644 ocrd_network/ocrd_network/deployment_config.py create mode 100644 ocrd_network/ocrd_network/runtime_data.py diff --git a/ocrd_network/ocrd_network/deployer.py b/ocrd_network/ocrd_network/deployer.py index c0b8c39edb..2c9af781b9 100644 --- a/ocrd_network/ocrd_network/deployer.py +++ b/ocrd_network/ocrd_network/deployer.py @@ -6,48 +6,109 @@ Each Processing Host may have several Processing Workers. Each Processing Worker is an instance of an OCR-D processor. """ - from __future__ import annotations -from typing import Dict, Union -from paramiko import SSHClient +from typing import Dict, List, Union from re import search as re_search from os import getpid from time import sleep - from ocrd_utils import getLogger -from .deployment_config import * + from .deployment_utils import ( create_docker_client, - create_ssh_client, - CustomDockerClient, - DeployType, - HostData, + wait_for_rabbitmq_availability ) -from .rabbitmq_utils import RMQPublisher +from .runtime_data import ( + DataHost, + DataMongoDB, + DataProcessingWorker, + DataProcessorServer, + DataRabbitMQ +) +from .utils import validate_and_load_config -class Deployer: - """Wraps the deployment functionality of the Processing Server - - Deployer is the one acting. - :py:attr:`config` is for representation of the config file only. 
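# A minimal usage sketch of the refactored Deployer defined below (hypothetical
# config path and processor name; assumes a config file that passes
# ProcessingServerConfigValidator):
#
#     deployer = Deployer('/path/to/processing_server_config.yml')
#     worker_names = deployer.find_matching_processors(
#         worker_only=True, str_names_only=True, unique_only=True)
#     dummy_server_url = deployer.resolve_processor_server_url('ocrd-dummy')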
- :py:attr:`hosts` is for managing processor information, not for actually processing. - """ - def __init__(self, config: ProcessingServerConfig) -> None: - """ - Args: - config (:py:class:`ProcessingServerConfig`): parsed configuration of the Processing Server - """ +class Deployer: + def __init__(self, config_path: str) -> None: self.log = getLogger(__name__) - self.config = config - self.hosts = HostData.from_config(config.hosts) - self.mongo_pid = None - self.mq_pid = None + config = validate_and_load_config(config_path) + + self.data_mongo: DataMongoDB = DataMongoDB(config['database']) + self.data_queue: DataRabbitMQ = DataRabbitMQ(config['process_queue']) + self.data_hosts: List[DataHost] = [] + for config_host in config['hosts']: + self.data_hosts.append(DataHost(config_host)) + + # TODO: Reconsider this. + def find_matching_processors( + self, + worker_only: bool = False, + server_only: bool = False, + docker_only: bool = False, + native_only: bool = False, + str_names_only: bool = False, + unique_only: bool = False + ) -> Union[List[str], List[object]]: + """Finds and returns a list of matching data objects of type: + `DataProcessingWorker` and `DataProcessorServer`. + + :py:attr:`worker_only` match only processors with worker status + :py:attr:`server_only` match only processors with server status + :py:attr:`docker_only` match only docker processors + :py:attr:`native_only` match only native processors + :py:attr:`str_only` returns the processor_name instead of data object + :py:attr:`unique_only` remove duplicates from the matches + + `worker_only` and `server_only` are mutually exclusive to each other + `docker_only` and `native_only` are mutually exclusive to each other + `unique_only` is allowed only together with `str_names_only` + """ + + if worker_only and server_only: + raise ValueError(f"Only 'worker_only' or 'server_only' is allowed, not both.") + if docker_only and native_only: + raise ValueError(f"Only 'docker_only' or 'native_only' is allowed, not both.") + if not str_names_only and unique_only: + raise ValueError(f"Value 'unique_only' is allowed only together with 'str_names_only'") + + # Find all matching objects of type: + # DataProcessingWorker or DataProcessorServer + matched_objects = [] + for data_host in self.data_hosts: + if not server_only: + for data_worker in data_host.data_workers: + if data_worker.deploy_type == 'native' and docker_only: + continue + if data_worker.deploy_type == 'docker' and native_only: + continue + matched_objects.append(data_worker) + if not worker_only: + for data_server in data_host.data_servers: + if data_server.deploy_type == 'native' and docker_only: + continue + if data_server.deploy_type == 'docker' and native_only: + continue + matched_objects.append(data_server) + if str_names_only: + # gets only the processor names of the matched objects + name_list = [match.processor_name for match in matched_objects] + if unique_only: + # removes the duplicates, if any + return list(dict.fromkeys(name_list)) + return name_list + return matched_objects + + def resolve_processor_server_url(self, processor_name) -> str: + processor_server_url = '' + for data_host in self.data_hosts: + for data_server in data_host.data_servers: + if data_server.processor_name == processor_name: + processor_server_url = f'http://{data_host.address}:{data_server.port}/' + return processor_server_url def kill_all(self) -> None: - """ kill all started services: workers, database, queue + """ kill all started services: hosts, database, queue The order of killing 
is important to optimize graceful shutdown in the future. If RabbitMQ server is killed before killing Processing Workers, that may have bad outcome and leave @@ -57,128 +118,136 @@ def kill_all(self) -> None: self.kill_mongodb() self.kill_rabbitmq() - def deploy_hosts(self, rabbitmq_url: str, mongodb_url: str) -> None: - for host in self.hosts: - self.log.debug(f'Deploying processing workers on host: {host.config.address}') - - if (any(p.deploy_type == DeployType.native for p in host.config.processors) - and not host.ssh_client): - host.ssh_client = create_ssh_client( - host.config.address, - host.config.username, - host.config.password, - host.config.keypath - ) - if (any(p.deploy_type == DeployType.docker for p in host.config.processors) - and not host.docker_client): - host.docker_client = create_docker_client( - host.config.address, - host.config.username, - host.config.password, - host.config.keypath + def deploy_hosts( + self, + mongodb_url: str, + rabbitmq_url: str + ) -> None: + for host_data in self.data_hosts: + if host_data.needs_ssh: + host_data.create_client(client_type='ssh') + assert host_data.ssh_client + if host_data.needs_docker: + host_data.create_client(client_type='docker') + assert host_data.docker_client + + self.log.debug(f'Deploying processing workers on host: {host_data.address}') + for data_worker in host_data.data_workers: + self._deploy_processing_worker( + mongodb_url, + rabbitmq_url, + host_data, + data_worker ) - for processor in host.config.processors: - self._deploy_processing_worker(processor, host, rabbitmq_url, mongodb_url) - - # TODO: This is not optimal - the entire method should be refactored! - if (any(s.deploy_type == DeployType.native for s in host.config.servers) - and not host.ssh_client): - host.ssh_client = create_ssh_client( - host.config.address, - host.config.username, - host.config.password, - host.config.keypath - ) - if (any(s.deploy_type == DeployType.docker for s in host.config.servers) - and not host.docker_client): - host.docker_client = create_docker_client( - host.config.address, - host.config.username, - host.config.password, - host.config.keypath + self.log.debug(f'Deploying processor servers on host: {host_data.address}') + for data_server in host_data.data_servers: + self._deploy_processor_server( + mongodb_url, + host_data, + data_server ) - for server in host.config.servers: - self._deploy_processor_server(server, host, mongodb_url) - - if host.ssh_client: - host.ssh_client.close() - if host.docker_client: - host.docker_client.close() - - def _deploy_processing_worker(self, processor: WorkerConfig, host: HostData, - rabbitmq_url: str, mongodb_url: str) -> None: - self.log.debug(f"deploy '{processor.deploy_type}' processing worker: '{processor.name}' on '{host.config.address}'") - - for _ in range(processor.count): - if processor.deploy_type == DeployType.native: - assert host.ssh_client # to satisfy mypy - pid = self.start_native_processor( - client=host.ssh_client, - processor_name=processor.name, - queue_url=rabbitmq_url, - database_url=mongodb_url, - ) - host.pids_native.append(pid) - else: - assert processor.deploy_type == DeployType.docker - assert host.docker_client # to satisfy mypy - pid = self.start_docker_processor( - client=host.docker_client, - processor_name=processor.name, - queue_url=rabbitmq_url, - database_url=mongodb_url - ) - host.pids_docker.append(pid) - sleep(0.1) + if host_data.ssh_client: + host_data.ssh_client.close() + host_data.ssh_client = None + if host_data.docker_client: + 
host_data.docker_client.close() + host_data.docker_client = None + + def _deploy_processing_worker( + self, + mongodb_url: str, + rabbitmq_url: str, + host_data: DataHost, + data_worker: DataProcessingWorker + ) -> None: + self.log.debug(f"Deploying processing worker, " + f"environment: '{data_worker.deploy_type}', " + f"name: '{data_worker.processor_name}', " + f"address: '{host_data.address}'") + + if data_worker.deploy_type == 'native': + assert host_data.ssh_client # to satisfy mypy + pid = self.start_native_processor( + ssh_client=host_data.ssh_client, + processor_name=data_worker.processor_name, + queue_url=rabbitmq_url, + database_url=mongodb_url, + ) + data_worker.pid = pid + else: + assert data_worker.deploy_type == 'docker' + assert host_data.docker_client # to satisfy mypy + pid = self.start_docker_processor( + docker_client=host_data.docker_client, + processor_name=data_worker.processor_name, + _queue_url=rabbitmq_url, + _database_url=mongodb_url + ) + data_worker.pid = pid + sleep(0.2) # TODO: Revisit this to remove code duplications of deploy_* methods - def _deploy_processor_server(self, server: ProcessorServerConfig, host: HostData, mongodb_url: str) -> None: - self.log.debug(f"deploy '{server.deploy_type}' processor server: '{server.name}' on '{host.config.address}'") - if server.deploy_type == DeployType.native: - assert host.ssh_client + def _deploy_processor_server( + self, + mongodb_url: str, + host_data: DataHost, + data_server: DataProcessorServer, + ) -> None: + self.log.debug(f"Deploying processing worker, " + f"environment: '{data_server.deploy_type}', " + f"name: '{data_server.processor_name}', " + f"address: '{data_server.host}:{data_server.port}'") + + if data_server.deploy_type == 'native': + assert host_data.ssh_client pid = self.start_native_processor_server( - client=host.ssh_client, - processor_name=server.name, - agent_address=f'{host.config.address}:{server.port}', + ssh_client=host_data.ssh_client, + processor_name=data_server.processor_name, + agent_address=f'{data_server.host}:{data_server.port}', database_url=mongodb_url, ) - host.processor_server_pids_native.append(pid) + data_server.pid = pid - if server.name in host.processor_server_ports: - if host.processor_server_ports[server.name]: - host.processor_server_ports[server.name] = host.processor_server_ports[server.name].append(server.port) + if data_server.processor_name in host_data.server_ports: + name = data_server.processor_name + port = data_server.port + if host_data.server_ports[name]: + host_data.server_ports[name] = host_data.server_ports[name].append(port) else: - host.processor_server_ports[server.name] = [server.port] + host_data.server_ports[name] = [port] else: - host.processor_server_ports[server.name] = [server.port] + host_data.server_ports[data_server.processor_name] = [data_server.port] else: raise Exception("Deploying docker processor server is not supported yet!") - def deploy_rabbitmq(self, image: str, detach: bool, remove: bool, - ports_mapping: Union[Dict, None] = None) -> str: - """Start docker-container with rabbitmq - - This method deploys the RabbitMQ Server. Handling of creation of queues, submitting messages - to queues, and receiving messages from queues is part of the RabbitMQ Library which is part - of the OCR-D WebAPI implementation. 
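# Sketch of a deploy_rabbitmq() call with an explicit port mapping (hypothetical
# values); by default the method shown below maps 5672, 15672 and 25672:
#
#     deployer.deploy_rabbitmq(
#         image='rabbitmq:3-management', detach=True, remove=True,
#         ports_mapping={5672: 5672, 15672: 15672, 25672: 25672})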
- """ + def deploy_rabbitmq( + self, + image: str, + detach: bool, + remove: bool, + ports_mapping: Union[Dict, None] = None + ) -> str: self.log.debug(f"Trying to deploy '{image}', with modes: " f"detach='{detach}', remove='{remove}'") - if not self.config or not self.config.queue.address: + if not self.data_queue or not self.data_queue.address: raise ValueError('Deploying RabbitMQ has failed - missing configuration.') - client = create_docker_client(self.config.queue.address, self.config.queue.username, - self.config.queue.password, self.config.queue.keypath) + client = create_docker_client( + self.data_queue.address, + self.data_queue.ssh_username, + self.data_queue.ssh_password, + self.data_queue.ssh_keypath + ) if not ports_mapping: # 5672, 5671 - used by AMQP 0-9-1 and AMQP 1.0 clients without and with TLS # 15672, 15671: HTTP API clients, management UI and rabbitmq admin, without and with TLS # 25672: used for internode and CLI tools communication and is allocated from # a dynamic range (limited to a single port by default, computed as AMQP port + 20000) ports_mapping = { - 5672: self.config.queue.port, + 5672: self.data_queue.port, 15672: 15672, 25672: 25672 } @@ -189,61 +258,52 @@ def deploy_rabbitmq(self, image: str, detach: bool, remove: bool, ports=ports_mapping, # The default credentials to be used by the processing workers environment=[ - f'RABBITMQ_DEFAULT_USER={self.config.queue.credentials[0]}', - f'RABBITMQ_DEFAULT_PASS={self.config.queue.credentials[1]}' + f'RABBITMQ_DEFAULT_USER={self.data_queue.username}', + f'RABBITMQ_DEFAULT_PASS={self.data_queue.password}' ] ) assert res and res.id, \ - f'Failed to start RabbitMQ docker container on host: {self.config.mongo.address}' - self.mq_pid = res.id + f'Failed to start RabbitMQ docker container on host: {self.data_queue.address}' + self.data_queue.pid = res.id client.close() - # Build the RabbitMQ Server URL to return - rmq_host = self.config.queue.address - # note, integer validation is already performed - rmq_port = int(self.config.queue.port) - # the default virtual host since no field is - # provided in the processing server config.yml + rmq_host = self.data_queue.address + rmq_port = int(self.data_queue.port) rmq_vhost = '/' - self.wait_for_rabbitmq_availability(rmq_host, rmq_port, rmq_vhost, - self.config.queue.credentials[0], - self.config.queue.credentials[1]) - - rabbitmq_hostinfo = f'{rmq_host}:{rmq_port}{rmq_vhost}' - self.log.info(f'The RabbitMQ server was deployed on host: {rabbitmq_hostinfo}') - return rabbitmq_hostinfo - - def wait_for_rabbitmq_availability(self, host: str, port: int, vhost: str, username: str, - password: str) -> None: - max_waiting_steps = 15 - while max_waiting_steps > 0: - try: - dummy_publisher = RMQPublisher(host=host, port=port, vhost=vhost) - dummy_publisher.authenticate_and_connect(username=username, password=password) - except Exception: - max_waiting_steps -= 1 - sleep(2) - else: - # TODO: Disconnect the dummy_publisher here before returning... 
- return - raise RuntimeError('Error waiting for queue startup: timeout exceeded') - - def deploy_mongodb(self, image: str, detach: bool, remove: bool, - ports_mapping: Union[Dict, None] = None) -> str: - """ Start mongodb in docker - """ + wait_for_rabbitmq_availability( + host=rmq_host, + port=rmq_port, + vhost=rmq_vhost, + username=self.data_queue.username, + password=self.data_queue.password + ) + self.log.info(f'The RabbitMQ server was deployed on URL: ' + f'{rmq_host}:{rmq_port}{rmq_vhost}') + return self.data_queue.url + + def deploy_mongodb( + self, + image: str, + detach: bool, + remove: bool, + ports_mapping: Union[Dict, None] = None + ) -> str: self.log.debug(f"Trying to deploy '{image}', with modes: " f"detach='{detach}', remove='{remove}'") - if not self.config or not self.config.mongo.address: + if not self.data_mongo or not self.data_mongo.address: raise ValueError('Deploying MongoDB has failed - missing configuration.') - client = create_docker_client(self.config.mongo.address, self.config.mongo.username, - self.config.mongo.password, self.config.mongo.keypath) + client = create_docker_client( + self.data_mongo.address, + self.data_mongo.ssh_username, + self.data_mongo.ssh_password, + self.data_mongo.ssh_keypath + ) if not ports_mapping: ports_mapping = { - 27017: self.config.mongo.port + 27017: self.data_mongo.port } res = client.containers.run( image=image, @@ -253,92 +313,114 @@ def deploy_mongodb(self, image: str, detach: bool, remove: bool, ) if not res or not res.id: raise RuntimeError('Failed to start MongoDB docker container on host: ' - f'{self.config.mongo.address}') - self.mongo_pid = res.id + f'{self.data_mongo.address}') + self.data_mongo.pid = res.id client.close() - mongodb_hostinfo = f'{self.config.mongo.address}:{self.config.mongo.port}' + mongodb_hostinfo = f'{self.data_mongo.address}:{self.data_mongo.port}' self.log.info(f'The MongoDB was deployed on host: {mongodb_hostinfo}') return mongodb_hostinfo def kill_rabbitmq(self) -> None: - if not self.mq_pid: + if not self.data_queue.pid: self.log.warning('No running RabbitMQ instance found') return - client = create_docker_client(self.config.queue.address, self.config.queue.username, - self.config.queue.password, self.config.queue.keypath) - client.containers.get(self.mq_pid).stop() - self.mq_pid = None + client = create_docker_client( + self.data_queue.address, + self.data_queue.ssh_username, + self.data_queue.ssh_password, + self.data_queue.ssh_keypath + ) + client.containers.get(self.data_queue.pid).stop() + self.data_queue.pid = None client.close() self.log.info('The RabbitMQ is stopped') def kill_mongodb(self) -> None: - if not self.mongo_pid: + if not self.data_mongo.pid: self.log.warning('No running MongoDB instance found') return - client = create_docker_client(self.config.mongo.address, self.config.mongo.username, - self.config.mongo.password, self.config.mongo.keypath) - client.containers.get(self.mongo_pid).stop() - self.mongo_pid = None + client = create_docker_client( + self.data_mongo.address, + self.data_mongo.ssh_username, + self.data_mongo.ssh_password, + self.data_mongo.ssh_keypath + ) + client.containers.get(self.data_mongo.pid).stop() + self.data_mongo.pid = None client.close() self.log.info('The MongoDB is stopped') def kill_hosts(self) -> None: self.log.debug('Starting to kill/stop hosts') # Kill processing hosts - for host in self.hosts: - self.log.debug(f'Killing/Stopping processing workers on host: {host.config.address}') - if host.ssh_client: - host.ssh_client = 
create_ssh_client(host.config.address, host.config.username, - host.config.password, host.config.keypath) - if host.docker_client: - host.docker_client = create_docker_client(host.config.address, host.config.username, - host.config.password, host.config.keypath) - # Kill deployed OCR-D processor instances on this Processing worker host - self.kill_processing_workers(host) - - # Kill deployed Processor Server instances on this host - self.kill_processor_servers(host) + for host_data in self.data_hosts: + if host_data.needs_ssh: + host_data.create_client(client_type='ssh') + assert host_data.ssh_client + if host_data.needs_docker: + host_data.create_client(client_type='ssh') + assert host_data.docker_client + + self.log.debug(f'Killing/Stopping processing workers on host: {host_data.address}') + self.kill_processing_workers(host_data) + + self.log.debug(f'Killing/Stopping processor servers on host: {host_data.address}') + self.kill_processor_servers(host_data) + + if host_data.ssh_client: + host_data.ssh_client.close() + host_data.ssh_client = None + if host_data.docker_client: + host_data.docker_client.close() + host_data.docker_client = None # TODO: Optimize the code duplication from start_* and kill_* methods - def kill_processing_workers(self, host: HostData) -> None: - amount = len(host.pids_native) - if amount: - self.log.info(f"Trying to kill/stop {amount} native processing workers:") - for pid in host.pids_native: - self.log.info(f"Native with PID: '{pid}'") - host.ssh_client.exec_command(f'kill {pid}') - host.pids_native = [] - amount = len(host.pids_docker) - if amount: - self.log.info(f"Trying to kill/stop {amount} docker processing workers:") - for pid in host.pids_docker: - self.log.info(f"Docker with PID: '{pid}'") - host.docker_client.containers.get(pid).stop() - host.pids_docker = [] - - def kill_processor_servers(self, host: HostData) -> None: - amount = len(host.processor_server_pids_native) - if amount: - self.log.info(f"Trying to kill/stop {amount} native processor servers:") - for pid in host.processor_server_pids_native: - self.log.info(f"Native with PID: '{pid}'") - host.ssh_client.exec_command(f'kill {pid}') - host.processor_server_pids_native = [] - amount = len(host.processor_server_pids_docker) - if amount: - self.log.info(f"Trying to kill/stop {amount} docker processor servers:") - for pid in host.processor_server_pids_docker: - self.log.info(f"Docker with PID: '{pid}'") - host.docker_client.containers.get(pid).stop() - host.processor_server_pids_docker = [] - - def start_native_processor(self, client: SSHClient, processor_name: str, queue_url: str, - database_url: str) -> str: + def kill_processing_workers(self, host_data: DataHost) -> None: + amount = len(host_data.data_workers) + if not amount: + self.log.info(f'No active processing workers to be stopped.') + return + self.log.info(f"Trying to stop {amount} processing workers:") + for worker in host_data.data_workers: + if not worker.pid: + continue + if worker.deploy_type == 'native': + host_data.ssh_client.exec_command(f'kill {worker.pid}') + self.log.info(f"Stopped native worker with pid: '{worker.pid}'") + if worker.deploy_type == 'docker': + host_data.docker_client.containers.get(worker.pid).stop() + self.log.info(f"Stopped docker worker with container id: '{worker.pid}'") + host_data.data_workers = [] + + def kill_processor_servers(self, host_data: DataHost) -> None: + amount = len(host_data.data_servers) + if not amount: + self.log.info(f'No active processor servers to be stopped.') + return + 
self.log.info(f"Trying to stop {amount} processing workers:") + for server in host_data.data_servers: + if not server.pid: + continue + if server.deploy_type == 'native': + host_data.ssh_client.exec_command(f'kill {server.pid}') + self.log.info(f"Stopped native server with pid: '{server.pid}'") + if server.deploy_type == 'docker': + host_data.docker_client.containers.get(server.pid).stop() + self.log.info(f"Stopped docker server with container id: '{server.pid}'") + host_data.data_servers = [] + + def start_native_processor( + self, + ssh_client, + processor_name: str, + queue_url: str, + database_url: str + ) -> str: """ start a processor natively on a host via ssh Args: - client: paramiko SSHClient to execute commands on a host + ssh_client: paramiko SSHClient to execute commands on a host processor_name: name of processor to run queue_url: url to rabbitmq database_url: url to database @@ -347,7 +429,7 @@ def start_native_processor(self, client: SSHClient, processor_name: str, queue_u str: pid of running process """ self.log.info(f'Starting native processing worker: {processor_name}') - channel = client.invoke_shell() + channel = ssh_client.invoke_shell() stdin, stdout = channel.makefile('wb'), channel.makefile('rb') cmd = f'{processor_name} --agent_type worker --database {database_url} --queue {queue_url}' # the only way (I could find) to make it work to start a process in the background and @@ -355,42 +437,51 @@ def start_native_processor(self, client: SSHClient, processor_name: str, queue_u # printed with `echo $!` but it is printed inbetween other output. Because of that I added # `xyz` before and after the code to easily be able to filter out the pid via regex when # returning from the function - logpath = '/tmp/ocrd-processing-server-startup.log' - stdin.write(f"echo starting processing worker with '{cmd}' >> '{logpath}'\n") - stdin.write(f'{cmd} >> {logpath} 2>&1 &\n') + log_path = '/tmp/ocrd-processing-server-startup.log' + stdin.write(f"echo starting processing worker with '{cmd}' >> '{log_path}'\n") + stdin.write(f'{cmd} >> {log_path} 2>&1 &\n') stdin.write('echo xyz$!xyz \n exit \n') output = stdout.read().decode('utf-8') stdout.close() stdin.close() return re_search(r'xyz([0-9]+)xyz', output).group(1) # type: ignore - def start_docker_processor(self, client: CustomDockerClient, processor_name: str, - queue_url: str, database_url: str) -> str: - + def start_docker_processor( + self, + docker_client, + processor_name: str, + _queue_url: str, + _database_url: str + ) -> str: # TODO: Raise an exception here as well? # raise Exception("Deploying docker processing worker is not supported yet!") self.log.info(f'Starting docker container processor: {processor_name}') # TODO: add real command here to start processing server in docker here - res = client.containers.run('debian', 'sleep 500s', detach=True, remove=True) + res = docker_client.containers.run('debian', 'sleep 500s', detach=True, remove=True) assert res and res.id, f'Running processor: {processor_name} in docker-container failed' return res.id # TODO: Just a copy of the above start_native_processor() method. # Far from being great... 
But should be good as a starting point - def start_native_processor_server(self, client: SSHClient, processor_name: str, agent_address: str, database_url: str) -> str: + def start_native_processor_server( + self, + ssh_client, + processor_name: str, + agent_address: str, + database_url: str + ) -> str: self.log.info(f"Starting native processor server: {processor_name} on {agent_address}") - channel = client.invoke_shell() + channel = ssh_client.invoke_shell() stdin, stdout = channel.makefile('wb'), channel.makefile('rb') cmd = f'{processor_name} --agent_type server --agent_address {agent_address} --database {database_url}' port = agent_address.split(':')[1] - logpath = f'/tmp/server_{processor_name}_{port}_{getpid()}.log' + log_path = f'/tmp/server_{processor_name}_{port}_{getpid()}.log' # TODO: This entire stdin/stdout thing is broken with servers! - stdin.write(f"echo starting processor server with '{cmd}' >> '{logpath}'\n") - stdin.write(f'{cmd} >> {logpath} 2>&1 &\n') + stdin.write(f"echo starting processor server with '{cmd}' >> '{log_path}'\n") + stdin.write(f'{cmd} >> {log_path} 2>&1 &\n') stdin.write('echo xyz$!xyz \n exit \n') output = stdout.read().decode('utf-8') stdout.close() stdin.close() return re_search(r'xyz([0-9]+)xyz', output).group(1) # type: ignore - pass diff --git a/ocrd_network/ocrd_network/deployment_config.py b/ocrd_network/ocrd_network/deployment_config.py deleted file mode 100644 index de5465a915..0000000000 --- a/ocrd_network/ocrd_network/deployment_config.py +++ /dev/null @@ -1,103 +0,0 @@ -from typing import Dict -from yaml import safe_load -from ocrd_validators import ProcessingServerConfigValidator -from .deployment_utils import DeployType - -__all__ = [ - 'ProcessingServerConfig', - 'HostConfig', - 'WorkerConfig', - 'MongoConfig', - 'ProcessorServerConfig', - 'QueueConfig', -] - - -class ProcessingServerConfig: - def __init__(self, config_path: str) -> None: - # Load and validate the config - with open(config_path) as fin: - config = safe_load(fin) - report = ProcessingServerConfigValidator.validate(config) - if not report.is_valid: - raise Exception(f'Processing-Server configuration file is invalid:\n{report.errors}') - - # Split the configurations - self.mongo = MongoConfig(config['database']) - self.queue = QueueConfig(config['process_queue']) - self.hosts = [] - for host in config['hosts']: - self.hosts.append(HostConfig(host)) - - -class HostConfig: - """Class to wrap information for all processing-worker-hosts. - - Config information and runtime information is stored here. This class - should not do much but hold config information and runtime information. I - hope to make the code better understandable this way. 
Deployer should still - be the class who does things and this class here should be mostly passive - """ - - def __init__(self, config: dict) -> None: - self.address = config['address'] - self.username = config['username'] - self.password = config.get('password', None) - self.keypath = config.get('path_to_privkey', None) - self.processors = [] - for worker in config['workers']: - deploy_type = DeployType.from_str(worker['deploy_type']) - self.processors.append( - WorkerConfig(worker['name'], worker['number_of_instance'], deploy_type) - ) - self.servers = [] - for server in config['servers']: - deploy_type = DeployType.from_str(server['deploy_type']) - self.servers.append( - ProcessorServerConfig(server['name'], deploy_type, server['port']) - ) - - -class WorkerConfig: - """ - Class wrapping information from config file for an OCR-D processor - """ - def __init__(self, name: str, count: int, deploy_type: DeployType) -> None: - self.name = name - self.count = count - self.deploy_type = deploy_type - - -# TODO: Not a big fan of the way these configs work... -# Implemented this way to fit the general logic of previous impl -class ProcessorServerConfig: - def __init__(self, name: str, deploy_type: DeployType, port: int): - self.name = name - self.deploy_type = deploy_type - self.port = port - - -class MongoConfig: - """ Class to hold information for Mongodb-Docker container - """ - - def __init__(self, config: Dict) -> None: - self.address = config['address'] - self.port = int(config['port']) - self.username = config['ssh']['username'] - self.keypath = config['ssh'].get('path_to_privkey', None) - self.password = config['ssh'].get('password', None) - self.credentials = (config['credentials']['username'], config['credentials']['password']) - - -class QueueConfig: - """ Class to hold information for RabbitMQ-Docker container - """ - - def __init__(self, config: Dict) -> None: - self.address = config['address'] - self.port = int(config['port']) - self.username = config['ssh']['username'] - self.keypath = config['ssh'].get('path_to_privkey', None) - self.password = config['ssh'].get('password', None) - self.credentials = (config['credentials']['username'], config['credentials']['password']) diff --git a/ocrd_network/ocrd_network/deployment_utils.py b/ocrd_network/ocrd_network/deployment_utils.py index 8f6c5f2724..561aff8f95 100644 --- a/ocrd_network/ocrd_network/deployment_utils.py +++ b/ocrd_network/ocrd_network/deployment_utils.py @@ -1,65 +1,32 @@ from __future__ import annotations -from enum import Enum -from typing import Union, List -from distutils.spawn import find_executable as which -import re - from docker import APIClient, DockerClient from docker.transport import SSHHTTPAdapter from paramiko import AutoAddPolicy, SSHClient +from time import sleep -from ocrd_utils import getLogger -from .deployment_config import * +from .rabbitmq_utils import RMQPublisher __all__ = [ 'create_docker_client', 'create_ssh_client', - 'CustomDockerClient', - 'DeployType', - 'HostData' + 'wait_for_rabbitmq_availability' ] -def create_ssh_client(address: str, username: str, password: Union[str, None], - keypath: Union[str, None]) -> SSHClient: +def create_ssh_client(address: str, username: str, password: str = "", keypath: str = "") -> SSHClient: client = SSHClient() client.set_missing_host_key_policy(AutoAddPolicy) try: client.connect(hostname=address, username=username, password=password, key_filename=keypath) - except Exception: - getLogger(__name__).error(f"Error creating SSHClient for host: '{address}'") - 
raise + except Exception as error: + raise Exception(f"Error creating SSHClient of host '{address}', reason:") from error return client -def create_docker_client(address: str, username: str, password: Union[str, None], - keypath: Union[str, None]) -> CustomDockerClient: +def create_docker_client(address: str, username: str, password: str = "", keypath: str = "") -> CustomDockerClient: return CustomDockerClient(username, address, password=password, keypath=keypath) -class HostData: - """class to store runtime information for a host - """ - def __init__(self, config: HostConfig) -> None: - self.config = config - self.ssh_client: Union[SSHClient, None] = None - self.docker_client: Union[CustomDockerClient, None] = None - self.pids_native: List[str] = [] - self.pids_docker: List[str] = [] - # TODO: Revisit this, currently just mimicking the old impl - self.processor_server_pids_native: List[str] = [] - self.processor_server_pids_docker: List[str] = [] - # Key: processor_name, Value: list of ports - self.processor_server_ports: dict = {} - - @staticmethod - def from_config(config: List[HostConfig]) -> List[HostData]: - res = [] - for host_config in config: - res.append(HostData(host_config)) - return res - - class CustomDockerClient(DockerClient): """Wrapper for docker.DockerClient to use an own SshHttpAdapter. @@ -85,21 +52,20 @@ def __init__(self, user: str, host: str, **kwargs) -> None: # the super-constructor is not called on purpose: it solely instantiates the APIClient. The # missing `version` in that call would raise an error. APIClient is provided here as a # replacement for what the super-constructor does - if not user or not host: + if not (user and host): raise ValueError('Missing argument: user and host must both be provided') - if 'password' not in kwargs and 'keypath' not in kwargs: + if bool('password' not in kwargs) != ('keypath' not in kwargs): raise ValueError('Missing argument: one of password and keyfile is needed') self.api = APIClient(f'ssh://{host}', use_ssh_client=True, version='1.41') ssh_adapter = self.CustomSshHttpAdapter(f'ssh://{user}@{host}:22', **kwargs) self.api.mount('http+docker://ssh', ssh_adapter) class CustomSshHttpAdapter(SSHHTTPAdapter): - def __init__(self, base_url, password: Union[str, None] = None, - keypath: Union[str, None] = None) -> None: + def __init__(self, base_url, password: str = "", keypath: str = "") -> None: self.password = password self.keypath = keypath - if not self.password and not self.keypath: - raise Exception("either 'password' or 'keypath' must be provided") + if bool(self.password) == bool(self.keypath): + raise Exception("Either 'password' or 'keypath' must be provided") super().__init__(base_url) def _create_paramiko_client(self, base_url: str) -> None: @@ -115,18 +81,22 @@ def _create_paramiko_client(self, base_url: str) -> None: self.ssh_client.set_missing_host_key_policy(AutoAddPolicy) -class DeployType(Enum): - """ Deploy-Type of the processing server. 
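# Connection sketch for the refactored helpers (hypothetical host, user and key
# path). The docker helper expects exactly one non-empty value out of password
# and keypath, while create_ssh_client forwards both to paramiko:
#
#     ssh_client = create_ssh_client('worker1.example.org', 'ocrd',
#                                    keypath='/home/ocrd/.ssh/id_rsa')
#     docker_client = create_docker_client('worker1.example.org', 'ocrd',
#                                          keypath='/home/ocrd/.ssh/id_rsa')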
- """ - docker = 1 - native = 2 - - @staticmethod - def from_str(label: str) -> DeployType: - return DeployType[label.lower()] - - def is_native(self) -> bool: - return self == DeployType.native - - def is_docker(self) -> bool: - return self == DeployType.docker +def wait_for_rabbitmq_availability( + host: str, + port: int, + vhost: str, + username: str, + password: str +) -> None: + max_waiting_steps = 15 + while max_waiting_steps > 0: + try: + dummy_publisher = RMQPublisher(host=host, port=port, vhost=vhost) + dummy_publisher.authenticate_and_connect(username=username, password=password) + except Exception: + max_waiting_steps -= 1 + sleep(2) + else: + # TODO: Disconnect the dummy_publisher here before returning... + return + raise RuntimeError('Error waiting for queue startup: timeout exceeded') diff --git a/ocrd_network/ocrd_network/processing_server.py b/ocrd_network/ocrd_network/processing_server.py index 416da042b3..528136031a 100644 --- a/ocrd_network/ocrd_network/processing_server.py +++ b/ocrd_network/ocrd_network/processing_server.py @@ -9,10 +9,9 @@ from pika.exceptions import ChannelClosedByBroker -from ocrd_utils import getLogger, get_ocrd_tool_json +from ocrd_utils import getLogger from .database import initiate_database from .deployer import Deployer -from .deployment_config import ProcessingServerConfig from .models import ( DBProcessorJob, PYJobInput, @@ -25,7 +24,11 @@ validate_and_resolve_mets_path, validate_job_input, ) -from .utils import generate_created_time, generate_id +from .utils import ( + download_ocrd_all_tool_json, + generate_created_time, + generate_id +) class ProcessingServer(FastAPI): @@ -44,29 +47,27 @@ def __init__(self, config_path: str, host: str, port: int) -> None: title='OCR-D Processing Server', description='OCR-D processing and processors') self.log = getLogger(__name__) + self.log.info(f"Downloading ocrd all tool json") + self.ocrd_all_tool_json = download_ocrd_all_tool_json( + ocrd_all_url="https://ocr-d.de/js/ocrd-all-tool.json" + ) self.hostname = host self.port = port - self.config = ProcessingServerConfig(config_path) - self.deployer = Deployer(self.config) + # The deployer is used for: + # - deploying agents when the Processing Server is started + # - retrieving runtime data of agents + self.deployer = Deployer(config_path) self.mongodb_url = None - self.rmq_host = self.config.queue.address - self.rmq_port = self.config.queue.port + # TODO: Combine these under a single URL, rabbitmq_utils needs an update + self.rmq_host = self.deployer.data_queue.address + self.rmq_port = self.deployer.data_queue.port self.rmq_vhost = '/' - self.rmq_username = self.config.queue.credentials[0] - self.rmq_password = self.config.queue.credentials[1] + self.rmq_username = self.deployer.data_queue.username + self.rmq_password = self.deployer.data_queue.password # Gets assigned when `connect_publisher` is called on the working object self.rmq_publisher = None - # TODO: These will change dynamically - # according to the new requirements - # This list holds a set of all processing worker - # names mentioned in the config file - self._processing_workers_list = None - # This list holds a set of all processor server - # names mentioned in the config file - self._processor_servers_list = None - # Create routes self.router.add_api_route( path='/stop', @@ -129,27 +130,21 @@ def start(self) -> None: """ deploy agents (db, queue, workers) and start the processing server with uvicorn """ try: - rabbitmq_hostinfo = self.deployer.deploy_rabbitmq( - 
image='rabbitmq:3-management', detach=True, remove=True) - - # Assign the credentials to the rabbitmq url parameter - rabbitmq_url = f'amqp://{self.rmq_username}:{self.rmq_password}@{rabbitmq_hostinfo}' - - mongodb_hostinfo = self.deployer.deploy_mongodb( - image='mongo', detach=True, remove=True) + self.deployer.deploy_rabbitmq(image='rabbitmq:3-management', detach=True, remove=True) + rabbitmq_url = self.deployer.data_queue.url - self.mongodb_url = f'mongodb://{mongodb_hostinfo}' + self.deployer.deploy_mongodb(image='mongo', detach=True, remove=True) + self.mongodb_url = self.deployer.data_mongo.url # The RMQPublisher is initialized and a connection to the RabbitMQ is performed self.connect_publisher() - self.log.debug(f'Creating message queues on RabbitMQ instance url: {rabbitmq_url}') self.create_message_queues() - # Deploy processing hosts where processing workers are running on - # Note: A deployed processing worker starts listening to a message queue with id - # processor.name - self.deployer.deploy_hosts(rabbitmq_url, self.mongodb_url) + self.deployer.deploy_hosts( + mongodb_url=self.mongodb_url, + rabbitmq_url=rabbitmq_url + ) except Exception: self.log.error('Error during startup of processing server. ' 'Trying to kill parts of incompletely deployed service') @@ -191,37 +186,30 @@ def connect_publisher(self, enable_acks: bool = True) -> None: self.log.info('Successfully connected RMQPublisher.') def create_message_queues(self) -> None: - """Create the message queues based on the occurrence of `processor.name` in the config file + """ Create the message queues based on the occurrence of + `workers.name` in the config file. + """ + + # TODO: Remove + """ + queue_names = set([]) + for data_host in self.deployer.data_hosts: + for data_worker in data_host.data_workers: + queue_names.add(data_worker.processor_name) """ - for host in self.config.hosts: - for processor in host.processors: - # The existence/validity of the processor.name is not tested. - # Even if an ocr-d processor does not exist, the queue is created - self.log.info(f'Creating a message queue with id: {processor.name}') - self.rmq_publisher.create_queue(queue_name=processor.name) - - @property - def processing_workers_list(self): - if self._processing_workers_list: - return self._processing_workers_list - res = set([]) - for host in self.config.hosts: - for processor in host.processors: - res.add(processor.name) - self._processing_workers_list = list(res) - return self._processing_workers_list - - # TODO: Revisit. This is just mimicking the method above. - @property - def processor_servers_list(self): - if self._processor_servers_list: - return self._processor_servers_list - res = set([]) - for host in self.config.hosts: - for processor_server in host.servers: - res.add(processor_server.name) - self._processor_servers_list = list(res) - return self._processor_server_list + + # The abstract version of the above lines + queue_names = self.deployer.find_matching_processors( + worker_only=True, + str_names_only=True, + unique_only=True + ) + + for queue_name in queue_names: + # The existence/validity of the worker.name is not tested. 
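# For illustration: with a config that defines workers named
# 'ocrd-cis-ocropy-binarize' on two hosts and 'ocrd-tesserocr-recognize' on one
# (hypothetical names), the find_matching_processors() call above yields the
# de-duplicated list ['ocrd-cis-ocropy-binarize', 'ocrd-tesserocr-recognize'],
# and one message queue is created per name.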
+ # Even if an ocr-d processor does not exist, the queue is created + self.log.info(f'Creating a message queue with id: {queue_name}') + self.rmq_publisher.create_queue(queue_name=queue_name) @staticmethod def create_processing_message(job: DBProcessorJob) -> OcrdProcessingMessage: @@ -255,14 +243,7 @@ def check_if_queue_exists(self, processor_name): ) def query_ocrd_tool_json_from_server(self, processor_name): - processor_server_url = None - # Check if a processor server with processor_name was deployed - # TODO: Revisit when the config file classes are refactored (made more abstract). - # This is such a mess now due to the bad abstraction and bad naming conventions! - for host_config in self.config.hosts: - for processor_server in host_config.servers: - if processor_server.name == processor_name: - processor_server_url = f"http://{host_config.address}:{processor_server.port}/" + processor_server_url = self.deployer.resolve_processor_server_url(processor_name) if not processor_server_url: self.log.exception(f"Processor Server of '{processor_name}' is not available") raise HTTPException( @@ -304,14 +285,19 @@ async def push_processor_job(self, processor_name: str, data: PYJobInput) -> PYJ # TODO: Revisit and remove duplications between push_to_* methods async def push_to_processing_queue(self, processor_name: str, job_input: PYJobInput) -> PYJobOutput: - # TODO: Getting the tool shall be adapted to the change in #1028 - ocrd_tool = get_ocrd_tool_json(processor_name) + ocrd_tool = await self.get_processor_info(processor_name) validate_job_input(self.log, processor_name, ocrd_tool, job_input) job_input = await validate_and_resolve_mets_path(self.log, job_input, resolve=False) if not self.rmq_publisher: raise Exception('RMQPublisher is not connected') - if processor_name not in self.processing_workers_list: + deployed_processors = self.deployer.find_matching_processors( + worker_only=True, + str_names_only=True, + unique_only=True + ) + if processor_name not in deployed_processors: self.check_if_queue_exists(processor_name) + job = DBProcessorJob( **job_input.dict(exclude_unset=True, exclude_none=True), job_id=generate_id(), @@ -364,23 +350,25 @@ async def get_processor_job(self, processor_name: str, job_id: str) -> PYJobOutp async def get_processor_info(self, processor_name) -> Dict: """ Return a processor's ocrd-tool.json """ - if processor_name not in self._processing_workers_list: + ocrd_tool = self.ocrd_all_tool_json.get(processor_name, None) + if not ocrd_tool: raise HTTPException( - status_code=status.HTTP_404_NOT_FOUND, - detail='Processor not available' + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Ocrd tool JSON of '{processor_name}' not available!" ) - return get_ocrd_tool_json(processor_name) + + # TODO: Returns the ocrd tool json even of processors + # that are not deployed. This may or may not be desired. + return ocrd_tool async def list_processors(self) -> List[str]: - """ Return a list of all available processors - """ - processor_names_list = [] - - # TODO: 1) Revisit this. 
Currently, it adds labels in - # front of the names for differentiation purposes - # TODO: 2) This could be optimized by holding a dynamic list - for worker_name in self.processing_workers_list: - processor_names_list.append(f'worker {worker_name}') - for server_name in self.processor_servers_list: - processor_names_list.append(f'server {server_name}') + # There is no caching on the Processing Server side + processor_names_list = self.deployer.find_matching_processors( + docker_only=False, + native_only=False, + worker_only=False, + server_only=False, + str_names_only=True, + unique_only=True + ) return processor_names_list diff --git a/ocrd_network/ocrd_network/runtime_data.py b/ocrd_network/ocrd_network/runtime_data.py new file mode 100644 index 0000000000..9ffb304429 --- /dev/null +++ b/ocrd_network/ocrd_network/runtime_data.py @@ -0,0 +1,122 @@ +from __future__ import annotations +from typing import Dict, List + +from .deployment_utils import ( + create_docker_client, + create_ssh_client +) + +__all__ = [ + 'DataHost', + 'DataMongoDB', + 'DataProcessingWorker', + 'DataProcessorServer', + 'DataRabbitMQ' +] + + +class DataHost: + def __init__(self, config: Dict) -> None: + self.address = config['address'] + self.username = config['username'] + self.password = config.get('password', None) + self.keypath = config.get('path_to_privkey', None) + + # These flags are used to track whether a connection + # of the specified type will be required + self.needs_ssh: bool = False + self.needs_docker: bool = False + + self.ssh_client = None + self.docker_client = None + + # TODO: Not sure this is DS is ideal, seems off + self.data_workers: List[DataProcessingWorker] = [] + self.data_servers: List[DataProcessorServer] = [] + + for worker in config['workers']: + name = worker['name'] + count = worker['number_of_instance'] + deploy_type = worker['deploy_type'] + for _ in range(count): + self.data_workers.append(DataProcessingWorker(self.address, deploy_type, name)) + if not self.needs_ssh and deploy_type == 'native': + self.needs_ssh = True + if not self.needs_docker and deploy_type == 'docker': + self.needs_docker = True + + for server in config['servers']: + name = server['name'] + deploy_type = server['deploy_type'] + port = server['port'] + self.data_servers.append(DataProcessorServer(self.address, port, deploy_type, name)) + if not self.needs_ssh and deploy_type == 'native': + self.needs_ssh = True + if not self.needs_docker and deploy_type == 'docker': + self.needs_docker = True + + # Key: processor_name, Value: list of ports + self.server_ports: dict = {} + + def create_client(self, client_type: str): + if client_type not in ['docker', 'ssh']: + raise ValueError(f'Host client type cannot be of type: {client_type}') + if client_type == 'ssh': + if not self.ssh_client: + self.ssh_client = create_ssh_client( + self.address, self.username, self.password, self.keypath) + return self.ssh_client + if client_type == 'docker': + if not self.docker_client: + self.docker_client = create_docker_client( + self.address, self.username, self.password, self.keypath + ) + return self.docker_client + + +class DataProcessingWorker: + def __init__(self, host: str, deploy_type: str, processor_name: str) -> None: + self.host = host + self.deploy_type = deploy_type + self.processor_name = processor_name + # Assigned when deployed + self.pid = None + + +class DataProcessorServer: + def __init__(self, host: str, port: int, deploy_type: str, processor_name: str) -> None: + self.host = host + self.port = port + 
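# Shape of a host entry consumed by DataHost above, as a sketch with
# hypothetical values (keys follow the accesses in __init__):
#
#     example_host_config = {
#         'address': 'worker1.example.org',
#         'username': 'ocrd',
#         'path_to_privkey': '/home/ocrd/.ssh/id_rsa',
#         'workers': [
#             {'name': 'ocrd-cis-ocropy-binarize',
#              'number_of_instance': 2, 'deploy_type': 'native'},
#         ],
#         'servers': [
#             {'name': 'ocrd-dummy', 'deploy_type': 'native', 'port': 8090},
#         ],
#     }
#     data_host = DataHost(example_host_config)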
self.deploy_type = deploy_type + self.processor_name = processor_name + # Assigned when deployed + self.pid = None + + +class DataMongoDB: + def __init__(self, config: Dict) -> None: + self.address = config['address'] + self.port = int(config['port']) + self.ssh_username = config['ssh']['username'] + self.ssh_keypath = config['ssh'].get('path_to_privkey', None) + self.ssh_password = config['ssh'].get('password', None) + self.username = config['credentials']['username'] + self.password = config['credentials']['password'] + self.url = f'mongodb://{self.address}:{self.port}' + # Assigned when deployed + self.pid = None + + +class DataRabbitMQ: + def __init__(self, config: Dict) -> None: + self.address = config['address'] + self.port = int(config['port']) + self.ssh_username = config['ssh']['username'] + self.ssh_keypath = config['ssh'].get('path_to_privkey', None) + self.ssh_password = config['ssh'].get('password', None) + self.vhost = '/' + self.username = config['credentials']['username'] + self.password = config['credentials']['password'] + self.url = f'amqp://{self.username}:{self.password}@{self.address}:{self.port}{self.vhost}' + # Assigned when deployed + self.pid = None diff --git a/ocrd_network/ocrd_network/utils.py b/ocrd_network/ocrd_network/utils.py index 568544c67f..d41a1b13ab 100644 --- a/ocrd_network/ocrd_network/utils.py +++ b/ocrd_network/ocrd_network/utils.py @@ -1,10 +1,15 @@ from datetime import datetime from functools import wraps -from re import match as re_match from os import environ from pika import URLParameters from pymongo import uri_parser as mongo_uri_parser +from re import match as re_match +import requests +from typing import Dict from uuid import uuid4 +from yaml import safe_load + +from ocrd_validators import ProcessingServerConfigValidator # Based on: https://gist.github.com/phizaz/20c36c6734878c6ec053245a477572ec @@ -54,6 +59,16 @@ def generate_id() -> str: return str(uuid4()) +def validate_and_load_config(config_path: str) -> Dict: + # Load and validate the config + with open(config_path) as fin: + config = safe_load(fin) + report = ProcessingServerConfigValidator.validate(config) + if not report.is_valid: + raise Exception(f'Processing-Server configuration file is invalid:\n{report.errors}') + return config + + def verify_database_uri(mongodb_address: str) -> str: try: # perform validation check @@ -83,3 +98,13 @@ def verify_and_parse_mq_uri(rabbitmq_address: str): 'vhost': url_params.virtual_host } return parsed_data + + +def download_ocrd_all_tool_json(ocrd_all_url: str): + if not ocrd_all_url: + raise ValueError(f'The URL of ocrd all tool json is empty') + headers = {'Accept': 'application/json'} + response = requests.get(ocrd_all_url, headers=headers) + if not response.status_code == 200: + raise ValueError(f"Failed to download ocrd all tool json from: '{ocrd_all_url}'") + return response.json() From 94c76e039d00f6d5728b0acae12f6bda1f29668f Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Sat, 1 Apr 2023 02:28:16 +0200 Subject: [PATCH 13/34] small improvements --- ocrd/ocrd/cli/processing_server.py | 4 +- ocrd/ocrd/cli/processing_worker.py | 2 +- ocrd/ocrd/cli/processor_server.py | 10 +- ocrd/ocrd/decorators/__init__.py | 101 +++++++++--------- ocrd/ocrd/decorators/ocrd_cli_options.py | 10 +- ocrd_network/ocrd_network/__init__.py | 2 +- ocrd_network/ocrd_network/deployer.py | 4 +- ocrd_network/ocrd_network/param_validators.py | 4 +- 8 files changed, 72 insertions(+), 65 deletions(-) diff --git a/ocrd/ocrd/cli/processing_server.py 
b/ocrd/ocrd/cli/processing_server.py index a65e02a71d..3f1d50fae9 100644 --- a/ocrd/ocrd/cli/processing_server.py +++ b/ocrd/ocrd/cli/processing_server.py @@ -10,7 +10,7 @@ from ocrd_utils import initLogging from ocrd_network import ( ProcessingServer, - ProcessingServerParamType + ServerAddressParamType, ) @@ -19,7 +19,7 @@ @click.option('-a', '--address', default="localhost:8080", help='The URL of the Processing server, format: host:port', - type=ProcessingServerParamType(), + type=ServerAddressParamType(), required=True) def processing_server_cli(path_to_config, address: str): """ diff --git a/ocrd/ocrd/cli/processing_worker.py b/ocrd/ocrd/cli/processing_worker.py index 75828732d1..f9b5c8f1bc 100644 --- a/ocrd/ocrd/cli/processing_worker.py +++ b/ocrd/ocrd/cli/processing_worker.py @@ -30,7 +30,7 @@ help='The URL of the MongoDB, format: mongodb://host:port', type=DatabaseParamType(), required=True) -def processing_worker_cli(processor_name: str, agent_type: str, queue: str, database: str): +def processing_worker_cli(processor_name: str, queue: str, database: str): """ Start a processing worker (a specific ocr-d processor) """ diff --git a/ocrd/ocrd/cli/processor_server.py b/ocrd/ocrd/cli/processor_server.py index bd272478a4..ef45f61aa1 100644 --- a/ocrd/ocrd/cli/processor_server.py +++ b/ocrd/ocrd/cli/processor_server.py @@ -10,23 +10,23 @@ from ocrd_utils import initLogging from ocrd_network import ( DatabaseParamType, - ProcessingServerParamType, + ServerAddressParamType, ProcessorServer, ) @click.command('processor-server') @click.argument('processor_name', required=True, type=click.STRING) -@click.option('--agent_address', +@click.option('-a', '--address', help='The URL of the processor server, format: host:port', - type=ProcessingServerParamType(), + type=ServerAddressParamType(), required=True) @click.option('-d', '--database', default="mongodb://localhost:27018", help='The URL of the MongoDB, format: mongodb://host:port', type=DatabaseParamType(), required=True) -def processor_server_cli(processor_name: str, agent_type: str, agent_address: str, database: str): +def processor_server_cli(processor_name: str, address: str, database: str): """ Start ocr-d processor as a server """ @@ -36,7 +36,7 @@ def processor_server_cli(processor_name: str, agent_type: str, agent_address: st try: # TODO: Better validate that inside the ProcessorServer itself - host, port = agent_address.split(':') + host, port = address.split(':') processor_server = ProcessorServer( mongodb_addr=database, processor_name=processor_name, diff --git a/ocrd/ocrd/decorators/__init__.py b/ocrd/ocrd/decorators/__init__.py index ca5d308e9e..98bbb4ba35 100644 --- a/ocrd/ocrd/decorators/__init__.py +++ b/ocrd/ocrd/decorators/__init__.py @@ -128,54 +128,57 @@ def exit(): def check_and_run_network_agent(ProcessorClass, agent_type: str, agent_address: str, database: str, queue: str): if not agent_type and (agent_address or database or queue): - raise ValueError("Options '--database', '--queue', and 'agent_address' are valid only with '--agent_type'") - if agent_type: - if not database: - raise ValueError("Options '--agent_type' and '--database' are mutually inclusive") - allowed_agent_types = ['server', 'worker'] - if agent_type not in allowed_agent_types: - agents_str = ', '.join(allowed_agent_types) - raise ValueError(f"Wrong agent type parameter. 
Allowed agent types: {agents_str}") - if agent_type == 'server': - if not agent_address: - raise ValueError("Options '--agent_type=server' and '--agent_address' are mutually inclusive") - if queue: - raise ValueError("Options '--agent_type=server' and '--queue' are mutually exclusive") - if agent_type == 'worker': - if not queue: - raise ValueError("Options '--agent_type=worker' and '--queue' are mutually inclusive") - if agent_address: - raise ValueError("Options '--agent_type=worker' and '--agent_address' are mutually exclusive") + raise ValueError("Options '--database', '--queue', and '--address' are valid only with '--type'") + if not agent_type: + return - import logging - logging.getLogger('ocrd.network').setLevel(logging.DEBUG) + if not database: + raise ValueError("Options '--type' and '--database' are mutually inclusive") + allowed_agent_types = ['server', 'worker'] + if agent_type not in allowed_agent_types: + agents_str = ', '.join(allowed_agent_types) + raise ValueError(f"Wrong type parameter. Allowed types: {agents_str}") + if agent_type == 'server': + if not agent_address: + raise ValueError("Options '--type=server' and '--address' are mutually inclusive") + if queue: + raise ValueError("Options '--type=server' and '--queue' are mutually exclusive") + if agent_type == 'worker': + if not queue: + raise ValueError("Options '--type=worker' and '--queue' are mutually inclusive") + if agent_address: + raise ValueError("Options '--type=worker' and '--address' are mutually exclusive") - processor = ProcessorClass(workspace=None, dump_json=True) - if agent_type == 'worker': - try: - # TODO: Passing processor_name and ocrd_tool is reduntant - processing_worker = ProcessingWorker( - rabbitmq_addr=queue, - mongodb_addr=database, - processor_name=processor.ocrd_tool['executable'], - ocrd_tool=processor.ocrd_tool, - processor_class=ProcessorClass, - ) - # The RMQConsumer is initialized and a connection to the RabbitMQ is performed - processing_worker.connect_consumer() - # Start consuming from the queue with name `processor_name` - processing_worker.start_consuming() - except Exception as e: - sys.exit(f"Processing worker has failed with error: {e}") - if agent_type == 'server': - try: - # TODO: Better validate that inside the ProcessorServer itself - host, port = agent_address.split(':') - processor_server = ProcessorServer( - mongodb_addr=database, - processor_name=processor.ocrd_tool['executable'], - processor_class=ProcessorClass, - ) - processor_server.run_server(host=host, port=int(port)) - except Exception as e: - sys.exit(f"Processor server has failed with error: {e}") + import logging + logging.getLogger('ocrd.network').setLevel(logging.DEBUG) + + processor = ProcessorClass(workspace=None, dump_json=True) + if agent_type == 'worker': + try: + # TODO: Passing processor_name and ocrd_tool is reduntant + processing_worker = ProcessingWorker( + rabbitmq_addr=queue, + mongodb_addr=database, + processor_name=processor.ocrd_tool['executable'], + ocrd_tool=processor.ocrd_tool, + processor_class=ProcessorClass, + ) + # The RMQConsumer is initialized and a connection to the RabbitMQ is performed + processing_worker.connect_consumer() + # Start consuming from the queue with name `processor_name` + processing_worker.start_consuming() + except Exception as e: + sys.exit(f"Processing worker has failed with error: {e}") + if agent_type == 'server': + try: + # TODO: Better validate that inside the ProcessorServer itself + host, port = agent_address.split(':') + processor_server = 
ProcessorServer( + mongodb_addr=database, + processor_name=processor.ocrd_tool['executable'], + processor_class=ProcessorClass, + ) + processor_server.run_server(host=host, port=int(port)) + except Exception as e: + sys.exit(f"Processor server has failed with error: {e}") + sys.exit(0) diff --git a/ocrd/ocrd/decorators/ocrd_cli_options.py b/ocrd/ocrd/decorators/ocrd_cli_options.py index 42bed275bb..5723471ce7 100644 --- a/ocrd/ocrd/decorators/ocrd_cli_options.py +++ b/ocrd/ocrd/decorators/ocrd_cli_options.py @@ -2,7 +2,11 @@ from click import option, Path from .parameter_option import parameter_option, parameter_override_option from .loglevel_option import loglevel_option -from ocrd_network import QueueServerParamType, DatabaseParamType +from ocrd_network import ( + DatabaseParamType, + ServerAddressParamType, + QueueServerParamType +) def ocrd_cli_options(f): @@ -34,8 +38,8 @@ def cli(mets_url): parameter_option, parameter_override_option, loglevel_option, - option('--agent_type', type=click.STRING), - option('--agent_address', type=click.STRING), + option('--type', 'agent_type', type=click.Choice(['worker', 'server'])), + option('--address', 'agent_address', type=ServerAddressParamType()), option('--queue', type=QueueServerParamType()), option('--database', type=DatabaseParamType()), option('-C', '--show-resource'), diff --git a/ocrd_network/ocrd_network/__init__.py b/ocrd_network/ocrd_network/__init__.py index e751ecbefb..840864fed8 100644 --- a/ocrd_network/ocrd_network/__init__.py +++ b/ocrd_network/ocrd_network/__init__.py @@ -27,6 +27,6 @@ from .processor_server import ProcessorServer from .param_validators import ( DatabaseParamType, - ProcessingServerParamType, + ServerAddressParamType, QueueServerParamType ) diff --git a/ocrd_network/ocrd_network/deployer.py b/ocrd_network/ocrd_network/deployer.py index 2c9af781b9..c8302281ab 100644 --- a/ocrd_network/ocrd_network/deployer.py +++ b/ocrd_network/ocrd_network/deployer.py @@ -431,7 +431,7 @@ def start_native_processor( self.log.info(f'Starting native processing worker: {processor_name}') channel = ssh_client.invoke_shell() stdin, stdout = channel.makefile('wb'), channel.makefile('rb') - cmd = f'{processor_name} --agent_type worker --database {database_url} --queue {queue_url}' + cmd = f'{processor_name} --type worker --database {database_url} --queue {queue_url}' # the only way (I could find) to make it work to start a process in the background and # return early is this construction. The pid of the last started background process is # printed with `echo $!` but it is printed inbetween other output. Because of that I added @@ -474,7 +474,7 @@ def start_native_processor_server( self.log.info(f"Starting native processor server: {processor_name} on {agent_address}") channel = ssh_client.invoke_shell() stdin, stdout = channel.makefile('wb'), channel.makefile('rb') - cmd = f'{processor_name} --agent_type server --agent_address {agent_address} --database {database_url}' + cmd = f'{processor_name} --type server --address {agent_address} --database {database_url}' port = agent_address.split(':')[1] log_path = f'/tmp/server_{processor_name}_{port}_{getpid()}.log' # TODO: This entire stdin/stdout thing is broken with servers! 
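For context on the start_native_processor / start_native_processor_server changes above: both rely on the non-obvious construction the in-code comments describe — the processor command is started in the background over an interactive SSH shell, and the PID of that background job is recovered from the interleaved shell output via `echo $!`. The following is a condensed, illustrative sketch of that pattern, not the actual deployer code; the helper name, marker string and log redirection details are assumptions for clarity.

from re import search as re_search

def start_detached_over_ssh(ssh_client, cmd: str, log_path: str) -> str:
    # Open an interactive shell and wrap its stdin/stdout as file-like objects,
    # mirroring what the deployer does above
    channel = ssh_client.invoke_shell()
    stdin, stdout = channel.makefile('wb'), channel.makefile('rb')
    # Start the processor in the background, redirecting its output to a log file
    stdin.write(f'{cmd} >> {log_path} 2>&1 &\n')
    # `$!` expands to the PID of the last background job; wrap it in a marker string
    # so it can be told apart from the rest of the shell output, then close the shell
    stdin.write('echo xyz$!xyz \n exit \n')
    output = stdout.read().decode('utf-8')
    stdout.close()
    stdin.close()
    match = re_search(r'xyz([0-9]+)xyz', output)
    if not match:
        raise RuntimeError(f'Could not determine the PID of: {cmd}')
    return match.group(1)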
diff --git a/ocrd_network/ocrd_network/param_validators.py b/ocrd_network/ocrd_network/param_validators.py index 8e46694516..87cfeee722 100644 --- a/ocrd_network/ocrd_network/param_validators.py +++ b/ocrd_network/ocrd_network/param_validators.py @@ -6,8 +6,8 @@ ) -class ProcessingServerParamType(ParamType): - name = 'Processing server string format' +class ServerAddressParamType(ParamType): + name = 'Server address string format' expected_format = 'host:port' def convert(self, value, param, ctx): From bf44190fabdfa69a373ea125dbc6c680609e2326 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Mon, 3 Apr 2023 11:10:59 +0200 Subject: [PATCH 14/34] return: deploy type as enum --- ocrd_network/ocrd_network/deployer.py | 30 +++++++++---------- ocrd_network/ocrd_network/deployment_utils.py | 11 ++++++- ocrd_network/ocrd_network/runtime_data.py | 25 ++++++++-------- .../processing_server_config.schema.yml | 2 +- 4 files changed, 39 insertions(+), 29 deletions(-) diff --git a/ocrd_network/ocrd_network/deployer.py b/ocrd_network/ocrd_network/deployer.py index c8302281ab..491e6a632d 100644 --- a/ocrd_network/ocrd_network/deployer.py +++ b/ocrd_network/ocrd_network/deployer.py @@ -16,6 +16,7 @@ from .deployment_utils import ( create_docker_client, + DeployType, wait_for_rabbitmq_availability ) @@ -78,16 +79,16 @@ def find_matching_processors( for data_host in self.data_hosts: if not server_only: for data_worker in data_host.data_workers: - if data_worker.deploy_type == 'native' and docker_only: + if data_worker.deploy_type == DeployType.NATIVE and docker_only: continue - if data_worker.deploy_type == 'docker' and native_only: + if data_worker.deploy_type == DeployType.DOCKER and native_only: continue matched_objects.append(data_worker) if not worker_only: for data_server in data_host.data_servers: - if data_server.deploy_type == 'native' and docker_only: + if data_server.deploy_type == DeployType.NATIVE and docker_only: continue - if data_server.deploy_type == 'docker' and native_only: + if data_server.deploy_type == DeployType.DOCKER and native_only: continue matched_objects.append(data_server) if str_names_only: @@ -167,7 +168,7 @@ def _deploy_processing_worker( f"name: '{data_worker.processor_name}', " f"address: '{host_data.address}'") - if data_worker.deploy_type == 'native': + if data_worker.deploy_type == DeployType.NATIVE: assert host_data.ssh_client # to satisfy mypy pid = self.start_native_processor( ssh_client=host_data.ssh_client, @@ -176,8 +177,7 @@ def _deploy_processing_worker( database_url=mongodb_url, ) data_worker.pid = pid - else: - assert data_worker.deploy_type == 'docker' + elif data_worker.deploy_type == DeployType.DOCKER: assert host_data.docker_client # to satisfy mypy pid = self.start_docker_processor( docker_client=host_data.docker_client, @@ -200,7 +200,7 @@ def _deploy_processor_server( f"name: '{data_server.processor_name}', " f"address: '{data_server.host}:{data_server.port}'") - if data_server.deploy_type == 'native': + if data_server.deploy_type == DeployType.NATIVE: assert host_data.ssh_client pid = self.start_native_processor_server( ssh_client=host_data.ssh_client, @@ -219,7 +219,7 @@ def _deploy_processor_server( host_data.server_ports[name] = [port] else: host_data.server_ports[data_server.processor_name] = [data_server.port] - else: + elif data_server.deploy_type == DeployType.DOCKER: raise Exception("Deploying docker processor server is not supported yet!") def deploy_rabbitmq( @@ -319,7 +319,7 @@ def deploy_mongodb( mongodb_hostinfo = 
f'{self.data_mongo.address}:{self.data_mongo.port}' self.log.info(f'The MongoDB was deployed on host: {mongodb_hostinfo}') - return mongodb_hostinfo + return self.data_mongo.url def kill_rabbitmq(self) -> None: if not self.data_queue.pid: @@ -359,7 +359,7 @@ def kill_hosts(self) -> None: host_data.create_client(client_type='ssh') assert host_data.ssh_client if host_data.needs_docker: - host_data.create_client(client_type='ssh') + host_data.create_client(client_type='docker') assert host_data.docker_client self.log.debug(f'Killing/Stopping processing workers on host: {host_data.address}') @@ -385,10 +385,10 @@ def kill_processing_workers(self, host_data: DataHost) -> None: for worker in host_data.data_workers: if not worker.pid: continue - if worker.deploy_type == 'native': + if worker.deploy_type == DeployType.NATIVE: host_data.ssh_client.exec_command(f'kill {worker.pid}') self.log.info(f"Stopped native worker with pid: '{worker.pid}'") - if worker.deploy_type == 'docker': + elif worker.deploy_type == DeployType.DOCKER: host_data.docker_client.containers.get(worker.pid).stop() self.log.info(f"Stopped docker worker with container id: '{worker.pid}'") host_data.data_workers = [] @@ -402,10 +402,10 @@ def kill_processor_servers(self, host_data: DataHost) -> None: for server in host_data.data_servers: if not server.pid: continue - if server.deploy_type == 'native': + if server.deploy_type == DeployType.NATIVE: host_data.ssh_client.exec_command(f'kill {server.pid}') self.log.info(f"Stopped native server with pid: '{server.pid}'") - if server.deploy_type == 'docker': + elif server.deploy_type == DeployType.DOCKER: host_data.docker_client.containers.get(server.pid).stop() self.log.info(f"Stopped docker server with container id: '{server.pid}'") host_data.data_servers = [] diff --git a/ocrd_network/ocrd_network/deployment_utils.py b/ocrd_network/ocrd_network/deployment_utils.py index 561aff8f95..9be063cb2c 100644 --- a/ocrd_network/ocrd_network/deployment_utils.py +++ b/ocrd_network/ocrd_network/deployment_utils.py @@ -1,4 +1,5 @@ from __future__ import annotations +from enum import Enum from docker import APIClient, DockerClient from docker.transport import SSHHTTPAdapter from paramiko import AutoAddPolicy, SSHClient @@ -9,6 +10,7 @@ __all__ = [ 'create_docker_client', 'create_ssh_client', + 'DeployType', 'wait_for_rabbitmq_availability' ] @@ -54,7 +56,7 @@ def __init__(self, user: str, host: str, **kwargs) -> None: # replacement for what the super-constructor does if not (user and host): raise ValueError('Missing argument: user and host must both be provided') - if bool('password' not in kwargs) != ('keypath' not in kwargs): + if ('password' not in kwargs) != ('keypath' not in kwargs): raise ValueError('Missing argument: one of password and keyfile is needed') self.api = APIClient(f'ssh://{host}', use_ssh_client=True, version='1.41') ssh_adapter = self.CustomSshHttpAdapter(f'ssh://{user}@{host}:22', **kwargs) @@ -100,3 +102,10 @@ def wait_for_rabbitmq_availability( # TODO: Disconnect the dummy_publisher here before returning... return raise RuntimeError('Error waiting for queue startup: timeout exceeded') + + +class DeployType(Enum): + """ Deploy-Type of the processing worker/processor server. 
+ """ + DOCKER = 1 + NATIVE = 2 diff --git a/ocrd_network/ocrd_network/runtime_data.py b/ocrd_network/ocrd_network/runtime_data.py index 9ffb304429..8986ddb613 100644 --- a/ocrd_network/ocrd_network/runtime_data.py +++ b/ocrd_network/ocrd_network/runtime_data.py @@ -3,7 +3,8 @@ from .deployment_utils import ( create_docker_client, - create_ssh_client + create_ssh_client, + DeployType ) __all__ = [ @@ -37,23 +38,23 @@ def __init__(self, config: Dict) -> None: for worker in config['workers']: name = worker['name'] count = worker['number_of_instance'] - deploy_type = worker['deploy_type'] - for _ in range(count): - self.data_workers.append(DataProcessingWorker(self.address, deploy_type, name)) - if not self.needs_ssh and deploy_type == 'native': + deploy_type = DeployType.DOCKER if worker['deploy_type'] == 'docker' else DeployType.NATIVE + if not self.needs_ssh and deploy_type == DeployType.NATIVE: self.needs_ssh = True - if not self.needs_docker and deploy_type == 'docker': + if not self.needs_docker and deploy_type == DeployType.DOCKER: self.needs_docker = True + for _ in range(count): + self.data_workers.append(DataProcessingWorker(self.address, deploy_type, name)) for server in config['servers']: name = server['name'] - deploy_type = server['deploy_type'] port = server['port'] - self.data_servers.append(DataProcessorServer(self.address, port, deploy_type, name)) - if not self.needs_ssh and deploy_type == 'native': + deploy_type = DeployType.DOCKER if server['deploy_type'] == 'docker' else DeployType.NATIVE + if not self.needs_ssh and deploy_type == DeployType.NATIVE: self.needs_ssh = True - if not self.needs_docker and deploy_type == 'docker': + if not self.needs_docker and deploy_type == DeployType.DOCKER: self.needs_docker = True + self.data_servers.append(DataProcessorServer(self.address, port, deploy_type, name)) # Key: processor_name, Value: list of ports self.server_ports: dict = {} @@ -75,7 +76,7 @@ def create_client(self, client_type: str): class DataProcessingWorker: - def __init__(self, host: str, deploy_type: str, processor_name: str) -> None: + def __init__(self, host: str, deploy_type: DeployType, processor_name: str) -> None: self.host = host self.deploy_type = deploy_type self.processor_name = processor_name @@ -84,7 +85,7 @@ def __init__(self, host: str, deploy_type: str, processor_name: str) -> None: class DataProcessorServer: - def __init__(self, host: str, port: int, deploy_type: str, processor_name: str) -> None: + def __init__(self, host: str, port: int, deploy_type: DeployType, processor_name: str) -> None: self.host = host self.port = port self.deploy_type = deploy_type diff --git a/ocrd_validators/ocrd_validators/processing_server_config.schema.yml b/ocrd_validators/ocrd_validators/processing_server_config.schema.yml index b2fe06cb49..c9722f0f37 100644 --- a/ocrd_validators/ocrd_validators/processing_server_config.schema.yml +++ b/ocrd_validators/ocrd_validators/processing_server_config.schema.yml @@ -123,7 +123,7 @@ properties: - ocrd-cis-ocropy-binarize - ocrd-olena-binarize deploy_type: - description: Should the processor server natively or with Docker + description: Should the processor server be deployed natively or with Docker type: string enum: - native From 863ae4a79c27b39d84ee36546bc880f0129e6f5e Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Mon, 3 Apr 2023 12:57:05 +0200 Subject: [PATCH 15/34] separate: creation/execution of job task --- ocrd_network/ocrd_network/processing_worker.py | 2 +- ocrd_network/ocrd_network/processor_server.py | 15 
+++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/ocrd_network/ocrd_network/processing_worker.py b/ocrd_network/ocrd_network/processing_worker.py index 74d21b1f92..5634c08b04 100644 --- a/ocrd_network/ocrd_network/processing_worker.py +++ b/ocrd_network/ocrd_network/processing_worker.py @@ -217,7 +217,7 @@ def process_message(self, processing_message: OcrdProcessingMessage) -> None: self.log.debug(f"processor_name: {self.processor_name}, path_to_mets: {path_to_mets}, " f"input_grps: {input_file_grps}, output_file_grps: {output_file_grps}, " f"page_id: {page_id}, parameters: {parameters}") - self.log.error(error) + self.log.exception(error) execution_failed = True end_time = datetime.now() exec_duration = calculate_execution_time(start_time, end_time) diff --git a/ocrd_network/ocrd_network/processor_server.py b/ocrd_network/ocrd_network/processor_server.py index 9d8cf1af12..30b049264e 100644 --- a/ocrd_network/ocrd_network/processor_server.py +++ b/ocrd_network/ocrd_network/processor_server.py @@ -134,6 +134,15 @@ async def create_processor_job_task(self, job_input: PYJobInput, background_task ) await job.insert() + background_tasks.add_task( + self.processor_job_task, + job_id=job_id, + job=job + ) + + return job.to_job_output() + + async def processor_job_task(self, job_id: str, job: DBProcessorJob): execution_failed = False start_time = datetime.now() await db_update_processing_job( @@ -142,8 +151,7 @@ async def create_processor_job_task(self, job_input: PYJobInput, background_task start_time=start_time ) try: - background_tasks.add_task( - run_single_execution, + run_single_execution( ProcessorClass=self.ProcessorClass, executable=self.processor_name, abs_path_to_mets=job.path_to_mets, @@ -156,7 +164,7 @@ async def create_processor_job_task(self, job_input: PYJobInput, background_task self.log.debug(f"processor_name: {self.processor_name}, path_to_mets: {job.path_to_mets}, " f"input_grps: {job.input_file_grps}, output_file_grps: {job.output_file_grps}, " f"page_id: {job.page_id}, parameters: {job.parameters}") - self.log.error(error) + self.log.exception(error) execution_failed = True end_time = datetime.now() exec_duration = calculate_execution_time(start_time, end_time) @@ -167,7 +175,6 @@ async def create_processor_job_task(self, job_input: PYJobInput, background_task end_time=end_time, exec_time=f'{exec_duration} ms' ) - return job.to_job_output() def get_ocrd_tool(self): if self.ocrd_tool: From 8e59e1cf6dce5bfdaf34bd73227f1daff7dcfa17 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Mon, 3 Apr 2023 13:09:30 +0200 Subject: [PATCH 16/34] add background tasks caveat --- ocrd_network/ocrd_network/processor_server.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ocrd_network/ocrd_network/processor_server.py b/ocrd_network/ocrd_network/processor_server.py index 30b049264e..b980ad11c4 100644 --- a/ocrd_network/ocrd_network/processor_server.py +++ b/ocrd_network/ocrd_network/processor_server.py @@ -134,6 +134,16 @@ async def create_processor_job_task(self, job_input: PYJobInput, background_task ) await job.insert() + # TODO: Background tasks solution was just adopted from #884, + # but seems to not suit what we are trying to achieve... + # However, using Celery or RabbitMQ, takes away the point + # of having REST API Processor Server, or does it? 
+ + # FastAPI Caveat: If you need to perform heavy background + # computation and you don't necessarily need it to be run + # by the same process (for example, you don't need to share + # memory, variables, etc), you might benefit from using + # other bigger tools like Celery. background_tasks.add_task( self.processor_job_task, job_id=job_id, From dbf1af6614782d8b5dc4f56ca54c6a36a75f4da6 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Mon, 3 Apr 2023 13:36:05 +0200 Subject: [PATCH 17/34] change tool/version read --- ocrd_network/ocrd_network/processor_server.py | 40 ++++++++++++++----- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/ocrd_network/ocrd_network/processor_server.py b/ocrd_network/ocrd_network/processor_server.py index b980ad11c4..eea6bc1ac8 100644 --- a/ocrd_network/ocrd_network/processor_server.py +++ b/ocrd_network/ocrd_network/processor_server.py @@ -6,7 +6,11 @@ from fastapi import FastAPI, HTTPException, status, BackgroundTasks -from ocrd_utils import getLogger, get_ocrd_tool_json +from ocrd_utils import ( + get_ocrd_tool_json, + getLogger, + parse_json_string_with_comments, +) from .database import ( DBProcessorJob, db_update_processing_job, @@ -144,6 +148,9 @@ async def create_processor_job_task(self, job_input: PYJobInput, background_task # by the same process (for example, you don't need to share # memory, variables, etc), you might benefit from using # other bigger tools like Celery. + + # Check here as well: + # 1) https://github.com/tiangolo/fastapi/discussions/8666 background_tasks.add_task( self.processor_job_task, job_id=job_id, @@ -190,7 +197,16 @@ def get_ocrd_tool(self): if self.ocrd_tool: return self.ocrd_tool if self.ProcessorClass: - ocrd_tool = self.ProcessorClass(workspace=None, version=True).ocrd_tool + # The way of accessing ocrd tool like in the line below may be problematic + # ocrd_tool = self.ProcessorClass(workspace=None, version=True).ocrd_tool + ocrd_tool = parse_json_string_with_comments( + run( + [self.processor_name, '--dump-json'], + stdout=PIPE, + check=True, + universal_newlines=True + ).stdout + ) else: ocrd_tool = get_ocrd_tool_json(self.processor_name) return ocrd_tool @@ -198,15 +214,19 @@ def get_ocrd_tool(self): def get_version(self) -> str: if self.version: return self.version + + """ if self.ProcessorClass: - version_str = self.ProcessorClass(workspace=None, version=True).version - else: - version_str = run( - [self.processor_name, '--version'], - stdout=PIPE, - check=True, - universal_newlines=True - ).stdout + # The way of accessing the version like in the line below may be problematic + # version_str = self.ProcessorClass(workspace=None, version=True).version + return version_str + """ + version_str = run( + [self.processor_name, '--version'], + stdout=PIPE, + check=True, + universal_newlines=True + ).stdout return version_str def run_server(self, host, port, access_log=False): From c31c5b76517789c6c432d063c31d11f29d5e77b9 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Mon, 3 Apr 2023 14:01:31 +0200 Subject: [PATCH 18/34] combine network cli under a group --- ocrd/ocrd/cli/__init__.py | 8 +-- ocrd/ocrd/cli/network.py | 132 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 134 insertions(+), 6 deletions(-) create mode 100644 ocrd/ocrd/cli/network.py diff --git a/ocrd/ocrd/cli/__init__.py b/ocrd/ocrd/cli/__init__.py index 5d00706404..cc75f63108 100644 --- a/ocrd/ocrd/cli/__init__.py +++ b/ocrd/ocrd/cli/__init__.py @@ -31,9 +31,7 @@ def get_help(self, ctx): from ocrd.decorators import ocrd_loglevel from .zip 
import zip_cli from .log import log_cli -from .processing_server import processing_server_cli -from .processing_worker import processing_worker_cli -from .processor_server import processor_server_cli +from .network import network_cli @click.group() @@ -52,6 +50,4 @@ def cli(**kwargs): # pylint: disable=unused-argument cli.add_command(validate_cli) cli.add_command(log_cli) cli.add_command(resmgr_cli) -cli.add_command(processing_server_cli) -cli.add_command(processing_worker_cli) -cli.add_command(processor_server_cli) +cli.add_command(network_cli) diff --git a/ocrd/ocrd/cli/network.py b/ocrd/ocrd/cli/network.py new file mode 100644 index 0000000000..e2b373f64c --- /dev/null +++ b/ocrd/ocrd/cli/network.py @@ -0,0 +1,132 @@ +""" +OCR-D CLI: management of network components + +.. click:: ocrd.cli.network:network_cli + :prog: ocrd network + :nested: full +""" + +import click +import logging +from ocrd_utils import ( + initLogging, + get_ocrd_tool_json +) +from ocrd_network import ( + DatabaseParamType, + ProcessingServer, + ProcessingWorker, + ProcessorServer, + ServerAddressParamType, + QueueServerParamType, +) + + +@click.group("network") +def network_cli(): + """ + Managing network components + """ + initLogging() + # TODO: Remove after the logging fix in core + logging.getLogger('paramiko.transport').setLevel(logging.INFO) + logging.getLogger('ocrd.network').setLevel(logging.DEBUG) + + +@network_cli.command('processing-server') +@click.argument('path_to_config', required=True, type=click.STRING) +@click.option('-a', '--address', + default="localhost:8080", + help='The URL of the Processing server, format: host:port', + type=ServerAddressParamType(), + required=True) +def processing_server_cli(path_to_config, address: str): + """ + Start the Processing Server + (proxy between the user and the + Processing Worker(s) / Processor Server(s)) + """ + + # Note, the address is already validated with the type field + host, port = address.split(':') + processing_server = ProcessingServer(path_to_config, host, port) + processing_server.start() + + +@network_cli.command('processor-server') +@click.argument('processor_name', required=True, type=click.STRING) +@click.option('-a', '--address', + help='The URL of the processor server, format: host:port', + type=ServerAddressParamType(), + required=True) +@click.option('-d', '--database', + default="mongodb://localhost:27018", + help='The URL of the MongoDB, format: mongodb://host:port', + type=DatabaseParamType(), + required=True) +def processor_server_cli(processor_name: str, address: str, database: str): + """ + Start Processor Server + (standalone REST API OCR-D processor) + """ + initLogging() + # TODO: Remove before the release + logging.getLogger('ocrd.network').setLevel(logging.DEBUG) + + try: + # TODO: Better validate that inside the ProcessorServer itself + host, port = address.split(':') + processor_server = ProcessorServer( + mongodb_addr=database, + processor_name=processor_name, + processor_class=None, # For readability purposes assigned here + ) + processor_server.run_server(host=host, port=int(port)) + except Exception as e: + raise Exception("Processor server has failed with error") from e + + +@network_cli.command('processing-worker') +@click.argument('processor_name', required=True, type=click.STRING) +@click.option('-q', '--queue', + default="amqp://admin:admin@localhost:5672/", + help='The URL of the Queue Server, format: amqp://username:password@host:port/vhost', + type=QueueServerParamType(), + required=True) +@click.option('-d', 
'--database', + default="mongodb://localhost:27018", + help='The URL of the MongoDB, format: mongodb://host:port', + type=DatabaseParamType(), + required=True) +def processing_worker_cli(processor_name: str, queue: str, database: str): + """ + Start Processing Worker + (a specific ocr-d processor consuming tasks from RabbitMQ queue) + """ + initLogging() + # TODO: Remove before the release + logging.getLogger('ocrd.network').setLevel(logging.DEBUG) + + # Get the ocrd_tool dictionary + # ocrd_tool = parse_json_string_with_comments( + # run([processor_name, '--dump-json'], stdout=PIPE, check=True, universal_newlines=True).stdout + # ) + + ocrd_tool = get_ocrd_tool_json(processor_name) + if not ocrd_tool: + raise Exception(f"The ocrd_tool is empty or missing") + + try: + processing_worker = ProcessingWorker( + rabbitmq_addr=queue, + mongodb_addr=database, + processor_name=ocrd_tool['executable'], + ocrd_tool=ocrd_tool, + processor_class=None, # For readability purposes assigned here + ) + # The RMQConsumer is initialized and a connection to the RabbitMQ is performed + processing_worker.connect_consumer() + # Start consuming from the queue with name `processor_name` + processing_worker.start_consuming() + except Exception as e: + raise Exception("Processing worker has failed with error") from e From 93482d39e50ded9046cae09e0bc48969ab8d28e4 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Mon, 3 Apr 2023 15:11:42 +0200 Subject: [PATCH 19/34] remove unnecessary files --- ocrd/ocrd/cli/processing_server.py | 41 ------------------- ocrd/ocrd/cli/processing_worker.py | 63 ------------------------------ ocrd/ocrd/cli/processor_server.py | 47 ---------------------- 3 files changed, 151 deletions(-) delete mode 100644 ocrd/ocrd/cli/processing_server.py delete mode 100644 ocrd/ocrd/cli/processing_worker.py delete mode 100644 ocrd/ocrd/cli/processor_server.py diff --git a/ocrd/ocrd/cli/processing_server.py b/ocrd/ocrd/cli/processing_server.py deleted file mode 100644 index 3f1d50fae9..0000000000 --- a/ocrd/ocrd/cli/processing_server.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -OCR-D CLI: start the processing server - -.. click:: ocrd.cli.processing_server:processing_server_cli - :prog: ocrd processing-server - :nested: full -""" -import click -import logging -from ocrd_utils import initLogging -from ocrd_network import ( - ProcessingServer, - ServerAddressParamType, -) - - -@click.command('processing-server') -@click.argument('path_to_config', required=True, type=click.STRING) -@click.option('-a', '--address', - default="localhost:8080", - help='The URL of the Processing server, format: host:port', - type=ServerAddressParamType(), - required=True) -def processing_server_cli(path_to_config, address: str): - """ - Start and manage processing workers with the processing server - - PATH_TO_CONFIG is a yaml file to configure the server and the workers. 
See - https://github.com/OCR-D/spec/pull/222/files#diff-a71bf71cbc7d9ce94fded977f7544aba4df9e7bdb8fc0cf1014e14eb67a9b273 - for further information (TODO: update path when spec is available/merged) - - """ - initLogging() - # TODO: Remove before the release - logging.getLogger('paramiko.transport').setLevel(logging.INFO) - logging.getLogger('ocrd.network').setLevel(logging.DEBUG) - - # Note, the address is already validated with the type field - host, port = address.split(':') - processing_server = ProcessingServer(path_to_config, host, port) - processing_server.start() diff --git a/ocrd/ocrd/cli/processing_worker.py b/ocrd/ocrd/cli/processing_worker.py deleted file mode 100644 index f9b5c8f1bc..0000000000 --- a/ocrd/ocrd/cli/processing_worker.py +++ /dev/null @@ -1,63 +0,0 @@ -""" -OCR-D CLI: start the processing worker - -.. click:: ocrd.cli.processing_worker:processing_worker_cli - :prog: ocrd processing-worker - :nested: full -""" -import click -import logging -from ocrd_utils import ( - initLogging, - get_ocrd_tool_json -) -from ocrd_network import ( - DatabaseParamType, - ProcessingWorker, - QueueServerParamType, -) - - -@click.command('processing-worker') -@click.argument('processor_name', required=True, type=click.STRING) -@click.option('-q', '--queue', - default="amqp://admin:admin@localhost:5672/", - help='The URL of the Queue Server, format: amqp://username:password@host:port/vhost', - type=QueueServerParamType(), - required=True) -@click.option('-d', '--database', - default="mongodb://localhost:27018", - help='The URL of the MongoDB, format: mongodb://host:port', - type=DatabaseParamType(), - required=True) -def processing_worker_cli(processor_name: str, queue: str, database: str): - """ - Start a processing worker (a specific ocr-d processor) - """ - initLogging() - # TODO: Remove before the release - logging.getLogger('ocrd.network').setLevel(logging.DEBUG) - - # Get the ocrd_tool dictionary - # ocrd_tool = parse_json_string_with_comments( - # run([processor_name, '--dump-json'], stdout=PIPE, check=True, universal_newlines=True).stdout - # ) - - ocrd_tool = get_ocrd_tool_json(processor_name) - if not ocrd_tool: - raise Exception(f"The ocrd_tool is empty or missing") - - try: - processing_worker = ProcessingWorker( - rabbitmq_addr=queue, - mongodb_addr=database, - processor_name=ocrd_tool['executable'], - ocrd_tool=ocrd_tool, - processor_class=None, # For readability purposes assigned here - ) - # The RMQConsumer is initialized and a connection to the RabbitMQ is performed - processing_worker.connect_consumer() - # Start consuming from the queue with name `processor_name` - processing_worker.start_consuming() - except Exception as e: - raise Exception("Processing worker has failed with error") from e diff --git a/ocrd/ocrd/cli/processor_server.py b/ocrd/ocrd/cli/processor_server.py deleted file mode 100644 index ef45f61aa1..0000000000 --- a/ocrd/ocrd/cli/processor_server.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -OCR-D CLI: start the processor server - -.. 
click:: ocrd.cli.processor_server:processor_server_cli - :prog: ocrd processor-server - :nested: full -""" -import click -import logging -from ocrd_utils import initLogging -from ocrd_network import ( - DatabaseParamType, - ServerAddressParamType, - ProcessorServer, -) - - -@click.command('processor-server') -@click.argument('processor_name', required=True, type=click.STRING) -@click.option('-a', '--address', - help='The URL of the processor server, format: host:port', - type=ServerAddressParamType(), - required=True) -@click.option('-d', '--database', - default="mongodb://localhost:27018", - help='The URL of the MongoDB, format: mongodb://host:port', - type=DatabaseParamType(), - required=True) -def processor_server_cli(processor_name: str, address: str, database: str): - """ - Start ocr-d processor as a server - """ - initLogging() - # TODO: Remove before the release - logging.getLogger('ocrd.network').setLevel(logging.DEBUG) - - try: - # TODO: Better validate that inside the ProcessorServer itself - host, port = address.split(':') - processor_server = ProcessorServer( - mongodb_addr=database, - processor_name=processor_name, - processor_class=None, # For readability purposes assigned here - ) - processor_server.run_server(host=host, port=int(port)) - except Exception as e: - raise Exception("Processor server has failed with error") from e From f5c34f991bcb5c9c04d405230054ff26d79ac3ed Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Mon, 3 Apr 2023 16:20:16 +0200 Subject: [PATCH 20/34] clean network cli --- ocrd/ocrd/cli/network.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/ocrd/ocrd/cli/network.py b/ocrd/ocrd/cli/network.py index e2b373f64c..00e3affe8e 100644 --- a/ocrd/ocrd/cli/network.py +++ b/ocrd/ocrd/cli/network.py @@ -69,10 +69,6 @@ def processor_server_cli(processor_name: str, address: str, database: str): Start Processor Server (standalone REST API OCR-D processor) """ - initLogging() - # TODO: Remove before the release - logging.getLogger('ocrd.network').setLevel(logging.DEBUG) - try: # TODO: Better validate that inside the ProcessorServer itself host, port = address.split(':') @@ -103,9 +99,6 @@ def processing_worker_cli(processor_name: str, queue: str, database: str): Start Processing Worker (a specific ocr-d processor consuming tasks from RabbitMQ queue) """ - initLogging() - # TODO: Remove before the release - logging.getLogger('ocrd.network').setLevel(logging.DEBUG) # Get the ocrd_tool dictionary # ocrd_tool = parse_json_string_with_comments( From 7bf5e78fd32644cf57d880da93a3c548d6685524 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Tue, 4 Apr 2023 13:03:39 +0200 Subject: [PATCH 21/34] abstract network cli in ocrd_network --- ocrd/ocrd/cli/network.py | 113 ++---------------- ocrd_network/ocrd_network/__init__.py | 1 + ocrd_network/ocrd_network/cli/__init__.py | 11 ++ ocrd_network/ocrd_network/cli/client.py | 28 +++++ .../ocrd_network/cli/processing_server.py | 25 ++++ .../ocrd_network/cli/processing_worker.py | 51 ++++++++ .../ocrd_network/cli/processor_server.py | 35 ++++++ 7 files changed, 162 insertions(+), 102 deletions(-) create mode 100644 ocrd_network/ocrd_network/cli/__init__.py create mode 100644 ocrd_network/ocrd_network/cli/client.py create mode 100644 ocrd_network/ocrd_network/cli/processing_server.py create mode 100644 ocrd_network/ocrd_network/cli/processing_worker.py create mode 100644 ocrd_network/ocrd_network/cli/processor_server.py diff --git a/ocrd/ocrd/cli/network.py b/ocrd/ocrd/cli/network.py index 00e3affe8e..f20f965851 100644 --- 
a/ocrd/ocrd/cli/network.py +++ b/ocrd/ocrd/cli/network.py @@ -8,17 +8,12 @@ import click import logging -from ocrd_utils import ( - initLogging, - get_ocrd_tool_json -) -from ocrd_network import ( - DatabaseParamType, - ProcessingServer, - ProcessingWorker, - ProcessorServer, - ServerAddressParamType, - QueueServerParamType, +from ocrd_utils import initLogging +from ocrd_network.cli import ( + client_cli, + processing_server_cli, + processing_worker_cli, + processor_server_cli, ) @@ -30,96 +25,10 @@ def network_cli(): initLogging() # TODO: Remove after the logging fix in core logging.getLogger('paramiko.transport').setLevel(logging.INFO) - logging.getLogger('ocrd.network').setLevel(logging.DEBUG) - - -@network_cli.command('processing-server') -@click.argument('path_to_config', required=True, type=click.STRING) -@click.option('-a', '--address', - default="localhost:8080", - help='The URL of the Processing server, format: host:port', - type=ServerAddressParamType(), - required=True) -def processing_server_cli(path_to_config, address: str): - """ - Start the Processing Server - (proxy between the user and the - Processing Worker(s) / Processor Server(s)) - """ - - # Note, the address is already validated with the type field - host, port = address.split(':') - processing_server = ProcessingServer(path_to_config, host, port) - processing_server.start() - - -@network_cli.command('processor-server') -@click.argument('processor_name', required=True, type=click.STRING) -@click.option('-a', '--address', - help='The URL of the processor server, format: host:port', - type=ServerAddressParamType(), - required=True) -@click.option('-d', '--database', - default="mongodb://localhost:27018", - help='The URL of the MongoDB, format: mongodb://host:port', - type=DatabaseParamType(), - required=True) -def processor_server_cli(processor_name: str, address: str, database: str): - """ - Start Processor Server - (standalone REST API OCR-D processor) - """ - try: - # TODO: Better validate that inside the ProcessorServer itself - host, port = address.split(':') - processor_server = ProcessorServer( - mongodb_addr=database, - processor_name=processor_name, - processor_class=None, # For readability purposes assigned here - ) - processor_server.run_server(host=host, port=int(port)) - except Exception as e: - raise Exception("Processor server has failed with error") from e - - -@network_cli.command('processing-worker') -@click.argument('processor_name', required=True, type=click.STRING) -@click.option('-q', '--queue', - default="amqp://admin:admin@localhost:5672/", - help='The URL of the Queue Server, format: amqp://username:password@host:port/vhost', - type=QueueServerParamType(), - required=True) -@click.option('-d', '--database', - default="mongodb://localhost:27018", - help='The URL of the MongoDB, format: mongodb://host:port', - type=DatabaseParamType(), - required=True) -def processing_worker_cli(processor_name: str, queue: str, database: str): - """ - Start Processing Worker - (a specific ocr-d processor consuming tasks from RabbitMQ queue) - """ - - # Get the ocrd_tool dictionary - # ocrd_tool = parse_json_string_with_comments( - # run([processor_name, '--dump-json'], stdout=PIPE, check=True, universal_newlines=True).stdout - # ) + logging.getLogger('ocrd_network').setLevel(logging.DEBUG) - ocrd_tool = get_ocrd_tool_json(processor_name) - if not ocrd_tool: - raise Exception(f"The ocrd_tool is empty or missing") - try: - processing_worker = ProcessingWorker( - rabbitmq_addr=queue, - mongodb_addr=database, - 
processor_name=ocrd_tool['executable'], - ocrd_tool=ocrd_tool, - processor_class=None, # For readability purposes assigned here - ) - # The RMQConsumer is initialized and a connection to the RabbitMQ is performed - processing_worker.connect_consumer() - # Start consuming from the queue with name `processor_name` - processing_worker.start_consuming() - except Exception as e: - raise Exception("Processing worker has failed with error") from e +network_cli.add_command(client_cli) +network_cli.add_command(processing_server_cli) +network_cli.add_command(processing_worker_cli) +network_cli.add_command(processor_server_cli) diff --git a/ocrd_network/ocrd_network/__init__.py b/ocrd_network/ocrd_network/__init__.py index 840864fed8..aaeeba7fcf 100644 --- a/ocrd_network/ocrd_network/__init__.py +++ b/ocrd_network/ocrd_network/__init__.py @@ -22,6 +22,7 @@ # Note: The Mets Server is still not placed on the architecture diagram and probably won't be a part of # the network package. The reason, Mets Server is tightly coupled with the `OcrdWorkspace`. +from .client import Client from .processing_server import ProcessingServer from .processing_worker import ProcessingWorker from .processor_server import ProcessorServer diff --git a/ocrd_network/ocrd_network/cli/__init__.py b/ocrd_network/ocrd_network/cli/__init__.py new file mode 100644 index 0000000000..1704b2aaf7 --- /dev/null +++ b/ocrd_network/ocrd_network/cli/__init__.py @@ -0,0 +1,11 @@ +from .client import client_cli +from .processing_server import processing_server_cli +from .processing_worker import processing_worker_cli +from .processor_server import processor_server_cli + +__all__ = [ + 'client_cli', + 'processing_server_cli', + 'processing_worker_cli', + 'processor_server_cli' +] diff --git a/ocrd_network/ocrd_network/cli/client.py b/ocrd_network/ocrd_network/cli/client.py new file mode 100644 index 0000000000..d3302177de --- /dev/null +++ b/ocrd_network/ocrd_network/cli/client.py @@ -0,0 +1,28 @@ +import click + + +@click.group('client') +def client_cli(): + """ + A client for interacting with the network modules + """ + + +@client_cli.group('discovery') +def client_cli_discovery(): + pass + + +@client_cli.group('workflow') +def client_cli_workflow(): + pass + + +@client_cli.group('workspace') +def client_cli_workspace(): + pass + + +@client_cli.group('processing') +def client_cli_processing(): + pass diff --git a/ocrd_network/ocrd_network/cli/processing_server.py b/ocrd_network/ocrd_network/cli/processing_server.py new file mode 100644 index 0000000000..cf2aacab4a --- /dev/null +++ b/ocrd_network/ocrd_network/cli/processing_server.py @@ -0,0 +1,25 @@ +import click +from .. 
import ( + ProcessingServer, + ServerAddressParamType +) + + +@click.command('processing-server') +@click.argument('path_to_config', required=True, type=click.STRING) +@click.option('-a', '--address', + default="localhost:8080", + help='The URL of the Processing server, format: host:port', + type=ServerAddressParamType(), + required=True) +def processing_server_cli(path_to_config, address: str): + """ + Start the Processing Server + (proxy between the user and the + Processing Worker(s) / Processor Server(s)) + """ + + # Note, the address is already validated with the type field + host, port = address.split(':') + processing_server = ProcessingServer(path_to_config, host, port) + processing_server.start() diff --git a/ocrd_network/ocrd_network/cli/processing_worker.py b/ocrd_network/ocrd_network/cli/processing_worker.py new file mode 100644 index 0000000000..1cc3377382 --- /dev/null +++ b/ocrd_network/ocrd_network/cli/processing_worker.py @@ -0,0 +1,51 @@ +import click +from ocrd_utils import get_ocrd_tool_json + +from .. import ( + DatabaseParamType, + ProcessingWorker, + QueueServerParamType +) + + +@click.command('processing-worker') +@click.argument('processor_name', required=True, type=click.STRING) +@click.option('-q', '--queue', + default="amqp://admin:admin@localhost:5672/", + help='The URL of the Queue Server, format: amqp://username:password@host:port/vhost', + type=QueueServerParamType(), + required=True) +@click.option('-d', '--database', + default="mongodb://localhost:27018", + help='The URL of the MongoDB, format: mongodb://host:port', + type=DatabaseParamType(), + required=True) +def processing_worker_cli(processor_name: str, queue: str, database: str): + """ + Start Processing Worker + (a specific ocr-d processor consuming tasks from RabbitMQ queue) + """ + + # Get the ocrd_tool dictionary + # ocrd_tool = parse_json_string_with_comments( + # run([processor_name, '--dump-json'], stdout=PIPE, check=True, universal_newlines=True).stdout + # ) + + ocrd_tool = get_ocrd_tool_json(processor_name) + if not ocrd_tool: + raise Exception(f"The ocrd_tool is empty or missing") + + try: + processing_worker = ProcessingWorker( + rabbitmq_addr=queue, + mongodb_addr=database, + processor_name=ocrd_tool['executable'], + ocrd_tool=ocrd_tool, + processor_class=None, # For readability purposes assigned here + ) + # The RMQConsumer is initialized and a connection to the RabbitMQ is performed + processing_worker.connect_consumer() + # Start consuming from the queue with name `processor_name` + processing_worker.start_consuming() + except Exception as e: + raise Exception("Processing worker has failed with error") from e diff --git a/ocrd_network/ocrd_network/cli/processor_server.py b/ocrd_network/ocrd_network/cli/processor_server.py new file mode 100644 index 0000000000..534a9a0fed --- /dev/null +++ b/ocrd_network/ocrd_network/cli/processor_server.py @@ -0,0 +1,35 @@ +import click +from .. 
import ( + DatabaseParamType, + ProcessorServer, + ServerAddressParamType +) + + +@click.command('processor-server') +@click.argument('processor_name', required=True, type=click.STRING) +@click.option('-a', '--address', + help='The URL of the processor server, format: host:port', + type=ServerAddressParamType(), + required=True) +@click.option('-d', '--database', + default="mongodb://localhost:27018", + help='The URL of the MongoDB, format: mongodb://host:port', + type=DatabaseParamType(), + required=True) +def processor_server_cli(processor_name: str, address: str, database: str): + """ + Start Processor Server + (standalone REST API OCR-D processor) + """ + try: + # TODO: Better validate that inside the ProcessorServer itself + host, port = address.split(':') + processor_server = ProcessorServer( + mongodb_addr=database, + processor_name=processor_name, + processor_class=None, # For readability purposes assigned here + ) + processor_server.run_server(host=host, port=int(port)) + except Exception as e: + raise Exception("Processor server has failed with error") from e From 5ee190353498eb7ab63e576866269cbdc9379a93 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Tue, 4 Apr 2023 15:41:55 +0200 Subject: [PATCH 22/34] implement skeleton for client --- ocrd_network/ocrd_network/cli/client.py | 89 ++++++++++++++++++++++--- ocrd_network/ocrd_network/client.py | 37 ++++++++++ ocrd_network/ocrd_network/models/job.py | 2 +- 3 files changed, 119 insertions(+), 9 deletions(-) create mode 100644 ocrd_network/ocrd_network/client.py diff --git a/ocrd_network/ocrd_network/cli/client.py b/ocrd_network/ocrd_network/cli/client.py index d3302177de..0af0709924 100644 --- a/ocrd_network/ocrd_network/cli/client.py +++ b/ocrd_network/ocrd_network/cli/client.py @@ -1,28 +1,101 @@ import click +from typing import Optional + +from ocrd.decorators import ( + parameter_option, + parameter_override_option +) +from ocrd_network import Client @click.group('client') def client_cli(): """ - A client for interacting with the network modules + A client for interacting with the network modules. 
+ The client CLI mimics the WebAPI endpoints """ + pass @client_cli.group('discovery') -def client_cli_discovery(): +def discovery_cli(): + """ + The discovery endpoint of the WebAPI + """ pass -@client_cli.group('workflow') -def client_cli_workflow(): +@client_cli.group('processing') +def processing_cli(): + """ + The processing endpoint of the WebAPI + """ pass -@client_cli.group('workspace') -def client_cli_workspace(): +@processing_cli.command('processor') +@click.argument('processor_name', required=True, type=click.STRING) +@click.option('--address') +@click.option('-m', '--mets', required=True, default="mets.xml") +@click.option('-I', '--input-file-grp', default='OCR-D-INPUT') +@click.option('-O', '--output-file-grp', default='OCR-D-OUTPUT') +@click.option('-g', '--page-id') +@parameter_option +@click.option('--result-queue-name') +@click.option('--callback-url') +@click.option('--agent-type', default='worker') +def send_processing_request( + address: Optional[str], + processor_name: str, + mets: str, + input_file_grp: str, + output_file_grp: Optional[str], + page_id: Optional[str], + parameter: Optional[dict], + result_queue_name: Optional[str], + callback_url: Optional[str], + # TODO: This is temporally available to toggle + # between the ProcessingWorker/ProcessorServer + agent_type: Optional[str] +): + req_params = { + "path_to_mets": mets, + "description": "OCR-D Network client request", + "input_file_grps": input_file_grp.split(','), + "parameters": parameter if parameter else {}, + "agent_type": agent_type, + } + if output_file_grp: + req_params["output_file_grps"] = output_file_grp.split(',') + if page_id: + req_params["page_id"] = page_id + if result_queue_name: + req_params["result_queue_name"] = result_queue_name + if callback_url: + req_params["callback_url"] = callback_url + + client = Client( + server_addr_processing=address + ) + response = client.send_processing_request( + processor_name=processor_name, + req_params=req_params + ) + processing_job_id = response.get('job_id', None) + print(f"Processing job id: {processing_job_id}") + + +@client_cli.group('workflow') +def workflow_cli(): + """ + The workflow endpoint of the WebAPI + """ pass -@client_cli.group('processing') -def client_cli_processing(): +@client_cli.group('workspace') +def workspace_cli(): + """ + The workspace endpoint of the WebAPI + """ pass diff --git a/ocrd_network/ocrd_network/client.py b/ocrd_network/ocrd_network/client.py new file mode 100644 index 0000000000..d237908f70 --- /dev/null +++ b/ocrd_network/ocrd_network/client.py @@ -0,0 +1,37 @@ +import json +from os import environ +import requests + + +# TODO: This is just a conceptual implementation and first try to +# trigger further discussions on how this should look like. 
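As a usage illustration for the conceptual Client defined below (and for the `client processing processor` command above): assuming a Processing Server is reachable at http://localhost:8080 and a processor such as ocrd-dummy is deployed, a request could be sent as sketched here. This is a hedged example, not an official API sample; the server address and processor name are assumptions, and the payload keys mirror the req_params assembled by the CLI above.

from ocrd_network import Client

# Sketch only: address and processor name are placeholders
client = Client(server_addr_processing='http://localhost:8080')
response = client.send_processing_request(
    processor_name='ocrd-dummy',
    req_params={
        'path_to_mets': '/path/to/mets.xml',
        'description': 'OCR-D Network client request',
        'input_file_grps': ['OCR-D-INPUT'],
        'output_file_grps': ['OCR-D-OUTPUT'],
        'parameters': {},
        'agent_type': 'worker',
    }
)
# The Processing Server is expected to answer with the created job's id
print(response.get('job_id', None))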
+class Client: + def __init__( + self, + server_addr_processing: str = environ.get('OCRD_NETWORK_SERVER_ADDR_PROCESSING', ''), + server_addr_workflow: str = environ.get('OCRD_NETWORK_SERVER_ADDR_WORKFLOW', ''), + server_addr_workspace: str = environ.get('OCRD_NETWORK_SERVER_ADDR_WORKSPACE', ''), + ): + self.server_addr_processing = server_addr_processing + self.server_addr_workflow = server_addr_workflow + self.server_addr_workspace = server_addr_workspace + + def send_processing_request(self, processor_name: str, req_params: dict): + verify_server_protocol(self.server_addr_processing) + req_url = f'{self.server_addr_processing}/processor/{processor_name}' + req_headers = {"Content-Type": "application/json; charset=utf-8"} + req_json = json.loads(json.dumps(req_params)) + + print(f'Sending processing request to: {req_url}') + response = requests.post(url=req_url, headers=req_headers, json=req_json) + return response.json() + + +def verify_server_protocol(address: str): + protocol_matched = False + for protocol in ['http://', 'https://']: + if address.startswith(protocol): + protocol_matched = True + break + if not protocol_matched: + raise ValueError(f'Wrong/Missing protocol in the server address: {address}') diff --git a/ocrd_network/ocrd_network/models/job.py b/ocrd_network/ocrd_network/models/job.py index 1ccc82fe9b..aa50e6aad8 100644 --- a/ocrd_network/ocrd_network/models/job.py +++ b/ocrd_network/ocrd_network/models/job.py @@ -32,7 +32,7 @@ class PYJobInput(BaseModel): class Config: schema_extra = { 'example': { - 'path': '/path/to/mets.xml', + 'path_to_mets': '/path/to/mets.xml', 'description': 'The description of this execution', 'input_file_grps': ['INPUT_FILE_GROUP'], 'output_file_grps': ['OUTPUT_FILE_GROUP'], From 845bc09f5b1e2299af30a18b456b47cd68dd35db Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Tue, 4 Apr 2023 16:33:51 +0200 Subject: [PATCH 23/34] refactor: naming --- ocrd_network/ocrd_network/process_helpers.py | 10 ++++---- .../ocrd_network/processing_server.py | 2 +- .../ocrd_network/processing_worker.py | 8 +++---- ocrd_network/ocrd_network/processor_server.py | 24 +++++++++---------- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/ocrd_network/ocrd_network/process_helpers.py b/ocrd_network/ocrd_network/process_helpers.py index 28cd63a341..ea8c776995 100644 --- a/ocrd_network/ocrd_network/process_helpers.py +++ b/ocrd_network/ocrd_network/process_helpers.py @@ -6,8 +6,8 @@ # A wrapper for run_processor() and run_cli() -def run_single_execution( - ProcessorClass, +def invoke_processor( + processor_class, executable: str, abs_path_to_mets: str, input_file_grps: List[str], @@ -15,15 +15,15 @@ def run_single_execution( page_id: str, parameters: dict, ) -> None: - if not (ProcessorClass or executable): + if not (processor_class or executable): raise ValueError(f'Missing processor class and executable') input_file_grps_str = ','.join(input_file_grps) output_file_grps_str = ','.join(output_file_grps) workspace = Resolver().workspace_from_url(abs_path_to_mets) - if ProcessorClass: + if processor_class: try: run_processor( - processorClass=ProcessorClass, + processorClass=processor_class, workspace=workspace, input_file_grp=input_file_grps_str, output_file_grp=output_file_grps_str, diff --git a/ocrd_network/ocrd_network/processing_server.py b/ocrd_network/ocrd_network/processing_server.py index 528136031a..7c719f0f07 100644 --- a/ocrd_network/ocrd_network/processing_server.py +++ b/ocrd_network/ocrd_network/processing_server.py @@ -317,7 +317,7 @@ async def 
push_to_processing_queue(self, processor_name: str, job_input: PYJobIn ) return job.to_job_output() - async def push_to_processor_server(self, processor_name: str, job_input: PYJobInput) -> PYJobOutput: + async def push_to_processo_server(self, processor_name: str, job_input: PYJobInput) -> PYJobOutput: ocrd_tool, processor_server_url = self.query_ocrd_tool_json_from_server(processor_name) validate_job_input(self.log, processor_name, ocrd_tool, job_input) job_input = await validate_and_resolve_mets_path(self.log, job_input, resolve=False) diff --git a/ocrd_network/ocrd_network/processing_worker.py b/ocrd_network/ocrd_network/processing_worker.py index 5634c08b04..2fd8a1b25b 100644 --- a/ocrd_network/ocrd_network/processing_worker.py +++ b/ocrd_network/ocrd_network/processing_worker.py @@ -24,7 +24,7 @@ sync_db_update_processing_job, ) from .models import StateEnum -from .process_helpers import run_single_execution +from .process_helpers import invoke_processor from .rabbitmq_utils import ( OcrdProcessingMessage, OcrdResultMessage, @@ -72,7 +72,7 @@ def __init__(self, rabbitmq_addr, mongodb_addr, processor_name, ocrd_tool: dict, self.processor_name = processor_name # The processor class to be used to instantiate the processor # Think of this as a func pointer to the constructor of the respective OCR-D processor - self.ProcessorClass = processor_class + self.processor_class = processor_class # Gets assigned when `connect_consumer` is called on the worker object # Used to consume OcrdProcessingMessage from the queue with name {processor_name} self.rmq_consumer = None @@ -204,8 +204,8 @@ def process_message(self, processing_message: OcrdProcessingMessage) -> None: start_time=start_time ) try: - run_single_execution( - ProcessorClass=self.ProcessorClass, + invoke_processor( + processor_class=self.processor_class, executable=self.processor_name, abs_path_to_mets=path_to_mets, input_file_grps=input_file_grps, diff --git a/ocrd_network/ocrd_network/processor_server.py b/ocrd_network/ocrd_network/processor_server.py index eea6bc1ac8..86297f6e28 100644 --- a/ocrd_network/ocrd_network/processor_server.py +++ b/ocrd_network/ocrd_network/processor_server.py @@ -22,7 +22,7 @@ PYOcrdTool, StateEnum ) -from .process_helpers import run_single_execution +from .process_helpers import invoke_processor from .server_utils import ( _get_processor_job, validate_and_resolve_mets_path, @@ -46,7 +46,7 @@ def __init__(self, mongodb_addr: str, processor_name: str = "", processor_class= self.db_url = mongodb_addr self.processor_name = processor_name - self.ProcessorClass = processor_class + self.processor_class = processor_class self.ocrd_tool = None self.version = None @@ -89,7 +89,7 @@ def __init__(self, mongodb_addr: str, processor_name: str = "", processor_class= self.router.add_api_route( path='/', - endpoint=self.create_processor_job_task, + endpoint=self.create_processor_task, methods=['POST'], tags=['Processing'], status_code=status.HTTP_202_ACCEPTED, @@ -125,7 +125,7 @@ async def get_processor_info(self): # Note: The Processing server pushes to a queue, while # the Processor Server creates (pushes to) a background task - async def create_processor_job_task(self, job_input: PYJobInput, background_tasks: BackgroundTasks): + async def create_processor_task(self, job_input: PYJobInput, background_tasks: BackgroundTasks): validate_job_input(self.log, self.processor_name, self.ocrd_tool, job_input) job_input = await validate_and_resolve_mets_path(self.log, job_input, resolve=True) @@ -152,14 +152,14 @@ async def 
create_processor_job_task(self, job_input: PYJobInput, background_task # Check here as well: # 1) https://github.com/tiangolo/fastapi/discussions/8666 background_tasks.add_task( - self.processor_job_task, + self.run_processor_task, job_id=job_id, job=job ) return job.to_job_output() - async def processor_job_task(self, job_id: str, job: DBProcessorJob): + async def run_processor_task(self, job_id: str, job: DBProcessorJob): execution_failed = False start_time = datetime.now() await db_update_processing_job( @@ -168,8 +168,8 @@ async def processor_job_task(self, job_id: str, job: DBProcessorJob): start_time=start_time ) try: - run_single_execution( - ProcessorClass=self.ProcessorClass, + invoke_processor( + processor_class=self.processor_class, executable=self.processor_name, abs_path_to_mets=job.path_to_mets, input_file_grps=job.input_file_grps, @@ -196,9 +196,9 @@ async def processor_job_task(self, job_id: str, job: DBProcessorJob): def get_ocrd_tool(self): if self.ocrd_tool: return self.ocrd_tool - if self.ProcessorClass: + if self.processor_class: # The way of accessing ocrd tool like in the line below may be problematic - # ocrd_tool = self.ProcessorClass(workspace=None, version=True).ocrd_tool + # ocrd_tool = self.processor_class(workspace=None, version=True).ocrd_tool ocrd_tool = parse_json_string_with_comments( run( [self.processor_name, '--dump-json'], @@ -216,9 +216,9 @@ def get_version(self) -> str: return self.version """ - if self.ProcessorClass: + if self.processor_class: # The way of accessing the version like in the line below may be problematic - # version_str = self.ProcessorClass(workspace=None, version=True).version + # version_str = self.processor_class(workspace=None, version=True).version return version_str """ version_str = run( From 94cec8ec01cedd0d1a59c6d8aacd826f1bed6f90 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Tue, 4 Apr 2023 16:40:29 +0200 Subject: [PATCH 24/34] fix typo --- ocrd_network/ocrd_network/processing_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_network/ocrd_network/processing_server.py b/ocrd_network/ocrd_network/processing_server.py index 7c719f0f07..528136031a 100644 --- a/ocrd_network/ocrd_network/processing_server.py +++ b/ocrd_network/ocrd_network/processing_server.py @@ -317,7 +317,7 @@ async def push_to_processing_queue(self, processor_name: str, job_input: PYJobIn ) return job.to_job_output() - async def push_to_processo_server(self, processor_name: str, job_input: PYJobInput) -> PYJobOutput: + async def push_to_processor_server(self, processor_name: str, job_input: PYJobInput) -> PYJobOutput: ocrd_tool, processor_server_url = self.query_ocrd_tool_json_from_server(processor_name) validate_job_input(self.log, processor_name, ocrd_tool, job_input) job_input = await validate_and_resolve_mets_path(self.log, job_input, resolve=False) From b1ce98b861d5bc6cd241a8dc4f899229485db648 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 5 Apr 2023 09:52:01 +0200 Subject: [PATCH 25/34] full cli path of ocrd_network --- ocrd/ocrd/cli/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd/ocrd/cli/__init__.py b/ocrd/ocrd/cli/__init__.py index cc75f63108..ad12237378 100644 --- a/ocrd/ocrd/cli/__init__.py +++ b/ocrd/ocrd/cli/__init__.py @@ -31,7 +31,7 @@ def get_help(self, ctx): from ocrd.decorators import ocrd_loglevel from .zip import zip_cli from .log import log_cli -from .network import network_cli +from ocrd.cli.network import network_cli @click.group() From 
4a5200bab0a9f63a9c8879f957b52baf16ef7344 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 5 Apr 2023 10:00:57 +0200 Subject: [PATCH 26/34] full paths network cli --- ocrd/ocrd/cli/network.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/ocrd/ocrd/cli/network.py b/ocrd/ocrd/cli/network.py index f20f965851..1033af17cb 100644 --- a/ocrd/ocrd/cli/network.py +++ b/ocrd/ocrd/cli/network.py @@ -9,12 +9,10 @@ import click import logging from ocrd_utils import initLogging -from ocrd_network.cli import ( - client_cli, - processing_server_cli, - processing_worker_cli, - processor_server_cli, -) +from ocrd_network.cli.client import client_cli +from ocrd_network.cli.processing_server import processing_server_cli +from ocrd_network.cli.processing_worker import processing_worker_cli +from ocrd_network.cli.processor_server import processor_server_cli @click.group("network") From 0b1b5517dc9d08e4a06ae286d633291c5eab4cc6 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 5 Apr 2023 10:14:22 +0200 Subject: [PATCH 27/34] setup.py - ocrd_network.cli --- ocrd/ocrd/cli/__init__.py | 2 +- ocrd/ocrd/cli/network.py | 10 ++++++---- ocrd_network/setup.py | 1 + 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/ocrd/ocrd/cli/__init__.py b/ocrd/ocrd/cli/__init__.py index ad12237378..cc75f63108 100644 --- a/ocrd/ocrd/cli/__init__.py +++ b/ocrd/ocrd/cli/__init__.py @@ -31,7 +31,7 @@ def get_help(self, ctx): from ocrd.decorators import ocrd_loglevel from .zip import zip_cli from .log import log_cli -from ocrd.cli.network import network_cli +from .network import network_cli @click.group() diff --git a/ocrd/ocrd/cli/network.py b/ocrd/ocrd/cli/network.py index 1033af17cb..f20f965851 100644 --- a/ocrd/ocrd/cli/network.py +++ b/ocrd/ocrd/cli/network.py @@ -9,10 +9,12 @@ import click import logging from ocrd_utils import initLogging -from ocrd_network.cli.client import client_cli -from ocrd_network.cli.processing_server import processing_server_cli -from ocrd_network.cli.processing_worker import processing_worker_cli -from ocrd_network.cli.processor_server import processor_server_cli +from ocrd_network.cli import ( + client_cli, + processing_server_cli, + processing_worker_cli, + processor_server_cli, +) @click.group("network") diff --git a/ocrd_network/setup.py b/ocrd_network/setup.py index f79081fa09..ee0521aaf4 100644 --- a/ocrd_network/setup.py +++ b/ocrd_network/setup.py @@ -19,6 +19,7 @@ install_requires=install_requires, packages=[ 'ocrd_network', + 'ocrd_network.cli', 'ocrd_network.models', 'ocrd_network.rabbitmq_utils' ], From ee61852fb5c659458ce5eecf85781470fd186127 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Fri, 28 Apr 2023 12:12:52 +0200 Subject: [PATCH 28/34] flexible config: either worker/server or both --- .../ocrd_validators/processing_server_config.schema.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ocrd_validators/ocrd_validators/processing_server_config.schema.yml b/ocrd_validators/ocrd_validators/processing_server_config.schema.yml index c9722f0f37..4039e4917f 100644 --- a/ocrd_validators/ocrd_validators/processing_server_config.schema.yml +++ b/ocrd_validators/ocrd_validators/processing_server_config.schema.yml @@ -57,13 +57,16 @@ properties: required: - address - username - - workers - - servers oneOf: - required: - password - required: - path_to_privkey + anyOf: + - required: + - workers + - required: + - servers properties: address: description: The IP address or domain name of the target machine From 
03a77bd894af3172844bee1cc3ca963fea8dd3a6 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Fri, 28 Apr 2023 12:13:19 +0200 Subject: [PATCH 29/34] adapt runtime data for worker/server --- ocrd_network/ocrd_network/runtime_data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ocrd_network/ocrd_network/runtime_data.py b/ocrd_network/ocrd_network/runtime_data.py index 8986ddb613..8ab9a2896a 100644 --- a/ocrd_network/ocrd_network/runtime_data.py +++ b/ocrd_network/ocrd_network/runtime_data.py @@ -35,10 +35,10 @@ def __init__(self, config: Dict) -> None: self.data_workers: List[DataProcessingWorker] = [] self.data_servers: List[DataProcessorServer] = [] - for worker in config['workers']: + for worker in config.get('workers', []): name = worker['name'] count = worker['number_of_instance'] - deploy_type = DeployType.DOCKER if worker['deploy_type'] == 'docker' else DeployType.NATIVE + deploy_type = DeployType.DOCKER if worker.get('deploy_type', None) == 'docker' else DeployType.NATIVE if not self.needs_ssh and deploy_type == DeployType.NATIVE: self.needs_ssh = True if not self.needs_docker and deploy_type == DeployType.DOCKER: @@ -46,10 +46,10 @@ def __init__(self, config: Dict) -> None: for _ in range(count): self.data_workers.append(DataProcessingWorker(self.address, deploy_type, name)) - for server in config['servers']: + for server in config.get('servers', []): name = server['name'] port = server['port'] - deploy_type = DeployType.DOCKER if server['deploy_type'] == 'docker' else DeployType.NATIVE + deploy_type = DeployType.DOCKER if server.get('deploy_type', None) == 'docker' else DeployType.NATIVE if not self.needs_ssh and deploy_type == DeployType.NATIVE: self.needs_ssh = True if not self.needs_docker and deploy_type == DeployType.DOCKER: From bb83ade9a693aa86d7b60e59cc3866155e3eed8f Mon Sep 17 00:00:00 2001 From: joschrew Date: Fri, 28 Apr 2023 15:47:27 +0200 Subject: [PATCH 30/34] Add type option to bashlib-call and help output --- ocrd/ocrd/lib.bash | 11 +++++------ ocrd/ocrd/processor/helpers.py | 3 ++- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ocrd/ocrd/lib.bash b/ocrd/ocrd/lib.bash index 11417c5706..f4c9298780 100644 --- a/ocrd/ocrd/lib.bash +++ b/ocrd/ocrd/lib.bash @@ -145,18 +145,17 @@ ocrd__parse_argv () { -V|--version) ocrd ocrd-tool "$OCRD_TOOL_JSON" version; exit ;; --queue) ocrd__worker_queue="$2" ; shift ;; --database) ocrd__worker_database="$2" ; shift ;; + --type) ocrd__worker_type="$2" ; shift ;; *) ocrd__raise "Unknown option '$1'" ;; esac shift done - if [ -v ocrd__worker_queue -a -v ocrd__worker_database ]; then - ocrd processing-worker $OCRD_TOOL_NAME --queue "${ocrd__worker_queue}" --database "${ocrd__worker_database}" + if [ -v ocrd__worker_queue -a -v ocrd__worker_database -a -v ocrd__worker_type ]; then + ocrd processing-worker $OCRD_TOOL_NAME --queue "${ocrd__worker_queue}" --database "${ocrd__worker_database}" --type "${ocrd__worker_type}" exit - elif [ -v ocrd__worker_queue ]; then - ocrd__raise "Processing Worker also requires a --database argument" - elif [ -v ocrd__worker_database ]; then - ocrd__raise "Processing Worker also requires a --queue argument" + elif [ -v ocrd__worker_queue -o -v ocrd__worker_database -o -v ocrd__worker_type ]; then + ocrd__raise "For Processing Worker all 3 arguments --database, --queue and --type are required" fi if [[ ! 
-e "${ocrd__argv[mets_file]}" ]]; then diff --git a/ocrd/ocrd/processor/helpers.py b/ocrd/ocrd/processor/helpers.py index bc3cce6374..aa1af3b7ec 100644 --- a/ocrd/ocrd/processor/helpers.py +++ b/ocrd/ocrd/processor/helpers.py @@ -279,6 +279,7 @@ def wrap(s): --database The MongoDB server address in format "mongodb://{host}:{port}" [mongodb://localhost:27018] + --type type of processing: either "worker" or "server" Options for information: -C, --show-resource RESNAME Dump the content of processor resource RESNAME @@ -301,7 +302,7 @@ def wrap(s): # Taken from https://github.com/OCR-D/core/pull/884 @freeze_args -@lru_cache(maxsize=environ.get('OCRD_MAX_PROCESSOR_CACHE', 128)) +@lru_cache(maxsize=environ.get('OCRD_MAX_PROCESSOR_CACHE', 128)) def get_cached_processor(parameter: dict, processor_class): """ Call this function to get back an instance of a processor. From 845cbb6eba9e808aec69bb496975ec1d1f05d40a Mon Sep 17 00:00:00 2001 From: joschrew Date: Tue, 2 May 2023 12:32:55 +0200 Subject: [PATCH 31/34] Fix process worker/server bashlib invocation --- ocrd/ocrd/lib.bash | 50 ++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/ocrd/ocrd/lib.bash b/ocrd/ocrd/lib.bash index f4c9298780..b033228d74 100644 --- a/ocrd/ocrd/lib.bash +++ b/ocrd/ocrd/lib.bash @@ -3,14 +3,14 @@ exit 1 ## ### `ocrd__raise` -## +## ## Raise an error and exit. ocrd__raise () { echo >&2 "ERROR: $1"; exit 127 } ## ### `ocrd__log` -## +## ## Delegate logging to `ocrd log` ocrd__log () { local log_level="${ocrd__argv[log_level]:-}" @@ -23,7 +23,7 @@ ocrd__log () { ## ### `ocrd__minversion` -## +## ## Ensure minimum version # ht https://stackoverflow.com/posts/4025065 ocrd__minversion () { @@ -53,28 +53,28 @@ ocrd__minversion () { } ## ### `ocrd__dumpjson` -## +## ## Output ocrd-tool.json. -## +## ## Requires `$OCRD_TOOL_JSON` and `$OCRD_TOOL_NAME` to be set: -## +## ## ```sh ## export OCRD_TOOL_JSON=/path/to/ocrd-tool.json ## export OCRD_TOOL_NAME=ocrd-foo-bar ## ``` -## +## ocrd__dumpjson () { ocrd ocrd-tool "$OCRD_TOOL_JSON" tool "$OCRD_TOOL_NAME" dump } -## +## ## Output file resource content. ## ocrd__show_resource () { ocrd ocrd-tool "$OCRD_TOOL_JSON" tool "$OCRD_TOOL_NAME" show-resource "$1" } -## +## ## Output file resources names. ## ocrd__list_resources () { @@ -82,9 +82,9 @@ ocrd__list_resources () { } ## ### `ocrd__usage` -## +## ## Print usage -## +## ocrd__usage () { ocrd ocrd-tool "$OCRD_TOOL_JSON" tool "$OCRD_TOOL_NAME" help @@ -92,9 +92,9 @@ ocrd__usage () { } ## ### `ocrd__parse_argv` -## +## ## Expects an associative array ("hash"/"dict") `ocrd__argv` to be defined: -## +## ## ```sh ## declare -A ocrd__argv=() ## ``` @@ -146,16 +146,32 @@ ocrd__parse_argv () { --queue) ocrd__worker_queue="$2" ; shift ;; --database) ocrd__worker_database="$2" ; shift ;; --type) ocrd__worker_type="$2" ; shift ;; + --address) ocrd__worker_address="$2" ; shift ;; *) ocrd__raise "Unknown option '$1'" ;; esac shift done - if [ -v ocrd__worker_queue -a -v ocrd__worker_database -a -v ocrd__worker_type ]; then - ocrd processing-worker $OCRD_TOOL_NAME --queue "${ocrd__worker_queue}" --database "${ocrd__worker_database}" --type "${ocrd__worker_type}" + if [ -v ocrd__worker_queue -o -v ocrd__worker_database -o -v ocrd__worker_type -o -v ocrd__worker_address ]; then + if ! [ -v ocrd__worker_type ] ; then + ocrd__raise "For Processing Worker / Processor Server --type is required" + elif ! 
[ -v ocrd__worker_database ]; then + ocrd__raise "For the Processing Worker / Processor Server --database is required" + fi + if [ ${ocrd__worker_type} = "worker" ]; then + if ! [ -v ocrd__worker_queue ]; then + ocrd__raise "For the Processing Worker --queue is required" + fi + ocrd network processing-worker $OCRD_TOOL_NAME --queue "${ocrd__worker_queue}" --database "${ocrd__worker_database}" + elif [ ${ocrd__worker_type} = "server" ]; then + if ! [ -v ocrd__worker_address ]; then + ocrd__raise "For the Processor Server --address is required" + fi + ocrd network processor-server $OCRD_TOOL_NAME --database "${ocrd__worker_database}" --address "${ocrd__worker_address}" + else + ocrd__raise "--type must be either 'worker' or 'server' not '${ocrd__worker_type}'" + fi exit - elif [ -v ocrd__worker_queue -o -v ocrd__worker_database -o -v ocrd__worker_type ]; then - ocrd__raise "For Processing Worker all 3 arguments --database, --queue and --type are required" fi if [[ ! -e "${ocrd__argv[mets_file]}" ]]; then From d680e907a55230146acdbf7eab6586c201216dc5 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Mon, 8 May 2023 12:55:41 +0200 Subject: [PATCH 32/34] processor server: blocking api --- ocrd_network/ocrd_network/processor_server.py | 21 +------------------ 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/ocrd_network/ocrd_network/processor_server.py b/ocrd_network/ocrd_network/processor_server.py index 86297f6e28..785b82c61b 100644 --- a/ocrd_network/ocrd_network/processor_server.py +++ b/ocrd_network/ocrd_network/processor_server.py @@ -137,26 +137,7 @@ async def create_processor_task(self, job_input: PYJobInput, background_tasks: B state=StateEnum.queued ) await job.insert() - - # TODO: Background tasks solution was just adopted from #884, - # but seems to not suit what we are trying to achieve... - # However, using Celery or RabbitMQ, takes away the point - # of having REST API Processor Server, or does it? - - # FastAPI Caveat: If you need to perform heavy background - # computation and you don't necessarily need it to be run - # by the same process (for example, you don't need to share - # memory, variables, etc), you might benefit from using - # other bigger tools like Celery. 
- - # Check here as well: - # 1) https://github.com/tiangolo/fastapi/discussions/8666 - background_tasks.add_task( - self.run_processor_task, - job_id=job_id, - job=job - ) - + await self.run_processor_task(job_id=job_id, job=job) return job.to_job_output() async def run_processor_task(self, job_id: str, job: DBProcessorJob): From 4b5d48ea680238059e4e71d39a63ed63f364d92e Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Mon, 8 May 2023 16:37:19 +0200 Subject: [PATCH 33/34] Use async req to processor servers --- .../ocrd_network/processing_server.py | 21 +++++++++++++------ ocrd_network/requirements.txt | 1 + 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/ocrd_network/ocrd_network/processing_server.py b/ocrd_network/ocrd_network/processing_server.py index 528136031a..0a7b974f28 100644 --- a/ocrd_network/ocrd_network/processing_server.py +++ b/ocrd_network/ocrd_network/processing_server.py @@ -1,5 +1,6 @@ import json import requests +import httpx from typing import Dict, List import uvicorn @@ -329,12 +330,20 @@ async def push_to_processor_server(self, processor_name: str, job_input: PYJobIn status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to json dump the PYJobInput, error: {e}" ) - # Post a processing job to the Processor Server - response = requests.post( - processor_server_url, - headers={'Content-Type': 'application/json'}, - json=json.loads(json_data) - ) + + # TODO: The amount of pages should come as a request input + # currently, use 200 as a default + amount_of_pages = 200 + request_timeout = 20.0 * amount_of_pages # 20 sec timeout per page + # Post a processing job to the Processor Server asynchronously + timeout = httpx.Timeout(timeout=request_timeout, connect=30.0) + async with httpx.AsyncClient(timeout=timeout) as client: + response = await client.post( + processor_server_url, + headers={'Content-Type': 'application/json'}, + json=json.loads(json_data) + ) + if not response.status_code == 202: self.log.exception(f"Failed to post '{processor_name}' job to: {processor_server_url}") raise HTTPException( diff --git a/ocrd_network/requirements.txt b/ocrd_network/requirements.txt index d11fb430f0..4e8986c1bd 100644 --- a/ocrd_network/requirements.txt +++ b/ocrd_network/requirements.txt @@ -4,3 +4,4 @@ docker paramiko pika>=1.2.0 beanie~=1.7 +httpx>=0.22.0 From 0520742a6d107d3c650f49c7c639220d8b40c559 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 21 Jun 2023 12:32:53 +0200 Subject: [PATCH 34/34] :todo: reference discussion on number of pages --- ocrd_network/ocrd_network/processing_server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocrd_network/ocrd_network/processing_server.py b/ocrd_network/ocrd_network/processing_server.py index 0a7b974f28..2a542219f1 100644 --- a/ocrd_network/ocrd_network/processing_server.py +++ b/ocrd_network/ocrd_network/processing_server.py @@ -332,6 +332,7 @@ async def push_to_processor_server(self, processor_name: str, job_input: PYJobIn ) # TODO: The amount of pages should come as a request input + # TODO: cf https://github.com/OCR-D/core/pull/1030/files#r1152551161 # currently, use 200 as a default amount_of_pages = 200 request_timeout = 20.0 * amount_of_pages # 20 sec timeout per page
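
A minimal, self-contained sketch of the request pattern introduced by the last two patches above: posting a processing job to a Processor Server asynchronously with a page-count-based timeout. The helper name, server address and job payload below are illustrative placeholders only, not values from the patch set; the 202 status check, the Content-Type header, the 30-second connect timeout and the 20-seconds-per-page request timeout mirror the patched push_to_processor_server():

import asyncio

import httpx


async def post_job_to_processor_server(processor_server_url: str, job_input: dict,
                                        amount_of_pages: int = 200) -> dict:
    # 20 sec timeout per page, with 200 pages as the current default
    request_timeout = 20.0 * amount_of_pages
    timeout = httpx.Timeout(timeout=request_timeout, connect=30.0)
    async with httpx.AsyncClient(timeout=timeout) as client:
        # Post the processing job and wait (up to the timeout) for the job output
        response = await client.post(
            processor_server_url,
            headers={'Content-Type': 'application/json'},
            json=job_input
        )
    if response.status_code != 202:
        raise RuntimeError(f'Failed to post job to: {processor_server_url}')
    return response.json()


if __name__ == '__main__':
    # Placeholder address and job input, for illustration only
    job_output = asyncio.run(post_job_to_processor_server(
        'http://localhost:8080',
        {
            'path_to_mets': '/path/to/mets.xml',
            'input_file_grps': ['OCR-D-INPUT'],
            'output_file_grps': ['OCR-D-OUTPUT'],
            'parameters': {}
        }
    ))
    print(job_output)

Replacing the synchronous requests.post with an httpx.AsyncClient keeps the Processing Server's event loop free while a long-running job is executed by the (now blocking) Processor Server endpoint introduced in the preceding patch.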