Processing server extension (#1046) #1069

Merged
20 commits merged on Sep 4, 2023
Changes from 4 commits
67 changes: 58 additions & 9 deletions ocrd_network/ocrd_network/database.py
@@ -1,6 +1,6 @@
""" The database is used to store information regarding jobs and workspaces.

Jobs: for every process-request a job is inserted into the database with a uuid, status and
Jobs: for every process-request a job is inserted into the database with an uuid, status and
information about the process like parameters and file groups. It is mainly used to track the status
(`ocrd_network.models.job.StateEnum`) of a job so that the state of a job can be queried. Finished
jobs are not deleted from the database.
@@ -35,18 +35,69 @@ async def sync_initiate_database(db_url: str):
await initiate_database(db_url)


async def db_get_workspace(workspace_id: str) -> DBWorkspace:
workspace = await DBWorkspace.find_one(
DBWorkspace.workspace_id == workspace_id
)
async def db_get_workspace(workspace_id: str = None, workspace_mets_path: str = None) -> DBWorkspace:
workspace = None
if workspace_id:
workspace = await DBWorkspace.find_one(
DBWorkspace.workspace_id == workspace_id
)
if workspace_mets_path:
workspace = await DBWorkspace.find_one(
DBWorkspace.workspace_mets_path == workspace_mets_path
)
if not workspace:
raise ValueError(f'Workspace with id "{workspace_id}" not in the DB.')
return workspace


@call_sync
async def sync_db_get_workspace(workspace_id: str) -> DBWorkspace:
return await db_get_workspace(workspace_id)
async def sync_db_get_workspace(workspace_id: str = None, workspace_mets_path: str = None) -> DBWorkspace:
return await db_get_workspace(workspace_id=workspace_id, workspace_mets_path=workspace_mets_path)


async def db_update_workspace(workspace_id: str = None, workspace_mets_path: str = None, **kwargs):
workspace = None
if workspace_id:
workspace = await DBWorkspace.find_one(
DBWorkspace.workspace_id == workspace_id
)
if workspace_mets_path:
workspace = await DBWorkspace.find_one(
DBWorkspace.workspace_mets_path == workspace_mets_path
)
if not workspace:
raise ValueError(f'Workspace with id "{workspace_id}" not in the DB.')

job_keys = list(workspace.__dict__.keys())
for key, value in kwargs.items():
if key not in job_keys:
raise ValueError(f'Field "{key}" is not available.')
if key == 'workspace_id':
workspace.workspace_id = value
elif key == 'workspace_mets_path':
workspace.workspace_mets_path = value
elif key == 'ocrd_identifier':
workspace.ocrd_identifier = value
elif key == 'bagit_profile_identifier':
workspace.bagit_profile_identifier = value
elif key == 'ocrd_base_version_checksum':
workspace.ocrd_base_version_checksum = value
elif key == 'ocrd_mets':
workspace.ocrd_mets = value
elif key == 'bag_info_adds':
workspace.bag_info_adds = value
elif key == 'deleted':
workspace.deleted = value
elif key == 'being_processed':
workspace.being_processed = value
else:
raise ValueError(f'Field "{key}" is not updatable.')
await workspace.save()


@call_sync
async def sync_db_update_workspace(workspace_id: str = None, workspace_mets_path: str = None, **kwargs):
await db_update_workspace(workspace_id=workspace_id, workspace_mets_path=workspace_mets_path, **kwargs)


async def db_get_processing_job(job_id: str) -> DBProcessorJob:
@@ -68,8 +119,6 @@ async def db_update_processing_job(job_id: str, **kwargs):
if not job:
raise ValueError(f'Processing job with id "{job_id}" not in the DB.')

# TODO: This may not be the best Pythonic way to do it. However, it works!
# There must be a shorter way with Pydantic. Suggest an improvement.
job_keys = list(job.__dict__.keys())
for key, value in kwargs.items():
if key not in job_keys:
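
The TODO removed above asked for a shorter, more Pythonic field update. A minimal sketch of what such a loop could look like with setattr (hypothetical helper, not part of this PR; it assumes the document is a Pydantic/Beanie model exposing its fields via __fields__):

```python
async def db_update_document_fields(document, **kwargs):
    """Generic per-field update, using the Pydantic field registry instead of an if/elif chain."""
    updatable_fields = set(document.__fields__.keys())
    for key, value in kwargs.items():
        if key not in updatable_fields:
            raise ValueError(f'Field "{key}" is not updatable.')
        setattr(document, key, value)
    await document.save()
```
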
9 changes: 9 additions & 0 deletions ocrd_network/ocrd_network/models/job.py
@@ -7,9 +7,15 @@


class StateEnum(str, Enum):
# The processing job is cached inside the Processing Server requests cache
cached = 'CACHED'
# The processing job is queued inside the RabbitMQ
queued = 'QUEUED'
# Processing job is currently running in a Worker or Processor Server
running = 'RUNNING'
# Processing job finished successfully
success = 'SUCCESS'
# Processing job failed
failed = 'FAILED'


@@ -28,6 +34,8 @@ class PYJobInput(BaseModel):
# Used to toggle between sending requests to 'worker and 'server',
# i.e., Processing Worker and Processor Server, respectively
agent_type: Optional[str] = 'worker'
# Auto generated by the Processing Server when forwarding to the Processor Server
job_id: Optional[str] = None

class Config:
schema_extra = {
@@ -67,6 +75,7 @@ class DBProcessorJob(Document):
parameters: Optional[dict]
result_queue_name: Optional[str]
callback_url: Optional[str]
internal_callback_url: Optional[str]
start_time: Optional[datetime]
end_time: Optional[datetime]
exec_time: Optional[str]
3 changes: 3 additions & 0 deletions ocrd_network/ocrd_network/models/workspace.py
@@ -15,6 +15,8 @@ class DBWorkspace(Document):
ocrd_mets Ocrd-Mets (optional)
bag_info_adds bag-info.txt can also (optionally) contain additional
key-value-pairs which are saved here
deleted the document is deleted if set, however, the record is still preserved
being_processed whether the workspace is currently used in a workflow execution or not
"""
workspace_id: str
workspace_mets_path: str
@@ -24,6 +26,7 @@
ocrd_mets: Optional[str]
bag_info_adds: Optional[dict]
deleted: bool = False
being_processed: bool = False

class Settings:
name = "workspace"
116 changes: 94 additions & 22 deletions ocrd_network/ocrd_network/processing_server.py
@@ -3,6 +3,7 @@
import httpx
from typing import Dict, List
import uvicorn
from queue import Queue

from fastapi import FastAPI, status, Request, HTTPException
from fastapi.exceptions import RequestValidationError
@@ -11,18 +12,26 @@
from pika.exceptions import ChannelClosedByBroker

from ocrd_utils import getLogger
from .database import initiate_database
from .database import (
initiate_database,
db_get_workspace,
db_update_workspace
)
from .deployer import Deployer
from .models import (
DBProcessorJob,
PYJobInput,
PYJobOutput,
StateEnum
)
from .rabbitmq_utils import RMQPublisher, OcrdProcessingMessage
from .rabbitmq_utils import (
RMQPublisher,
OcrdProcessingMessage,
OcrdResultMessage
)
from .server_utils import (
_get_processor_job,
validate_and_resolve_mets_path,
validate_and_return_mets_path,
validate_job_input,
)
from .utils import (
@@ -69,6 +78,11 @@ def __init__(self, config_path: str, host: str, port: int) -> None:
# Gets assigned when `connect_publisher` is called on the working object
self.rmq_publisher = None

# Used for buffering/caching processing requests in the Processing Server
# Key: `workspace_id` or `path_to_mets` depending on which is provided
# Value: Queue that holds PYInputJob elements
self.processing_requests_cache = {}

# Create routes
self.router.add_api_route(
path='/stop',
@@ -102,6 +116,15 @@ def __init__(self, config_path: str, host: str, port: int) -> None:
response_model_exclude_none=True
)

self.router.add_api_route(
path='/processor/result_callback/{job_id}',
endpoint=self.remove_from_request_cache,
methods=['POST'],
tags=['processing'],
status_code=status.HTTP_200_OK,
summary='Callback used by a worker or processor server for successful processing of a request',
)

self.router.add_api_route(
path='/processor/{processor_name}',
endpoint=self.get_processor_info,
@@ -266,16 +289,76 @@ def query_ocrd_tool_json_from_server(self, processor_name):
return ocrd_tool, processor_server_url

async def push_processor_job(self, processor_name: str, data: PYJobInput) -> PYJobOutput:
if data.job_id:
raise HTTPException(
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
detail=f"Job id field is set but must not be: {data.job_id}"
)
data.job_id = generate_id() # Generate processing job id

if data.agent_type not in ['worker', 'server']:
raise HTTPException(
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
detail=f"Unknown network agent with value: {data.agent_type}"
)
workspace_db = await db_get_workspace(
workspace_id=data.workspace_id,
workspace_mets_path=data.path_to_mets
)
if not workspace_db:
raise HTTPException(
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
detail=f"Workspace with id: {data.workspace_id} or path: {data.path_to_mets} not found"
)

# The workspace is currently locked (being processed)
# TODO: Do the proper caching here, after the refactored code is working

Review comment (Member):
  • Change being_processed in the DBWorkspace from boolean to string
  • Serialize data without page_id in a canonical way, set being_processed to that when locking
  • check here whether canonical serialization of data is the same as being_processed. If so, allow the request, even though workspace_db is being_processed.
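
A minimal sketch of that canonical-serialization check (hypothetical helper names, not code from this PR):

```python
import json
from typing import Optional

def canonical_serialization(request_data: dict) -> str:
    """Serialize a processing request without its page_id in a stable, comparable way."""
    payload = {key: value for key, value in request_data.items() if key != 'page_id'}
    return json.dumps(payload, sort_keys=True, default=str)

def may_enter_workspace(request_data: dict, being_processed: Optional[str]) -> bool:
    """Allow the request if the workspace is free, or locked by an identical (page_id-less) request."""
    if not being_processed:
        return True
    return canonical_serialization(request_data) == being_processed
```
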

Collaborator:
Objection by @MehmedGIT in today's call: Processing Server clients (like the Workflow Server) could decide to split up the job into several page ranges (esp. if they know there are multiple worker instances running in the back) and issue them concurrently. So instead of a boolean or string, we would need to synchronise over the actual set of pages of each workspace.

Contributor Author (@MehmedGIT, Jul 11, 2023):
> • check here whether canonical serialization of data is the same as being_processed. If so, allow the request, even though workspace_db is being_processed.

This does not achieve proper workspace locking when the same request is created with overlapping page ranges for the same workspace.

> we would need to synchronise over the actual set of pages of each workspace.

This is the way to achieve proper page-range locking to prevent collisions at the processor server/worker level; however, it is also complex to implement. The more I think about it, the more problematic it seems once error handling is considered.

  1. Ideal scenario example: 200 pages workspace, the Workflow Server will create 4 processing requests (50 pages per request) when knowing there are 4 workers of the specified processor. Then each request's page_id field will be in the form start_page_id..end_page_id. The being_processed boolean field will be replaced with a str field locked_pages in the form start_page_id1..end_page_id1,...,start_page_id4..end_page_id4. Then detecting overlapping single pages or page ranges is achievable to raise errors for the next coming requests. If the ranges don't overlap and match for the same processor, then the workspace would not be considered locked. For other processors, the workspace will still be considered locked. The unlocking of the workspace for pages will then happen based on the internal callbacks to the Processing Server from the Processor Server or Processing Worker when the execution finishes or fails.

Problem: still, if either of the two fails to use a callback, say due to a crash, the workspace will be locked indefinitely. Determining a proper timeout for a server or worker is hard.

2. Another point:
    Considering that page_id could be:
  • a single page - PHYS0001
  • multiple pages separated by a comma - PHYS0003,PHYS0004,PHYS0005
  • a page range... PHYS0009..PHYS0015

And now assuming a potential scenario where 3 processing requests with different page_id formats above are passed - the value of locked_pages for the 3 requests will be PHYS0001_PHYS0003,PHYS0004,PHYS0005_PHYS0009..PHYS0015 assuming that the separator will be an underscore. Or just a string list with values [PHYS0001][PHYS0003,PHYS0004,PHYS0005][PHYS0009..PHYS0015].

Potential problem: locked_pages is filled with a lot of values in case single-page requests are passed for the entire workspace. Another thing to consider is collision detection: the incoming page_id can have 3 different forms, and hence a lot of extra overhead in handling the data.
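
For illustration, the three page_id forms could be resolved into a flat set of pages roughly like this (hypothetical helper; assumes PHYS-style identifiers with a fixed-width numeric suffix):

```python
def resolve_page_ids(page_id: str) -> set:
    """Expand 'PHYS0001', 'PHYS0003,PHYS0004,PHYS0005' or 'PHYS0009..PHYS0015' into a set of page ids."""
    pages = set()
    for part in page_id.split(','):
        if '..' in part:
            start, end = part.split('..')
            prefix = start.rstrip('0123456789')   # e.g. 'PHYS'
            width = len(start) - len(prefix)      # zero-padding width, e.g. 4
            for number in range(int(start[len(prefix):]), int(end[len(prefix):]) + 1):
                pages.add(f'{prefix}{number:0{width}d}')
        else:
            pages.add(part)
    return pages

# resolve_page_ids('PHYS0001,PHYS0009..PHYS0011') -> {'PHYS0001', 'PHYS0009', 'PHYS0010', 'PHYS0011'}
```
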

Collaborator:
> 1. [...] Then detecting overlapping single pages or page ranges is achievable to raise errors for the next coming requests.

Why raise errors? IMO, locking is simply about blocking requests until the associated resources become free again.

> If the ranges don't overlap and match for the same processor, then the workspace would not be considered locked. For other processors, the workspace will still be considered locked.

Why make this contingent on the identity of the processor? IMO, if the other request concerns a disjunct set of pages, then there simply cannot be a dependency relation to the currently running processor(s). It could be because the other request is prior (lagging behind) or posterior (more advanced) in the overall workflow across pages. Or it could simply be an independent request. (Conversely, independent requests which do affect the same page range will gain an artificial dependency relation here, but that's not that much of a problem.)

If anything, if we want to look more closely into dependencies, we should consider pairs set(pages) · fileGrp, where

  • a new request must block if its pages and input fileGrp overlap the locked pairs, and
  • a new job will lock its pages and output fileGrp while running

Thus, even requests on the same pages but regarding distinct fileGrps would stay independent.
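
A minimal sketch of such set(pages) · fileGrp locking (hypothetical data structure, not code from this PR):

```python
from collections import defaultdict

class WorkspacePageLocks:
    """Tracks which pages are locked under which fileGrp for a single workspace."""

    def __init__(self):
        self.locked = defaultdict(set)  # fileGrp -> set of locked page ids

    def is_blocked(self, input_file_grps, pages) -> bool:
        """A new request must block if its pages overlap the locked pages of any of its input fileGrps."""
        return any(self.locked[grp] & set(pages) for grp in input_file_grps)

    def lock(self, output_file_grp, pages):
        """A new job locks its pages under its output fileGrp while running."""
        self.locked[output_file_grp] |= set(pages)

    def unlock(self, output_file_grp, pages):
        self.locked[output_file_grp] -= set(pages)
```
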

Collaborator:
> Potential problem: locked_pages is filled with a lot of values

Not a problem: simply use the actual resolved list (or rather, set) of pages! Set operations (disjunction for adding, intersection for testing, difference for clearing) are clear and efficient.
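
Those operations map directly onto Python sets (a toy illustration with made-up page ids):

```python
locked_pages = {'PHYS0001', 'PHYS0002'}
incoming = {'PHYS0002', 'PHYS0003'}

conflict = locked_pages & incoming   # intersection for testing: {'PHYS0002'}
locked_pages |= incoming             # union (disjunction) for adding when locking
locked_pages -= incoming             # difference for clearing when a job finishes
```
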

Contributor Author:
> Why raise errors? IMO, locking is simply about blocking requests until the associated resources become free again.

Right. I was just considering the cases where double executions of the same processor with overlapping pages are submitted. That would mostly end in error, wouldn't it? Unless overwrite is already set for the next requests.

> Why make this contingent on the identity of the processor?

Because the workspace will be unlocked for the same processor requests that potentially can have different page ranges. The collisions for the same processor but different pages are handled by the METS server. However, if we get a different processor but the same pages, then the workspace is locked and that request goes into the waiting internal queue (CACHED). Consider cases when running processors cannot be done independently, i.e., the output of the first processor has to be used by the second processor.

> IMO, if the other request concerns a disjunct set of pages, then there simply cannot be a dependency relation to the currently running processor(s)

Right, that means we must track processor · set(pages) · fileGrp (considering the info from the next paragraph). This makes it even harder to track (and potentially debug) what to submit directly to the RabbitMQ and what to queue internally (to cache).

> If anything, if we want to look more closely into dependencies, we should consider pairs set(pages) · fileGrp, where
> - a new request must block if its pages and input fileGrp overlap the locked pairs, and
> - a new job will lock its pages and output fileGrp while running
> Thus, even requests on the same pages but regarding distinct fileGrps would stay independent.

Oh, right. We should also consider fileGrp in the mix of complexity...

> Not a problem: simply use the actual resolved list (or rather, set) of pages! Set operations (disjunction for adding, intersection for testing, difference for clearing) are clear and efficient.

This is of course simpler to manage although there will be more stress on the DB itself with reads/writes - which I guess is okay for now.

> So we'll now have cached and queued and running job entries. For running jobs, we agreed there is a need for some kind of timeout anyway, universally

Yes, we have some dummy timeout right now based on the submitted amount of pages (set to 200 by default, multiplied by a timeout value per page), which could potentially lead to timeout errors even when the output is correctly produced, say due to slower processing. The amount of pages is still not part of the processing request.

> BTW, IINM we still need to take care of atomicity in some places, i.e. use a mutex to query and change the internal queue...

Yes, there will be a race condition to the internal queue resource.
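
For both points, a sketch of what a page-count-based timeout and a lock around the internal requests cache could look like (hypothetical names and values, not this PR's implementation):

```python
import asyncio
from queue import Queue

DEFAULT_AMOUNT_OF_PAGES = 200   # assumed fallback when the request carries no page count
TIMEOUT_PER_PAGE = 30           # assumed seconds per page

def compute_job_timeout(amount_of_pages=None) -> int:
    return (amount_of_pages or DEFAULT_AMOUNT_OF_PAGES) * TIMEOUT_PER_PAGE

# A single asyncio.Lock serializes access to the requests cache and avoids the race
# between the request endpoint and the result callback.
cache_lock = asyncio.Lock()
processing_requests_cache = {}

async def cache_request(workspace_key: str, data) -> None:
    async with cache_lock:
        processing_requests_cache.setdefault(workspace_key, Queue()).put(data)
```
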

Member:
> IMO, if the other request concerns a disjunct set of pages, then there simply cannot be a dependency relation to the currently running processor(s)

> Right, that means we must track processor · set(pages) · fileGrp (considering the info from the next paragraph). This makes it even harder to track (and potentially debug) what to submit directly to the RabbitMQ and what to queue internally (to cache).

Sorry to add to the complexity but we also need the parameterization of the processor in the mix, so processor · parameters · set(pages) · fileGrp 😬
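
A tiny illustration of such a composite key (hypothetical; parameters serialized canonically so that equal parameterizations compare equal):

```python
import json

def lock_key(processor: str, parameters: dict, file_grp: str) -> tuple:
    """Composite lock key: processor · parameters · fileGrp; the page set is tracked per key."""
    canonical_parameters = json.dumps(parameters, sort_keys=True)
    return processor, canonical_parameters, file_grp
```
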

Collaborator (@bertsky, Jul 12, 2023):
> I was just considering the cases where double executions of the same processor with overlapping pages are submitted. That would mostly end in error, wouldn't it? Unless overwrite is already set for the next requests.

Oh that. Yes, that's right. (If the METS already contains results for a page-fileGrp.) But for that we don't have to do anything (detecting overlaps etc) – this error will be coming from the processor itself (it already contains logic looking for conflicts in the output fileGrp / page list).

> Why make this contingent on the identity of the processor?

> Because the workspace will be unlocked for the same processor requests that potentially can have different page ranges. The collisions for the same processor but different pages are handled by the METS server. However, if we get a different processor but the same pages, then the workspace is locked and that request goes into the waiting internal queue (CACHED). Consider cases when running processors cannot be done independently, i.e., the output of the first processor has to be used by the second processor.

This sounds confusing to me. My above analysis still stands IMHO – I don't see any reason why we should look at the identity of the processor. (But I agree with @kba that if we do, then it would really have to be the combination of processor and parameters.) It's just the pages and the fileGrps that cause a dependency.

Regarding timeouts (using the actual number of pages in the Processor Server model, adding a timeout mechanism in the Processing Worker) – should we track that in a separate issue?

Contributor Author:
> Oh that. Yes, that's right. But for that we don't have to do anything

Agree, we should not try to do early prevention of errors since it makes the implementation more complex on the Processing Server side.

> It's just the pages and the fileGrps that cause a dependency.

For the sake of keeping it simple - it should be just pages and fileGrp of a workspace then.

Collaborator:
> Regarding timeouts (using the actual number of pages in the Processor Server model, adding a timeout mechanism in the Processing Worker) – should we track that in a separate issue?

#1074

> Agree, we should not try to do early prevention of errors since it makes the implementation more complex on the Processing Server side.

Yes. If we want to do anticipation of workflow conflicts then it would be what currently is done in ocrd process via ocrd.task_sequence.validate_tasks() – checking that

  • all processors exist
  • all parameters are valid for all steps
  • no output fileGrp exists prior to that step (on the data or the workflow itself), except when using --overwrite
  • no input fileGrp will be missing for any step (because already on the data or generated earlier on the workflow)

That could be done in the Workflow Server statically – before sending requests.
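
A conceptual sketch of those static checks (illustrative only; not the actual ocrd.task_sequence.validate_tasks() code, and parameter validation against ocrd-tool.json is omitted):

```python
def validate_workflow(tasks, existing_file_grps, known_processors, overwrite=False):
    """tasks: list of dicts with 'executable', 'input_file_grps' and 'output_file_grps'."""
    errors = []
    available = set(existing_file_grps)
    for task in tasks:
        # all processors must exist
        if task['executable'] not in known_processors:
            errors.append(f"unknown processor: {task['executable']}")
        # no input fileGrp may be missing (already on the data or generated by an earlier step)
        for grp in task['input_file_grps']:
            if grp not in available:
                errors.append(f"missing input fileGrp {grp} for {task['executable']}")
        # no output fileGrp may exist prior to that step, except when using --overwrite
        for grp in task['output_file_grps']:
            if grp in available and not overwrite:
                errors.append(f"output fileGrp {grp} already exists for {task['executable']}")
            available.add(grp)
    return errors
```
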

if workspace_db.being_processed:
raise HTTPException(
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
detail=f"Workspace with id: {data.workspace_id} or "
f"path: {data.path_to_mets} is currently being processed"
)

workspace_key = data.workspace_id if data.workspace_id else data.path_to_mets
# If a record queue of this workspace_id does not exist in the requests cache
if not self.processing_requests_cache.get(workspace_key, None):
self.processing_requests_cache[workspace_key] = Queue()
# Add the processing request to the internal queue
self.processing_requests_cache[workspace_key].put(data)

data = self.processing_requests_cache[workspace_key].get()
# Lock the workspace
await db_update_workspace(
workspace_id=data.workspace_id,
workspace_mets_path=data.path_to_mets,
being_processed=True
)

# Since the path is not resolved yet,
# the return value is not important for the Processing Server
await validate_and_return_mets_path(self.log, data)

# Create a DB entry
job = DBProcessorJob(
**data.dict(exclude_unset=True, exclude_none=True),
processor_name=processor_name,
internal_callback_url=f"/processor/result_callback/{data.job_id}",
state=StateEnum.queued
)
await job.insert()

job_output = None
if data.agent_type == 'worker':
job_output = await self.push_to_processing_queue(processor_name, data)
ocrd_tool = await self.get_processor_info(processor_name)
validate_job_input(self.log, processor_name, ocrd_tool, data)
processing_message = self.create_processing_message(job)
await self.push_to_processing_queue(processor_name, processing_message)
job_output = job.to_job_output()
if data.agent_type == 'server':
job_output = await self.push_to_processor_server(processor_name, data)
ocrd_tool, processor_server_url = self.query_ocrd_tool_json_from_server(processor_name)
validate_job_input(self.log, processor_name, ocrd_tool, data)
job_output = await self.push_to_processor_server(processor_name, processor_server_url, data)
if not job_output:
self.log.exception('Failed to create job output')
raise HTTPException(
@@ -285,10 +368,7 @@ async def push_processor_job(self, processor_name: str, data: PYJobInput) -> PYJ
return job_output

# TODO: Revisit and remove duplications between push_to_* methods
async def push_to_processing_queue(self, processor_name: str, job_input: PYJobInput) -> PYJobOutput:
ocrd_tool = await self.get_processor_info(processor_name)
validate_job_input(self.log, processor_name, ocrd_tool, job_input)
job_input = await validate_and_resolve_mets_path(self.log, job_input, resolve=False)
async def push_to_processing_queue(self, processor_name: str, processing_message: OcrdProcessingMessage):
if not self.rmq_publisher:
raise Exception('RMQPublisher is not connected')
deployed_processors = self.deployer.find_matching_processors(
@@ -299,14 +379,6 @@ async def push_to_processing_queue(self, processor_name: str, job_input: PYJobIn
if processor_name not in deployed_processors:
self.check_if_queue_exists(processor_name)

job = DBProcessorJob(
**job_input.dict(exclude_unset=True, exclude_none=True),
job_id=generate_id(),
processor_name=processor_name,
state=StateEnum.queued
)
await job.insert()
processing_message = self.create_processing_message(job)
encoded_processing_message = OcrdProcessingMessage.encode_yml(processing_message)
try:
self.rmq_publisher.publish_to_queue(processor_name, encoded_processing_message)
@@ -316,12 +388,8 @@ async def push_to_processing_queue(self, processor_name: str, job_input: PYJobIn
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f'RMQPublisher has failed: {error}'
)
return job.to_job_output()

async def push_to_processor_server(self, processor_name: str, job_input: PYJobInput) -> PYJobOutput:
ocrd_tool, processor_server_url = self.query_ocrd_tool_json_from_server(processor_name)
validate_job_input(self.log, processor_name, ocrd_tool, job_input)
job_input = await validate_and_resolve_mets_path(self.log, job_input, resolve=False)
async def push_to_processor_server(self, processor_name: str, processor_server_url: str, job_input: PYJobInput) -> PYJobOutput:
try:
json_data = json.dumps(job_input.dict(exclude_unset=True, exclude_none=True))
except Exception as e:
@@ -357,6 +425,10 @@ async def push_to_processor_server(self, processor_name: str, job_input: PYJobIn
async def get_processor_job(self, processor_name: str, job_id: str) -> PYJobOutput:
return await _get_processor_job(self.log, processor_name, job_id)

async def remove_from_request_cache(self, processor_name: str, job_id: str, ocrd_result: OcrdResultMessage):
# TODO: Implement, after the refactored code is working
pass

async def get_processor_info(self, processor_name) -> Dict:
""" Return a processor's ocrd-tool.json
"""