Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft for slim containers #386

Closed
wants to merge 19 commits into from
Closed
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions Makefile-slim
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Interpreter used to create the virtual environment (exported to recipes and sub-makes).
export PYTHON ?= python3

# Virtual environment that receives OCR-D core and the delegator scripts.
VIRTUAL_ENV = $(CURDIR)/venv2
BIN = $(VIRTUAL_ENV)/bin
ACTIVATE_VENV = $(BIN)/activate

# Every word in OCRD_MODULES is the name of a variable listing that module's processors.
OCRD_MODULES = OCRD_CIS OCRD_TESSEROCR
OCRD_CIS = ocrd-cis-ocropy-binarize ocrd-cis-ocropy-dewarp
OCRD_TESSEROCR = ocrd-tesserocr-recognize ocrd-tesserocr-segment-region

# All processors of all modules, and the delegator script path of each one.
PROCESSORS = $(foreach mod,$(OCRD_MODULES),$($(mod)))
DELEGATORS = $(addprefix $(BIN)/,$(PROCESSORS))

# Port written to .env as OCRD_PS_PORT.
OCRD_PS_PORT = 8000

# Entry point: builds the compose file, the .env file and one delegator script
# per processor. The virtual environment is order-only, so its timestamp never
# forces a rebuild. The target names no file, hence it is declared phony.
.PHONY: slim-venv
slim-venv: docker-compose.yaml .env $(DELEGATORS) | $(VIRTUAL_ENV)


# Create a delegator to the processing server for the processor: the delegator
# template is instantiated with the absolute path of the .env file.
# - The template is a real prerequisite, so delegators are rebuilt when it changes.
# - [[:space:]] is the POSIX spelling of the GNU-only \s escape.
# - The script is assembled in a temporary file and moved into place, so a failed
#   sed never leaves a truncated delegator that looks up to date.
$(BIN)/ocrd-%: slim-containers-files/delegator_template.py | $(VIRTUAL_ENV)
	@sed -e "s|{{[[:space:]]*OCRD_SLIM_ENV_PATH[[:space:]]*}}|$(CURDIR)/.env|" $< > $@.tmp
	@chmod u+x $@.tmp
	@mv -f $@.tmp $@


# Install OCR-D core (sub-make in ./core) with the virtual environment activated.
$(VIRTUAL_ENV): $(ACTIVATE_VENV)
	. $< && $(MAKE) -C core install

# Create a virtual environment in the directory matched by the stem, then
# upgrade the packaging tools inside it.
%/bin/activate:
	$(PYTHON) -m venv $*
	. $@ && pip install --upgrade pip setuptools wheel

# append the service to docker-compose for a processor
#   $1: processor name, substituted for the processor_name placeholder
#   $2: module variable name (e.g. OCRD_CIS), substituted for the
#       processor_group_name placeholder; \L is a GNU sed escape that
#       lower-cases the replacement text
# The expansion ends in ";" so that several calls can be concatenated into
# one shell command line by the caller.
add_proc = sed -e "s/{{\s*processor_name\s*}}/$1/" -e "s/{{\s*processor_group_name\s*}}/\L$2/" \
slim-containers-files/docker-compose.processor.template.yaml >> docker-compose.yaml;
joschrew marked this conversation as resolved.
Show resolved Hide resolved

# Assemble docker-compose.yaml: copy the base template, then append one service
# per processor of every module via add_proc. Both templates are prerequisites,
# so the file is regenerated whenever either of them changes.
docker-compose.yaml: slim-containers-files/docker-compose.template.yaml slim-containers-files/docker-compose.processor.template.yaml
	@cat $< > $@
	@$(foreach mod,$(OCRD_MODULES),$(foreach proc,$($(mod)),$(call add_proc,$(proc),$(mod))))

# Write the environment file shared by docker-compose and the delegators.
# All lines go through one truncating redirect, so a forced rebuild (make -B)
# replaces the file instead of appending duplicate entries.
# "$$" yields a literal "$": the ${...} references in the URLs are written
# to the file unexpanded.
.env:
	@{ \
	  echo OCRD_PS_PORT=$(OCRD_PS_PORT); \
	  echo OCRD_PS_MTU=1300; \
	  echo MONGODB_USER=admin; \
	  echo MONGODB_PASS=admin; \
	  echo 'MONGODB_URL=mongodb://$${MONGODB_USER}:$${MONGODB_PASS}@ocrd-mongodb:27017'; \
	  echo RABBITMQ_USER=admin; \
	  echo RABBITMQ_PASS=admin; \
	  echo 'RABBITMQ_URL=amqp://$${RABBITMQ_USER}:$${RABBITMQ_PASS}@ocrd-rabbitmq:5672'; \
	} > $@

58 changes: 58 additions & 0 deletions slim-containers-files/delegator_template.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/env python
import sys
import subprocess
from http.server import BaseHTTPRequestHandler, HTTPServer
from os.path import basename
import re
from pathlib import Path

# Path of the .env file. The placeholder is replaced with the real path by the
# Makefile's sed call when the delegator script is generated.
env_path = "{{ OCRD_SLIM_ENV_PATH }}"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe use https://github.com/theskumar/python-dotenv for that, so you don't have to parse sh variable assignments?

# Look up the port of the processing server in the .env file.
try:
    env_text = Path(env_path).read_text()
except OSError as err:
    # Still a ValueError, as before, but the real cause is named and chained
    # instead of being masked by a bare except.
    raise ValueError(f"Cannot read .env file '{env_path}'") from err

port_match = re.search(r"OCRD_PS_PORT=(?P<port>[0-9]+)", env_text)
if port_match is None:
    raise ValueError("Variable OCRD_PS_PORT not found in .env")
port = port_match.group('port')

processing_server_address = f"http://localhost:{port}"
# The delegator is installed under the processor's name, so the script name
# identifies the processor to run.
processor_name = basename(sys.argv[0])

# Set to True by CallbackReceiver.do_POST once the callback has arrived.
STOP_WAITING_SERVER = False


class CallbackReceiver(BaseHTTPRequestHandler):
    """Minimal HTTP handler that receives the processor's completion callback.

    The OCR-D processor is started with a callback-url pointing at the server
    that uses this handler. The call that starts the processor returns at once
    while the processor keeps working in the background; when it is done it
    sends a request to the callback-url. This handler answers that request,
    prints what was sent and signals the waiting loop to stop.
    """
    def do_POST(self):
        """Acknowledge the callback, print its body and set the stop flag."""
        global STOP_WAITING_SERVER
        reply = "finished".encode("utf-8")
        self.send_response(200)
        self.send_header("Content-Type", "text/plain")
        self.end_headers()
        self.wfile.write(reply)
        # A missing Content-Length header is treated as an empty body.
        body_size = int(self.headers.get("Content-Length", 0))
        payload = self.rfile.read(body_size).decode("utf-8")
        # TODO: how should the callback-content be handled/printed
        print(f"Processor finished: {payload}")
        STOP_WAITING_SERVER = True


def call_processor_and_wait():
    """Submit the processor run to the processing server and wait for its callback.

    A local HTTP server is opened on a free port, the run is submitted through
    the ``ocrd network client`` command line with that server as callback URL,
    and requests are handled until the callback handler sets
    STOP_WAITING_SERVER. All arguments given to this script are passed on
    unchanged.

    Exits with the client's exit status if the client call fails, instead of
    waiting for a callback indefinitely.
    """
    # Port 0 lets the OS pick a free port. Used as a context manager so the
    # listening socket is closed on every exit path.
    with HTTPServer(("0.0.0.0", 0), CallbackReceiver) as server:
        # NOTE(review): 172.17.0.1 is presumably the docker bridge gateway through
        # which containers reach this host; confirm for non-default networks.
        callback_url = f"http://172.17.0.1:{server.server_address[1]}"
        cmd = [
            "ocrd", "network", "client", "processing", "processor",
            processor_name, "--address", processing_server_address,
            "--callback-url", callback_url
        ]
        result = subprocess.run(cmd + sys.argv[1:])
        if result.returncode != 0:
            # The request was presumably not accepted, so no callback is expected.
            sys.exit(result.returncode)
        while not STOP_WAITING_SERVER:
            server.handle_request()


if __name__ == "__main__":
    call_processor_and_wait()
14 changes: 14 additions & 0 deletions slim-containers-files/docker-compose.processor.template.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@

# Service entry appended to docker-compose.yaml once per processor. The
# placeholders in double curly braces are filled in by sed from the Makefile.
{{ processor_name }}:
# The base definition of the service lives in the compose file of the processor's module.
extends:
file: slim-containers-files/{{ processor_group_name}}/docker-compose.yaml
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So IIUC in the final setup, when we have correct Dockerfiles and compose files in all modules, this will simply become {{ module_name }}/docker-compose.yaml?

service: {{ processor_name }}
# MONGODB_URL and RABBITMQ_URL are defined in the .env file written by the Makefile.
command: ocrd network processing-worker --database $MONGODB_URL --queue $RABBITMQ_URL --create-queue {{ processor_name }}
# depends_on only orders container start-up; it does not wait until the
# services are ready to accept connections.
depends_on:
- ocrd-processing-server
- ocrd-mongodb
- ocrd-rabbitmq
# restart: The worker creates its queue but rabbitmq needs a few seconds to be available
Comment on lines +7 to +11
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If timing is an issue, I suggest to change the dependency type:

Suggested change
depends_on:
- ocrd-processing-server
- ocrd-mongodb
- ocrd-rabbitmq
# restart: The worker creates its queue but rabbitmq needs a few seconds to be available
depends_on:
- ocrd-processing-server
ocrd-mongodb:
condition: service_started
ocrd-rabbitmq:
condition: service_started

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The underlying problem is the following: the container for the queue is started and running, but it needs 1-3 seconds before queue creation is possible, whereas the processing worker tries to create its queue right away.

This suggestion (service_started) does not work: docker-compose considers the queue service started as soon as its container is running, but in reality it is not ready to be used right away. I have a similar issue in another project and have already tried a few things to solve it. I think the only solution to this problem on the Docker side would be to implement a (manual) health check for the rabbitmq container. But for that I would have to extend the rabbitmq image, which I do not want to do.

For this PR to function, some extension to core is needed anyway. There I want to add an optional queue-creation timeout to the worker startup, so that it waits a few seconds before adding its queue or retries a few times. But this restart fix was the fastest way to achieve that, which is why it is here, and I agree that it should eventually be removed. I will mark this as resolved as soon as the needed changes to core are made (which require one change to this PR as well).

# Restart the worker when it exits with an error, at most 3 times.
restart: on-failure:3
# Bind-mount $PWD/data at /data inside the container.
volumes:
- "$PWD/data:/data"
49 changes: 49 additions & 0 deletions slim-containers-files/docker-compose.template.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Default network of the compose project: a bridge network whose MTU is taken
# from OCRD_PS_MTU (the Makefile writes this variable to .env).
networks:
default:
driver: bridge
driver_opts:
com.docker.network.driver.mtu: ${OCRD_PS_MTU}

services:
# Processing server, built from the local core checkout. On start-up it writes
# its own configuration file and then runs the server on port 8000.
ocrd-processing-server:
build:
context: core
args:
BASE_IMAGE: ubuntu:20.04
# Container port 8000 is published on the host port given by OCRD_PS_PORT.
ports:
- ${OCRD_PS_PORT}:8000
# Credentials fall back to "admin" when the variables are unset.
environment:
MONGODB_USER: ${MONGODB_USER:-admin}
MONGODB_PASS: ${MONGODB_PASS:-admin}
RABBITMQ_USER: ${RABBITMQ_USER:-admin}
RABBITMQ_PASS: ${RABBITMQ_PASS:-admin}
# NOTE(review): the credentials interpolated into the command below have no
# ":-admin" fallback, unlike the environment entries above. Confirm that the
# variables are always set (the Makefile's .env target does set them).
command: |
/bin/bash -c "echo -e \"
process_queue:
address: ocrd-rabbitmq
port: 5672
skip_deployment: true
credentials:
username: ${RABBITMQ_USER}
password: ${RABBITMQ_PASS}
database:
address: ocrd-mongodb
port: 27017
skip_deployment: true
credentials:
username: ${MONGODB_USER}
password: ${MONGODB_PASS}
hosts: []\" > /ocrd-processing-server-config.yaml && \
ocrd network processing-server -a 0.0.0.0:8000 /ocrd-processing-server-config.yaml"

# MongoDB instance addressed as "ocrd-mongodb" by the processing server config
# and by MONGODB_URL. Root credentials fall back to admin/admin when unset.
# NOTE(review): the image tag is "latest", i.e. not pinned to a version.
ocrd-mongodb:
image: mongo:latest
environment:
MONGO_INITDB_ROOT_USERNAME: ${MONGODB_USER:-admin}
MONGO_INITDB_ROOT_PASSWORD: ${MONGODB_PASS:-admin}

# RabbitMQ instance addressed as "ocrd-rabbitmq" by the processing server config
# and by RABBITMQ_URL. Default credentials fall back to admin/admin when unset.
ocrd-rabbitmq:
image: rabbitmq:3-management
environment:
RABBITMQ_DEFAULT_USER: ${RABBITMQ_USER:-admin}
RABBITMQ_DEFAULT_PASS: ${RABBITMQ_PASS:-admin}
15 changes: 15 additions & 0 deletions slim-containers-files/ocrd_cis/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
FROM ocrd/core:latest AS base
WORKDIR /build-ocrd

# Temporary: install core from the feature branch while its changes are not
# part of core yet. Remove this RUN afterwards.
RUN git clone https://github.com/ocr-d/core.git \
 && cd core \
 && git checkout network-for-slim-prep \
 && make install

# ocrd_cis is installed with pip, the same way ocrd_all does it, rather than
# following the (seemingly outdated) Dockerfile that ships with ocrd_cis.
COPY ocrd_cis/ ./ocrd_cis/
COPY setup.py README.md LICENSE ocrd-tool.json Manifest.in ./
RUN pip install . \
 && rm -rf /build-ocrd
# TODO: install models for ocrd-cis
WORKDIR /data
14 changes: 14 additions & 0 deletions slim-containers-files/ocrd_cis/docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Worker services of the ocrd_cis module. Both are built from the same
# Dockerfile, with the ocrd_cis directory as build context.
services:
  ocrd-cis-ocropy-binarize:
    build:
      context: ../../ocrd_cis
      dockerfile: ../slim-containers-files/ocrd_cis/Dockerfile
    command: ocrd network processing-worker ocrd-cis-ocropy-binarize --database $MONGODB_URL --queue $RABBITMQ_URL --create-queue

  ocrd-cis-ocropy-dewarp:
    build:
      context: ../../ocrd_cis
      dockerfile: ../slim-containers-files/ocrd_cis/Dockerfile
    command: ocrd network processing-worker ocrd-cis-ocropy-dewarp --database $MONGODB_URL --queue $RABBITMQ_URL --create-queue
44 changes: 44 additions & 0 deletions slim-containers-files/ocrd_tesserocr/Dockerfile
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this necessary at all? ocrd_tesserocr already contains a suitable Dockerfile...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's just for proof-of-concept, so @joschrew does not need to keep multiple PR in sync.

Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
FROM ocrd/core:latest AS base
WORKDIR /build-ocrd-core
# Remove the next RUN, this is only to checkout my branch while the changes are not in core yet
RUN git clone https://github.com/ocr-d/core.git && \
    cd core && \
    git checkout network-for-slim-prep && \
    make install

# copied from https://github.com/OCR-D/ocrd_tesserocr/blob/master/Dockerfile and modified
ARG VCS_REF
ARG BUILD_DATE
LABEL \
    maintainer="https://ocr-d.de/kontakt" \
    org.label-schema.vcs-ref=$VCS_REF \
    org.label-schema.vcs-url="https://github.com/OCR-D/ocrd_tesserocr" \
    org.label-schema.build-date=$BUILD_DATE

# ENV uses the key=value form; the whitespace-separated legacy form is discouraged.
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONIOENCODING=utf8

# avoid HOME/.local/share (hard to predict USER here)
# so let XDG_DATA_HOME coincide with fixed system location
# (can still be overridden by derived stages)
ENV XDG_DATA_HOME=/usr/local/share

WORKDIR /build-ocrd
COPY setup.py .
COPY ocrd_tesserocr/ocrd-tool.json .
COPY README.md .
COPY requirements.txt .
COPY requirements_test.txt .
COPY ocrd_tesserocr ./ocrd_tesserocr
COPY Makefile .
# Build and install in a single layer, then drop the sources and the
# build-only packages again.
RUN make deps-ubuntu && \
    apt-get install -y --no-install-recommends \
    g++ \
    && make deps install \
    && rm -rf /build-ocrd \
    && apt-get -y remove --auto-remove g++ libtesseract-dev make
# Models fetched into the image for ocrd-tesserocr-recognize.
RUN ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata
RUN ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata

WORKDIR /data
VOLUME /data
14 changes: 14 additions & 0 deletions slim-containers-files/ocrd_tesserocr/docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Worker services of the ocrd_tesserocr module.
services:
  ocrd-tesserocr-recognize:
    build:
      context: ../../ocrd_tesserocr
      dockerfile: ../slim-containers-files/ocrd_tesserocr/Dockerfile
    # The processor name must match the service name; it was previously
    # misspelled as "ocrd-tesseroc-recognize".
    command:
      ocrd network processing-worker ocrd-tesserocr-recognize --database $MONGODB_URL --queue $RABBITMQ_URL --create-queue
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
ocrd network processing-worker ocrd-tesseroc-recognize --database $MONGODB_URL --queue $RABBITMQ_URL --create-queue
ocrd network processing-worker ocrd-tesserocr-recognize --database $MONGODB_URL --queue $RABBITMQ_URL --create-queue


# Worker for ocrd-tesserocr-segment-region, built from the same Dockerfile and
# build context as the recognize worker.
ocrd-tesserocr-segment-region:
build:
context: ../../ocrd_tesserocr
dockerfile: ../slim-containers-files/ocrd_tesserocr/Dockerfile
command:
ocrd network processing-worker ocrd-tesserocr-segment-region --database $MONGODB_URL --queue $RABBITMQ_URL --create-queue
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should now use the ocrd-tesserocr-segment-region worker syntax instead to get instance caching, right?