Skip to content

Commit

Permalink
Multiprocessing improvement, gunicorn worker support.
Browse files Browse the repository at this point in the history
  • Loading branch information
vladd-bit committed May 9, 2023
1 parent f13c7fa commit ed51967
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 18 deletions.
21 changes: 15 additions & 6 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,26 @@
# Any additional custom configuration flags that are not available via the tesseract function. For example: config='--psm 6'
TESSERACT_CUSTOM_CONFIG_FLAGS = os.environ.get("OCR_SERVICE_TESSERACT_CUSTOM_CONFIG_FLAGS", "")

# controls both threads and cpus for one WEB SERVICE thread, basically, one request handler.
# this is derived normally from the amount of threads Gunicorn is running with, for example:
# Controls both threads and cpus for one WEB SERVICE thread, basically, one request handler.
# This is derived normally from the amount of threads Gunicorn is running with, for example:
# - if we have OCR_SERVICE_THREADS = 4, the OCR service can handle at most 4 requests at the same time,
# this means that you can not use all of your CPUS for OCR-ing for 1 request,
# because that means the other requests are sitting idle while the first one uses all resources,
# and so it is recommended to regulate the number of threads per request
OCR_WEB_SERVICE_THREADS = int(os.environ.get("OCR_WEB_SERVICE_THREADS", multiprocessing.cpu_count()))
OCR_WEB_SERVICE_THREADS = int(os.environ.get("OCR_WEB_SERVICE_THREADS", 1))

# This controls the number of workers the OCR service may have. It is recommended to use this value
# instead of OCR_WEB_SERVICE_THREADS if you want to process multiple requests in parallel.
# WARNING: using more than 1 worker assumes you have set the OCR_WEB_SERVICE_WORKER_CLASS setting to sync!
# Given the above, setting OCR_WEB_SERVICE_WORKER_CLASS to sync means that a worker will use 1 THREAD only,
# therefore OCR_WEB_SERVICE_THREADS is disregarded.
OCR_WEB_SERVICE_WORKERS = int(os.environ.get("OCR_WEB_SERVICE_WORKERS", 1))

# set this to control the number of threads used for OCR-ing per web request thread (check OCR_WEB_SERVICE_THREADS)
CPU_THREADS = int(os.environ.get("OCR_SERVICE_CPU_THREADS", (multiprocessing.cpu_count() / OCR_WEB_SERVICE_THREADS)))
CPU_THREADS = int(os.environ.get("OCR_SERVICE_CPU_THREADS", (multiprocessing.cpu_count() / OCR_WEB_SERVICE_WORKERS)))

# conversion thread number for the pdf -> PIL img conversion, per web request thread (check OCR_WEB_SERVICE_THREADS)
CONVERTER_THREAD_NUM = int(os.environ.get("OCR_SERVICE_CONVERTER_THREADS", (multiprocessing.cpu_count() / OCR_WEB_SERVICE_THREADS)))
CONVERTER_THREAD_NUM = int(os.environ.get("OCR_SERVICE_CONVERTER_THREADS", (multiprocessing.cpu_count() / OCR_WEB_SERVICE_WORKERS)))

# should we convert detected images to greyscale before OCR-ing
OCR_CONVERT_GRAYSCALE_IMAGES = True
Expand All @@ -62,10 +69,12 @@
# a libre office server will only use 1 CPU by default (not changeable), thus,
# for handling multiple requests, we will have one service per OCR_WEB_SERVICE_THREAD
DEFAULT_LIBRE_OFFICE_SERVER_PORT = 9900
LIBRE_OFFICE_LISTENER_PORT_RANGE = range(DEFAULT_LIBRE_OFFICE_SERVER_PORT, DEFAULT_LIBRE_OFFICE_SERVER_PORT + OCR_WEB_SERVICE_THREADS)
LIBRE_OFFICE_LISTENER_PORT_RANGE = range(DEFAULT_LIBRE_OFFICE_SERVER_PORT, DEFAULT_LIBRE_OFFICE_SERVER_PORT + OCR_WEB_SERVICE_THREADS + OCR_WEB_SERVICE_WORKERS)

LIBRE_OFFICE_NETWORK_INTERFACE = "localhost"


# seconds to check for possible failure of port
LIBRE_OFFICE_PROCESSES_LISTENER_INTERVAL = 10


Expand Down
23 changes: 17 additions & 6 deletions ocr_service/app/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from config import *
from ocr_service.api import api
from ocr_service.processor.processor import Processor
from ocr_service.utils.utils import get_process_id_by_process_name
from ocr_service.utils.utils import is_port_in_use

sys.path.append("..")

Expand Down Expand Up @@ -48,12 +48,23 @@ def start_office_converter_servers():

port_count = 0
for port_num in LIBRE_OFFICE_LISTENER_PORT_RANGE:
if port_count < OCR_WEB_SERVICE_THREADS:
port_count += 1
if port_num not in list(loffice_processes.keys()):
loffice_processes[port_num] = start_office_server(port_num)
if OCR_WEB_SERVICE_WORKERS <= 1:
if port_count < OCR_WEB_SERVICE_THREADS:
port_count += 1
if port_num not in list(loffice_processes.keys()):
if is_port_in_use(port_num) == False:
loffice_processes[port_num] = start_office_server(port_num)
else:
break
else:
break
print("WOREKER TRYING PORT " + str(port_num))
if is_port_in_use(port_num) == False and port_count < OCR_WEB_SERVICE_WORKERS - 1:
loffice_processes[port_num] = start_office_server(port_num)
port_count += 1
else:
break


return loffice_processes

def create_app():
Expand Down
6 changes: 6 additions & 0 deletions ocr_service/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import sys
import psutil
import logging
import socket

from sys import platform
from typing import List
Expand Down Expand Up @@ -95,3 +96,8 @@ def get_process_id_by_process_name(process_name: str = "") -> int:
break

return pid

def is_port_in_use(port: int, host: str = "localhost", timeout: float = 1.0) -> bool:
    """Report whether something is accepting TCP connections on ``host:port``.

    A short-lived IPv4 TCP socket attempts to connect to the address; the
    port is considered "in use" only when that connection succeeds.

    Args:
        port: TCP port number to probe.
        host: Host name or IPv4 address to probe. Defaults to ``"localhost"``,
            which matches the previous hard-coded behaviour.
        timeout: Maximum number of seconds to wait for the connection attempt.

    Returns:
        True if the connection attempt succeeded, False otherwise.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        # Bound the probe so an unresponsive address cannot stall the caller;
        # an attempt that does not complete in time yields a non-zero result
        # and is therefore reported as "not in use".
        probe.settimeout(timeout)
        # connect_ex returns 0 on success and an error indicator otherwise,
        # instead of raising for C-level connect() failures.
        return probe.connect_ex((host, port)) == 0

12 changes: 6 additions & 6 deletions start_service_production.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ if [ -z ${OCR_SERVICE_PORT+x} ]; then
echo "OCR_SERVICE_PORT is unset -- setting to default: $OCR_SERVICE_PORT"
fi

if [ -z ${OCR_SERVICE_WORKERS+x} ]; then
OCR_SERVICE_WORKERS=1
echo "OCR_SERVICE_WORKERS is unset -- setting to default: $OCR_SERVICE_WORKERS"
if [ -z ${OCR_WEB_SERVICE_WORKERS+x} ]; then
OCR_WEB_SERVICE_WORKERS=4
echo "OCR_WEB_SERVICE_WORKERS is unset -- setting to default: $OCR_WEB_SERVICE_WORKERS"
fi

if [ -z ${OCR_WEB_SERVICE_THREADS+x} ]; then
OCR_WEB_SERVICE_THREADS=4
OCR_WEB_SERVICE_THREADS=1
echo "OCR_WEB_SERVICE_THREADS is unset -- setting to default: $OCR_WEB_SERVICE_THREADS"
fi

Expand All @@ -33,7 +33,7 @@ if [ -z ${OCR_SERVICE_LOG_LEVEL+x} ]; then
fi

if [ -z ${OCR_WEB_SERVICE_WORKER_CLASS+x} ]; then
OCR_WEB_SERVICE_WORKER_CLASS="gthread"
OCR_WEB_SERVICE_WORKER_CLASS="sync"
echo "OCR_WEB_SERVICE_WORKER_CLASS is unset -- setting to default: $OCR_WEB_SERVICE_WORKER_CLASS"
fi

Expand All @@ -44,6 +44,6 @@ OCR_SERVICE_ACCESS_LOG_FORMAT="%(t)s [ACCESSS] %(h)s \"%(r)s\" %(s)s \"%(f)s\" \
# start the OCR_SERVICE
#
echo "Starting up Flask app using gunicorn OCR_SERVICE ..."
python3.11 -m gunicorn --bind $OCR_SERVICE_HOST:$OCR_SERVICE_PORT -w $OCR_SERVICE_WORKERS --threads=$OCR_WEB_SERVICE_THREADS --timeout=$OCR_SERVICE_WORKER_TIMEOUT \
python3.11 -m gunicorn --bind $OCR_SERVICE_HOST:$OCR_SERVICE_PORT -w $OCR_WEB_SERVICE_WORKERS --threads=$OCR_WEB_SERVICE_THREADS --timeout=$OCR_SERVICE_WORKER_TIMEOUT \
--access-logformat="$OCR_SERVICE_ACCESS_LOG_FORMAT" --access-logfile=./ocr_service.log --log-file=./ocr_service.log --log-level error --worker-class=$OCR_WEB_SERVICE_WORKER_CLASS \
wsgi

0 comments on commit ed51967

Please sign in to comment.