Skip to content

Commit

Permalink
feat: info endpoint (#39)
Browse files Browse the repository at this point in the history
* feat: added app info endpoint #35

* feat: added libreoffice fonts #33

* fix: Warning failed to launch javaldx - java may not function correctly

* refactoring: added query parameter check and renamed pdfa parameter to profile

* chore: updated documentation
  • Loading branch information
rueedlinger authored Jun 8, 2024
1 parent 6697fc7 commit 58b48f2
Show file tree
Hide file tree
Showing 19 changed files with 595 additions and 144 deletions.
86 changes: 71 additions & 15 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,40 +5,97 @@ ARG USERNAME=worker
ARG USER_UID=1000
ARG USER_GID=$USER_UID
ARG VERSION

# supported tesseract languages https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html
ARG TESSERACT_LANGUAGES="tesseract-ocr-deu tesseract-ocr-fra tesseract-ocr-ita tesseract-ocr-eng tesseract-ocr-por tesseract-ocr-spa"

# OCI image metadata: title, description, documentation link and source repository.
LABEL org.opencontainers.image.title="Teal" \
      org.opencontainers.image.description="A convenient REST API for working with PDF's." \
      org.opencontainers.image.documentation="https://teal.yax.ch/" \
      org.opencontainers.image.source="https://github.com/rueedlinger/teal"

ENV DEBIAN_FRONTEND=noninteractive
WORKDIR /usr/src/app

# supported tesseract languages https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html
############################
# install base packages
############################
# Creates the unprivileged runtime user, installs dumb-init, enables
# bookworm-backports and installs the OCR / PDF / LibreOffice toolchain.
# Everything runs in a single layer so the apt cleanup at the end actually
# keeps the package lists and temp files out of the image.
#  - dumb-init: the release asset is selected with `uname -m`, so the same
#    instruction works on x86_64 and on the other architectures dumb-init
#    publishes binaries for (e.g. aarch64).
#  - $TESSERACT_LANGUAGES is intentionally unquoted: it holds a
#    space-separated package list that must be word-split.
#  - libreoffice is taken from backports without recommended packages;
#    libreoffice-java-common is installed separately (presumably for the
#    "failed to launch javaldx" warning - TODO confirm).
RUN groupadd --gid $USER_GID $USERNAME && \
    useradd --uid $USER_UID --gid $USER_GID -m $USERNAME && \
    wget -O /usr/local/bin/dumb-init "https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_$(uname -m)" && \
    chmod +x /usr/local/bin/dumb-init && \
    echo "deb http://deb.debian.org/debian bookworm-backports main" >> /etc/apt/sources.list && \
    apt-get update && \
    apt-get install -y \
        tesseract-ocr \
        $TESSERACT_LANGUAGES \
        poppler-utils \
        ghostscript \
        python3-tk \
        libgl1 \
        ocrmypdf \
        default-jre-headless && \
    apt-get --no-install-recommends install -y -qq -t bookworm-backports libreoffice && \
    apt-get install -y -qq -t bookworm-backports libreoffice-java-common && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*


############################
# install verapdf
############################
COPY dist/auto-install.xml /tmp
# Downloads the veraPDF installer, runs it unattended with the
# /tmp/auto-install.xml answer file, and wipes /tmp in the same layer.
# The download URL is unversioned, so the name of the extracted
# "verapdf-greenfield-<version>" directory changes with every upstream
# release; the glob keeps the build working across releases.
# NOTE(review): for reproducible builds consider pinning a versioned
# installer URL instead - verify the exact URL scheme upstream.
RUN wget -O /tmp/verapdf-installer.zip https://software.verapdf.org/releases/verapdf-installer.zip && \
    unzip -d /tmp /tmp/verapdf-installer.zip && \
    /tmp/verapdf-greenfield-*/verapdf-install /tmp/auto-install.xml && \
    rm -rf /tmp/*

############################
# install fonts (inspired by gotenberg)
# see https://github.com/gotenberg/gotenberg/blob/main/build/Dockerfile
############################
# Fetches the ttf-mscorefonts-installer .deb directly and installs it together
# with the distribution font packages (no recommended packages). The apt lists
# and temp files are removed in the same layer.
RUN wget \
        -O /tmp/ttf-mscorefonts-installer_3.8.1_all.deb \
        http://httpredir.debian.org/debian/pool/contrib/m/msttcorefonts/ttf-mscorefonts-installer_3.8.1_all.deb \
    && apt-get update \
    && apt-get install -y -qq --no-install-recommends \
        /tmp/ttf-mscorefonts-installer_3.8.1_all.deb \
        # General font packages (sorted alphabetically).
        culmus \
        fonts-arphic-ukai \
        fonts-arphic-uming \
        fonts-beng \
        fonts-hosny-amiri \
        fonts-ipafont-gothic \
        fonts-ipafont-mincho \
        fonts-lklug-sinhala \
        fonts-lohit-guru \
        fonts-lohit-knda \
        fonts-samyak-gujr \
        fonts-samyak-mlym \
        fonts-samyak-taml \
        fonts-sarai \
        fonts-sil-abyssinica \
        fonts-sil-padauk \
        fonts-telu \
        fonts-thai-tlwg \
        fonts-unfonts-core \
        ttf-wqy-zenhei \
        # LibreOffice recommends.
        fonts-crosextra-caladea \
        fonts-crosextra-carlito \
        fonts-dejavu \
        fonts-dejavu-extra \
        fonts-liberation \
        fonts-liberation2 \
        fonts-linuxlibertine \
        fonts-noto-cjk \
        fonts-noto-core \
        fonts-noto-mono \
        fonts-noto-ui-core \
        fonts-sil-gentium \
        fonts-sil-gentium-basic \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

############################
# prepare app
############################
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

Expand All @@ -53,7 +110,6 @@ ENV PATH="${PATH}:/usr/local/verapdf"

USER $USERNAME
ENV TEAL_VERSION="$VERSION"
ENV TESSERACT_TESSDATA_PATH="/usr/share/tesseract-ocr/5/tessdata"
# Runs "/usr/local/bin/dumb-init -- /usr/src/app/run.sh"
ENTRYPOINT ["/usr/local/bin/dumb-init", "--"]
CMD ["/usr/src/app/run.sh"]
51 changes: 44 additions & 7 deletions dist/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

if [ "$TEAL_TEST_MODE" = true ] ; then
echo "env TEAL_TEST_MODE ist set to '$TEAL_TEST_MODE'"
echo "running in test mode"
echo "running in TEST MODE"

if [ -z ${TEAL_PYTEST_ARGS+x} ]; then
export COVERAGE_FILE=/tmp/coverage
Expand All @@ -11,11 +11,10 @@ if [ "$TEAL_TEST_MODE" = true ] ; then
pytest $TEAL_PYTEST_ARGS
fi

#pytest --no-header -v --disable-warnings --log-cli-level debug
echo "shutting container down..."
exit
else
echo "running in app mode"
echo "running in APP MODE"
fi


Expand All @@ -27,6 +26,7 @@ else
# With more than one worker, point PROMETHEUS_MULTIPROC_DIR at a directory
# under /tmp and make sure that directory exists.
if [ "$TEAL_WORKERS" -gt 1 ]; then
    export PROMETHEUS_MULTIPROC_DIR="/tmp/prometheus"
    echo "running in multi worker mode, creating PROMETHEUS_MULTIPROC_DIR $PROMETHEUS_MULTIPROC_DIR"
    echo "CPU and MEM metrics not available in multi worker mode (TEAL_WORKERS > 1)"
    # -p: do not fail when the directory already exists, e.g. when the
    # container is restarted and /tmp/prometheus is still there.
    mkdir -p "$PROMETHEUS_MULTIPROC_DIR"
fi
fi
Expand Down Expand Up @@ -54,17 +54,54 @@ else
fi

# Optionally start Locust in the background, pointed at the local API port.
# Enabled only when TEAL_START_LOCUST is exactly "true".
if [ "$TEAL_START_LOCUST" = true ]; then
    echo "env TEAL_START_LOCUST ist set to '$TEAL_START_LOCUST'"
    echo "starting in locust"

    # Use 8089 for the Locust web port when TEAL_LOCUST_PORT is not set at all.
    if [ -z "${TEAL_LOCUST_PORT+x}" ]; then
        TEAL_LOCUST_PORT=8089
        echo "TEAL_LOCUST_PORT is unset, will set to port $TEAL_LOCUST_PORT"
    else
        # Print the variable name literally and its value once; "$$" would
        # expand to the PID of this shell instead of the port.
        echo "env TEAL_LOCUST_PORT is set to '$TEAL_LOCUST_PORT'"
    fi

    # Launch exactly once, after the port has been resolved; a second launch
    # would try to bind the same web port again.
    locust --host "http://localhost:$TEAL_PORT" --web-port "$TEAL_LOCUST_PORT" -f tests/locustfile.py &
fi

# Collect tool locations, tool versions and host facts into TEAL_* variables
# and export them so processes started afterwards inherit them. Unless noted,
# each value is the first line the command prints on stdout.

# First directory named "tessdata" found below /usr/share.
TEAL_TESSERACT_TESSDATA_PATH=$(find /usr/share -type d -name tessdata | head -n 1)

TEAL_LIBREOFFICE_VERSION=$(libreoffice --version | head -n 1)

# NOTE(review): the doubled "_VERSION" suffix looks unintentional; the name is
# kept unchanged because readers of this variable are not visible here.
TEAL_OCRMYPDF_VERSION_VERSION=$(ocrmypdf --version | head -n 1)

TEAL_VERAPDF_VERSION=$(verapdf --version | head -n 1)

# poppler-utils: "2>&1 >/dev/null" sends stderr into the pipe and discards
# stdout, so the captured line is the first line written to stderr.
TEAL_PDFTOPPM_VERSION=$(pdftoppm -v 2>&1 >/dev/null | head -n 1)
TEAL_PDFTOCAIRO_VERSION=$(pdftocairo -v 2>&1 >/dev/null | head -n 1)

TEAL_TESSERACT_VERSION=$(tesseract --version | head -n 1)
TEAL_PYTHON_VERSION=$(python --version | head -n 1)
TEAL_JAVA_VERSION=$(java --version | head -n 1)

# Host facts: machine hardware name and the processor count reported by nproc.
TEAL_ARCH_NAME=$(uname -m | head -n 1)
TEAL_ARCH_NPROC=$(nproc)

export TEAL_TESSERACT_TESSDATA_PATH \
       TEAL_LIBREOFFICE_VERSION \
       TEAL_OCRMYPDF_VERSION_VERSION \
       TEAL_VERAPDF_VERSION \
       TEAL_PDFTOPPM_VERSION \
       TEAL_PDFTOCAIRO_VERSION \
       TEAL_TESSERACT_VERSION \
       TEAL_PYTHON_VERSION \
       TEAL_JAVA_VERSION \
       TEAL_ARCH_NAME \
       TEAL_ARCH_NPROC

echo "see API doc http://$TEAL_IP_BIND:$TEAL_PORT/docs"
gunicorn teal.api:app --workers "$TEAL_WORKERS" \
--worker-class uvicorn.workers.UvicornWorker --bind "$TEAL_IP_BIND":"$TEAL_PORT" \
Expand Down
Loading

0 comments on commit 58b48f2

Please sign in to comment.