Skip to content

Commit

Permalink
Merge pull request #13 from knaw-huc/main
Browse files: browse the repository at this point in the history
mergeback
  • Loading branch information
rvankoert authored Jul 22, 2024
2 parents 714b58b + e9c4ef2 commit b8d5d26
Show file tree
Hide file tree
Showing 11 changed files with 23 additions and 41 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ The Loghi framework is designed to streamline the process of Handwritten Text Re

### Laypa: Layout Analysis and Segmentation

[Laypa][https://github.com/knaw-huc/laypa/] specializes in the segmentation of documents, identifying different regions like paragraphs, page numbers, and most importantly, baselines within the text. Utilizing a sophisticated architecture based on a ResNet backbone and a feature pyramid network, Laypa performs pixel-wise classifications to detect these elements. Built on the [detectron2](https://github.com/facebookresearch/detectron2) framework, its output facilitates further processing by converting the classifications into instances—either as masks or directly into PageXML format. This segmentation is crucial for preparing documents for OCR/HTR processing, ensuring that text regions are accurately recognized and extracted.
[Laypa](https://github.com/knaw-huc/laypa/) specializes in the segmentation of documents, identifying different regions like paragraphs, page numbers, and most importantly, baselines within the text. Utilizing a sophisticated architecture based on a ResNet backbone and a feature pyramid network, Laypa performs pixel-wise classifications to detect these elements. Built on the [detectron2](https://github.com/facebookresearch/detectron2) framework, its output facilitates further processing by converting the classifications into instances—either as masks or directly into PageXML format. This segmentation is crucial for preparing documents for OCR/HTR processing, ensuring that text regions are accurately recognized and extracted.

### Loghi Tooling: Pre and Post-Processing Toolkit

Expand Down
5 changes: 3 additions & 2 deletions docker/buildAll.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/bin/bash
VERSION=2.0.7
VERSION=2.1.1
set -e
set -o pipefail

CURRENT=$(pwd)

Expand Down Expand Up @@ -42,7 +43,7 @@ cd docker.base
cd ..
echo "building docker.loghi-tooling"
cd docker.loghi-tooling/
./buildImage.sh $BASE/prima-core-libs/ $BASE/loghi-tooling/
./buildImage.sh $BASE/prima-core-libs/ $BASE/loghi-tooling $VERSION
cd ..
echo "building docker.htr"
cd docker.htr
Expand Down
4 changes: 3 additions & 1 deletion docker/docker.base/buildAndInstallOpencv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ rm -rf $CURRENT/opencv_contrib

set -e

numcores=`nproc`

git clone https://github.com/opencv/opencv_contrib.git
git clone https://github.com/opencv/opencv.git
cd $CURRENT/opencv_contrib
Expand All @@ -19,7 +21,7 @@ cd $CURRENT/opencv/build
#cmake -D OPENCV_ENABLE_MEMALIGN=OFF -D CMAKE_BUILD_TYPE=RELEASE -D CMAKE_INSTALL_PREFIX=/usr/local -D INSTALL_C_EXAMPLES=ON -D OPENCV_IO_ENABLE_JASPER=ON -D OPENCV_EXTRA_MODULES_PATH=../../opencv_contrib/modules -D WITH_TBB=ON ..
#cmake -D CMAKE_BUILD_TYPE=RELEASE -D CMAKE_INSTALL_PREFIX=/usr/local -D INSTALL_C_EXAMPLES=ON -D OPENCV_IO_ENABLE_JASPER=ON -D OPENCV_EXTRA_MODULES_PATH=../../opencv_contrib/modules ..
cmake -D CMAKE_BUILD_TYPE=RELEASE -D CMAKE_INSTALL_PREFIX=/usr/local -D INSTALL_C_EXAMPLES=ON -D OPENCV_IO_ENABLE_JASPER=ON -D OPENCV_EXTRA_MODULES_PATH=../../opencv_contrib/modules -D WITH_TBB=ON ..
make -j 24
make -j $numcores
sudo make install
sudo sh -c 'echo "/usr/local/lib" > /etc/ld.so.conf.d/opencv.conf'
sudo ldconfig
Expand Down
36 changes: 7 additions & 29 deletions docker/docker.loghi-tooling/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,25 @@ FROM docker.base
EXPOSE 9006

RUN useradd -u 1000 rutger
#RUN useradd -m builder

RUN apt-get update && \
apt-get install -y locales && rm -rf /var/lib/apt
# apt-get install -y openjdk-11-jre maven postgresql-client locales

RUN locale-gen en_US.UTF-8
ARG LOGHI_VERSION=1.0-SNAPSHOT

ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8
ENV LOGHI_VERSION=${LOGHI_VERSION}

COPY loghi-tooling /src/loghi-tooling
COPY prima-core-libs /src/prima-core-libs
COPY dependency-check-data /root/.m2/repository/org/owasp/dependency-check-data

WORKDIR /src/prima-core-libs/java
#USER builder

RUN apt-get update && apt-get --no-install-recommends install -y openjdk-11-jre maven postgresql-client libdc1394-22 libavcodec58 libavformat58 libswscale5 libtbb2 git \
&& mvn deploy:deploy-file -Durl=file:///$HOME/repo -Dfile=/usr/local/share/java/opencv4/opencv-490.jar -DgroupId=org.opencv -DartifactId=opencv -Dpackaging=jar -Dversion=4.9.0 \
&& mvn clean package
Expand All @@ -29,38 +32,13 @@ RUN cd /src \
&& mvn deploy:deploy-file -Durl=file:///$HOME/repo -Dfile=target/langident-1.0.5-SNAPSHOT.jar -DgroupId=nl.knaw.huygens.pergamon.nlp -DartifactId=langident -Dpackaging=jar -Dversion=1.0.5

# && mvn clean package \
# && mvn org.owasp:dependency-check-maven:check versions:set -DnewVersion=$LOGHI_VERSION clean package \
RUN cd /src/loghi-tooling \
&& mvn clean org.owasp:dependency-check-maven:check package \
&& mvn versions:set -DnewVersion=$LOGHI_VERSION clean package \
&& find . -name src | xargs rm -rf \
&& find . -name test-classes | xargs rm -rf \
&& rm -rf /src/loghi-tooling/layoutanalyzer/target \
&& apt-get remove -y maven && apt autoremove -y \
&& rm -rf /var/lib/apt \
&& rm -rf /home/root \
&& rm -rf /root


#WORKDIR /src
#RUN wget https://dl.min.io/client/mc/release/linux-amd64/mcli_20221029100923.0.0_amd64.deb && dpkg -i mcli_20221029100923.0.0_amd64.deb && rm mcli_20221029100923.0.0_amd64.deb
#RUN mcli alias set myminio/ http://MINIO-SERVER MYUSER MYPASSWORD


#FROM docker.base
#RUN useradd -u 1000 rutger
#RUN apt-get update \
# && apt-get install -y --no-install-recommends openjdk-11-jre libtbb2 \
# && apt autoremove -y
#
#COPY --from=0 /src/loghi-tooling /src/loghi-tooling
#
#USER root

#USER rutger
# && apt remove -y git autotools-dev dpkg-dev icu-devtools libaec-dev libavutil-dev libblkid-dev libc-dev-bin libc6-dev libcrypt-dev libdatrie-dev \
# libegl-dev libexif-dev libexpat1-dev libffi-dev libfreetype6-dev libfribidi-dev libgcc-9-dev libgl-dev libglib2.0-dev-bin libglu1-mesa-dev libglx-dev \
# libgraphite2-dev libice-dev libicu-dev libjbig-dev libjpeg-turbo8-dev liblzma-dev libmount-dev libogg-dev libopenblas-pthread-dev libpcre2-dev libpcre3-dev \
# libpixman-1-dev libpthread-stubs0-dev libqt5opengl5-dev libraw1394-dev libselinux1-dev libsepol1-dev libsm-dev libstdc++-9-dev libswresample-dev libthai-dev \
# libvulkan-dev libx11-dev libxau-dev libxcb-render0-dev libxcb-shm0-dev libxcb1-dev libxcomposite-dev libxcursor-dev libxdamage-dev libxdmcp-dev libxext-dev \
# libxfixes-dev libxi-dev libxinerama-dev libxrandr-dev libxrender-dev libxt-dev linux-libc-dev manpages-dev qtbase5-dev qtbase5-dev-tools uuid-dev \
# x11proto-core-dev x11proto-dev x11proto-input-dev x11proto-randr-dev x11proto-xext-dev x11proto-xinerama-dev xtrans-dev zlib1g-dev libllvm10 libllvm12 \
# libx265-179 perl-modules-5.30 libperl5.30 humanity-icon-theme \
5 changes: 3 additions & 2 deletions docker/docker.loghi-tooling/buildImage.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@ set -e

if [ -z $1 ]; then echo "first parameter should be the path of prima-core-libs" && exit 1; fi;
if [ -z $2 ]; then echo "second parameter should be the path of loghi-tooling" && exit 1; fi;
if [ -z $3 ]; then echo "third parameter should be version which loghi-tooling will get" && exit 1; fi;

PRIMACORELIBS="$(realpath $1)"
LOGHITOOLING="$(realpath $2)"
LOGHI_VERSION=$3

echo "Change to directory of script..."
DIR_OF_SCRIPT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
Expand All @@ -30,8 +32,7 @@ rm -rf ./loghi-tooling/layoutanalyzer/src/test/resources/in/*.png

echo "Building docker image..."

docker build --no-cache . -t loghi/docker.loghi-tooling

docker build --no-cache -t loghi/docker.loghi-tooling --build-arg LOGHI_VERSION=$LOGHI_VERSION .
echo "cleaning up!"
rm -rf prima-core-libs
rm -rf loghi-tooling
Expand Down
2 changes: 1 addition & 1 deletion scripts/create-train-data.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
VERSION=2.0.7
VERSION=2.1.1

# User-configurable parameters
# Percentage split for training and validation sets
Expand Down
2 changes: 1 addition & 1 deletion scripts/htr-train-pipeline.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
VERSION=2.0.7
VERSION=2.1.1
set -e

# User-configurable parameters
Expand Down
2 changes: 1 addition & 1 deletion scripts/inference-pipeline.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
VERSION=2.0.7
VERSION=2.1.1
set -e

# User-configurable parameters
Expand Down

0 comments on commit b8d5d26

Please sign in to comment.