Add Dockerfile. Update installation scripts to work without sudo if r…

…un as root
VikParuchuri · Dec 1, 2023 · d6235f8 · d6235f8
1 parent 8954b84
commit d6235f8
Show file tree

Hide file tree

Showing 11 changed files with 152 additions and 14 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,3 @@
+.DS_Store
+.git
+.vscode
diff --git a/.env.sample b/.env.sample
@@ -0,0 +1,27 @@
+# See marker/settings.py for more options
+# The following are the default values. Uncomment and change as needed.
+
+# Please note the order of precedence for settings:
+# 1. Environment variables
+# 2. local.env file
+# 3. Default values in marker/settings.py
+
+# See https://docs.pydantic.dev/latest/concepts/pydantic_settings/#dotenv-env-support
+
+# TESSDATA_PREFIX setting is set in the Dockerfile
+
+## General settings:
+
+# TORCH_DEVICE=cpu
+
+# How much VRAM each GPU has (in GB).
+# INFERENCE_RAM=12
+
+# How much VRAM to allocate per task (in GB).  Peak marker VRAM usage is around 3GB, but avg across workers is lower.
+# VRAM_PER_TASK=2.5
+
+# Enable debug logging
+# DEBUG=False
+
+# Default language we assume files to be in, should be one of the keys in TESSERACT_LANGUAGES
+# DEFAULT_LANG=English
diff --git a/.gitignore b/.gitignore
@@ -166,3 +166,4 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
+.vscode
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,34 @@
+ARG IMAGE_TAG=2.1.0-cuda11.8-cudnn8-runtime
+# Base image tag is overridable at build time: --build-arg IMAGE_TAG=<tag>
+FROM pytorch/pytorch:${IMAGE_TAG}
+# docker-compose.yml mounts the xdg_cache named volume at this path
+VOLUME /root/.cache
+
+WORKDIR /app
+# Copy only the install scripts first so later source edits do not invalidate the OS-package layer
+COPY ./scripts /app/scripts
+# ARG instead of ENV: apt stays non-interactive during the build without leaking into the runtime env
+ARG DEBIAN_FRONTEND=noninteractive
+RUN apt-get update \
+  && apt-get install -y apt-transport-https software-properties-common lsb-release \
+  && add-apt-repository -y ppa:alex-p/tesseract-ocr-devel \
+  && scripts/install/ghostscript_install.sh \
+  && apt-get install -y $(cat scripts/install/apt-requirements.txt) \
+  && rm -rf /var/lib/apt/lists/*
+# Copy only the dependency manifests so the Python dependency layer is rebuilt just when they change
+COPY ./pyproject.toml ./poetry.lock ./
+
+RUN pip install --no-cache-dir pip==23.3.1 \
+  && pip install --no-cache-dir poetry==1.5.0 \
+  && poetry config virtualenvs.create false \
+  && poetry install --no-dev --no-interaction --no-ansi --no-root
+
+COPY . .
+
+# Tesseract language-data directory; overridable at build time and exported to the runtime env
+ARG TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata
+ENV TESSDATA_PREFIX=${TESSDATA_PREFIX}
+
+# Fail the build early if TESSDATA_PREFIX does not point at an existing directory
+RUN test -d "${TESSDATA_PREFIX}"
+
diff --git a/README.md b/README.md
@@ -18,7 +18,7 @@ Marker is a pipeline of deep learning models:
 - Clean and format each block (heuristics, [nougat](https://huggingface.co/facebook/nougat-base))
 - Combine blocks and postprocess complete text (heuristics, [pdf_postprocessor](https://huggingface.co/vikp/pdf_postprocessor_t5))
 
-Relying on autoregressive forward passes to generate text is slow and prone to hallucination/repetition.  From the nougat paper: `We observed [repetition] in 1.5% of pages in the test set, but the frequency increases for out-of-domain documents.`  In my anecdotal testing, repetitions happen on 5%+ of out-of-domain (non-arXiv) pages.  
+Relying on autoregressive forward passes to generate text is slow and prone to hallucination/repetition.  From the nougat paper: `We observed [repetition] in 1.5% of pages in the test set, but the frequency increases for out-of-domain documents.`  In my anecdotal testing, repetitions happen on 5%+ of out-of-domain (non-arXiv) pages.
 
 Nougat is an amazing model, but I wanted a faster and more general purpose solution. Marker is 10x faster and has low hallucination risk because it only passes equation blocks through an LLM forward pass.
 
@@ -85,6 +85,40 @@ First, clone the repo:
   - `poetry install`
   - `poetry shell` to activate your poetry venv
 
+## Docker (nvidia)
+
+
+### Requirements
+
+* Linux Host (WSL might work. See [here](https://www.docker.com/blog/wsl-2-gpu-support-for-docker-desktop-on-nvidia-gpus/).)
+* [Docker](https://docs.docker.com/engine/install/)
+* [nvidia-container-toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
+
+
+### Convert single file
+
+See https://github.com/VikParuchuri/marker#convert-a-single-file
+
+#### docker compose
+```bash
+mkdir -p {input,output}
+# Copy source PDF files into ./input directory
+# wget -P input https://greenteapress.com/thinkpython/thinkpython.pdf
+docker compose run marker
+```
+
+#### docker run
+```bash
+mkdir -p {input,output}
+# wget -P input https://greenteapress.com/thinkpython/thinkpython.pdf
+
+# Copy source PDF files into ./input directory
+# Change shm-size to be the size of your VRAM if possible
+
+docker build -t marker .
+docker run --shm-size 12gb -v $PWD/input:/input -v $PWD/output:/output --gpus=all -it marker python convert_single.py /input/thinkpython.pdf /output/thinkpython.md --parallel_factor 2 --max_pages 10
+```
+
 # Usage
 
 First, some configuration:
@@ -191,7 +225,7 @@ Omit `--nougat` to exclude nougat from the benchmark.  I don't recommend running
 
 # Commercial usage
 
-Due to the licensing of the underlying models like layoutlmv3 and nougat, this is only suitable for noncommercial usage.  
+Due to the licensing of the underlying models like layoutlmv3 and nougat, this is only suitable for noncommercial usage.
 
 I'm building a version that can be used commercially, by stripping out the dependencies below. If you would like to get early access, email me at [email protected].
 

diff --git a/chunk_convert.sh b/chunk_convert.sh
diff --git a/data/latex_to_md.sh b/data/latex_to_md.sh
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -0,0 +1,26 @@
+services:
+  marker:
+    build: .  # build context is the directory containing this file
+    image: marker  # tag applied to the built image
+    # command: python convert.py /input /output --workers 10 --max 10 --min_length 10000
+    command: python convert_single.py /input/thinkpython.pdf /output/thinkpython.md --parallel_factor 2 --max_pages 10
+    shm_size: '12gb' # set this to the size of VRAM if possible
+    volumes:
+      - ./input:/input  # host directory holding the source PDFs read by the command above
+      - ./output:/output  # host directory the command above writes its markdown into
+      - xdg_cache:/root/.cache  # named volume (declared below) mounted over the path the Dockerfile marks as VOLUME
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+      - TORCH_DEVICE=cuda  # .env.sample lists cpu as the commented default
+      - INFERENCE_RAM=12  # VRAM per GPU in GB, per .env.sample
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all  # reserve every available GPU
+              capabilities: [gpu]
+
+volumes:
+  xdg_cache:  # backs /root/.cache in the marker service
diff --git a/scripts/install/ghostscript_install.sh b/scripts/install/ghostscript_install.sh
@@ -1,10 +1,18 @@
 #!/bin/bash
 
-wget https://github.com/ArtifexSoftware/ghostpdl-downloads/releases/download/gs10012/ghostscript-10.01.2.tar.gz
-tar -xvf ghostscript-10.01.2.tar.gz
-cd ghostscript-10.01.2
+set -euo pipefail  # abort on the first failed step instead of exiting 0 after a broken download/build
+GS_VERSION="10.01.2"  # the release tag in the URL below is this version without the dots (gs10012)
+wget https://github.com/ArtifexSoftware/ghostpdl-downloads/releases/download/gs${GS_VERSION//./}/ghostscript-$GS_VERSION.tar.gz
+tar -xvf ghostscript-$GS_VERSION.tar.gz
+cd ghostscript-$GS_VERSION
 ./configure
-sudo make install
+# Elevate with sudo only when not already root (the Docker build runs this script as root)
+if [[ $EUID -ne 0 ]]; then
+    sudo make install
+else
+    make install
+fi
+
 cd ..
-sudo rm -rf ghostscript-10.01.2
-rm ghostscript-10.01.2.tar.gz
+rm -rf ghostscript-$GS_VERSION
+rm -f ghostscript-$GS_VERSION.tar.gz
diff --git a/scripts/install/tesseract_5_install.sh b/scripts/install/tesseract_5_install.sh
@@ -1,9 +1,14 @@
 #!/bin/bash
 
-sudo apt-get install apt-transport-https
+# Re-run the whole script under sudo when not started as root, so the commands below need no sudo prefix
+if [[ $EUID -ne 0 ]]; then
+    exec sudo /bin/bash "$0" "$@"
+fi
+# Bootstrap: the repo's signing key ships in its own notesalexp-keyring package, so that package is fetched unauthenticated (-y aborts on that unless --allow-unauthenticated is given)
+apt-get install apt-transport-https lsb-release -y
 echo "deb https://notesalexp.org/tesseract-ocr5/$(lsb_release -cs)/ $(lsb_release -cs) main" \
-| sudo tee /etc/apt/sources.list.d/notesalexp.list > /dev/null
-sudo apt-get update -oAcquire::AllowInsecureRepositories=true
-sudo apt-get install notesalexp-keyring -oAcquire::AllowInsecureRepositories=true
-sudo apt-get update
-sudo apt-get install tesseract-ocr
+| tee /etc/apt/sources.list.d/notesalexp.list > /dev/null
+apt-get update -oAcquire::AllowInsecureRepositories=true
+apt-get install notesalexp-keyring -oAcquire::AllowInsecureRepositories=true --allow-unauthenticated -y
+apt-get update
+apt-get install tesseract-ocr -y
diff --git a/scripts/markdown_to_pdf.sh b/scripts/markdown_to_pdf.sh
-Original file line number
+Diff line change
@@ Expand Up / @@ -166,3 +166,4 @@ cython_debug/ @@
     #  and can be added to the global gitignore or merged into this file.  For a more nuclear
     #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
     .idea/
+    .vscode