diff --git a/.github/workflows/build-push-cpu.yml b/.github/workflows/build-push-cpu.yml new file mode 100644 index 00000000..628cc05a --- /dev/null +++ b/.github/workflows/build-push-cpu.yml @@ -0,0 +1,51 @@ +# +name: Create and publish CPU Docker image + +# Configures this workflow to run every time a change is pushed to the branch called `release`. +on: + push: + branches: ['release'] + workflow_dispatch: +# Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds. +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + +# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu. +jobs: + build-and-push-image: + runs-on: ubuntu-latest + # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job. + permissions: + contents: read + packages: write + # + steps: + - name: Delete huge unnecessary tools folder + run: rm -rf /opt/hostedtoolcache + - name: Checkout repository + uses: actions/checkout@v4 + # Uses the `docker/login-action` action to log in to the Container registry registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here. + - name: Log in to the Container registry + uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + # This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels. 
+ - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages. + # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository. + # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step. + - name: Build and push cpu.Docker image + uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 + with: + context: . + file: ./cpu.Dockerfile + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} \ No newline at end of file diff --git a/.github/workflows/build-push-gpu.yml b/.github/workflows/build-push-gpu.yml new file mode 100644 index 00000000..c98cfb8e --- /dev/null +++ b/.github/workflows/build-push-gpu.yml @@ -0,0 +1,51 @@ +# +name: Create and publish GPU Docker image + +# Configures this workflow to run every time a change is pushed to the branch called `release`. +on: + push: + branches: ['release'] + workflow_dispatch: +# Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds. +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + +# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu. +jobs: + build-and-push-image: + runs-on: ubuntu-latest + # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job. 
+ permissions: + contents: read + packages: write + # + steps: + - name: Delete huge unnecessary tools folder + run: rm -rf /opt/hostedtoolcache + - name: Checkout repository + uses: actions/checkout@v4 + # Uses the `docker/login-action` action to log in to the Container registry registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here. + - name: Log in to the Container registry + uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + # This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels. + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages. + # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository. + # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step. + - name: Build and push gpu.Docker image + uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 + with: + context: . 
+      file: ./gpu.Dockerfile +      push: true +      tags: ${{ steps.meta.outputs.tags }} +      labels: ${{ steps.meta.outputs.labels }} \ No newline at end of file diff --git a/README.md b/README.md index 67456b45..9b06f497 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,48 @@ The above results are with marker and nougat setup so they each take ~3GB of VRA  See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks.  +# Quickstart with Docker + +The easiest way to get started with Marker is to build the Docker images. There are two options available: + +```bash +./build-docker-containers.sh --build +``` +## Prerequisites + +- Docker installed on your system. + + +## Running Marker with Docker + +### Convert a single file + +To convert a single PDF file to Markdown using Marker with Docker, run the following command: + +```bash +docker run -v /path/to/input:/input -v /path/to/output:/output -v /path/to/cache:/app/.cache speeddemonau/marker-gpu single /input/file.pdf /output/file.md [--parallel_factor N] [--max_pages N] +``` + +- Replace `/path/to/input` with the path to the directory containing your input PDF file. +- Replace `/path/to/output` with the path to the directory where you want the output Markdown file to be saved. +- Replace `/path/to/cache` with the path to a directory for caching. +- Adjust the `--parallel_factor` and `--max_pages` options as needed (see [Convert a single file](#convert-a-single-file) section for details). + +### Convert multiple files + +To convert multiple PDF files to Markdown using Marker with Docker, run the following command: + +```bash +docker run -v /path/to/input:/input -v /path/to/output:/output -v /path/to/cache:/app/.cache speeddemonau/marker-gpu multi /input /output [--workers N] [--max N] [--metadata_file FILE] [--min_length N] +``` + +- Replace `/path/to/input` with the path to the directory containing your input PDF files. 
+- Replace `/path/to/output` with the path to the directory where you want the output Markdown files to be saved. +- Replace `/path/to/cache` with the path to a directory for caching. +- Adjust the `--workers`, `--max`, `--metadata_file`, and `--min_length` options as needed (see [Convert multiple files](#convert-multiple-files) section for details). + +Make sure to use the appropriate Docker image tag (`speeddemonau/marker-cpu` or `speeddemonau/marker-gpu`) depending on whether you want to run Marker on CPU or GPU. + # Community [Discord](https://discord.gg//KuZwXNGnfH) is where we discuss future development. @@ -149,6 +191,12 @@ MIN_LENGTH=10000 METADATA_FILE=../pdf_meta.json NUM_DEVICES=4 NUM_WORKERS=15 bas Note that the env variables above are specific to this script, and cannot be set in `local.env`. +# Additional Notes + +- The Docker images are built with support for multiple languages. See the `TESSERACT_LANGUAGES` setting in `settings.py` for the list of supported languages or to add your own. +- The GPU image requires a NVIDIA GPU with CUDA support. Make sure you have the NVIDIA Docker runtime installed to use the GPU image. +- The cache directory mounted at `/app/.cache` inside the container is used to store cached data and models. This can help speed up subsequent runs. + # Benchmarks Benchmarking PDF extraction quality is hard. I've created a test set by finding books and scientific papers that have a pdf version and a latex source. I convert the latex to text, and compare the reference to the output of text extraction methods. diff --git a/build-docker-containers.sh b/build-docker-containers.sh new file mode 100644 index 00000000..bf70b335 --- /dev/null +++ b/build-docker-containers.sh @@ -0,0 +1,55 @@ +#!/bin/sh + +# Function to display help +show_help() { + echo "Usage: $0 [OPTION]" + echo "Options:" + echo " --build Build both GPU and CPU images." + echo " --build gpu Build only the GPU image." + echo " --build cpu Build only the CPU image." 
+ echo " --help Display this help and exit." + echo "If no options are provided, both images are built." + echo "Example usage:" + echo " $0 --build gpu Builds only the GPU image." +} + +# Function to build images +build_images() { + if [ "$1" = "gpu" ] || [ -z "$1" ]; then + echo "Building marker-gpu" + docker build -f gpu.Dockerfile -t marker-gpu . + fi + if [ "$1" = "cpu" ] || [ -z "$1" ]; then + echo "Building marker-cpu" + docker build -f cpu.Dockerfile -t marker-cpu . + fi +} + +# Main script starts here +case $1 in + --build) + case $2 in + gpu|cpu) + build_images $2 + ;; + '') + build_images + ;; + *) + show_help + exit 1 + ;; + esac + ;; + --help) + show_help + ;; + '') + build_images + echo "Done" + ;; + *) + show_help + exit 1 + ;; +esac \ No newline at end of file diff --git a/cpu.Dockerfile b/cpu.Dockerfile new file mode 100644 index 00000000..6612f7c3 --- /dev/null +++ b/cpu.Dockerfile @@ -0,0 +1,69 @@ +FROM python:3.9 + +# Set the working directory +WORKDIR /app + +# set environment variables for poetry +ENV LC_ALL=C.UTF-8 +ENV LANG=C.UTF-8 +ENV LANGUAGE=C.UTF-8 + +# Set environment variables for TORCH to use CPU +ENV TORCH_DEVICE=cpu + +# Install system requirements +RUN apt-get update && \ + apt-get install -y git curl wget unzip apt-transport-https \ + ghostscript lsb-release + +# Clone the marker repository +RUN git clone https://github.com/VikParuchuri/marker.git . 
+ +# create a directory for the app and .cache +RUN mkdir -p /app/.cache + +# Set the cache directory +ENV CACHE_DIR=/app/.cache + + +# Install tesseract 5 (optional) +RUN echo "deb https://notesalexp.org/tesseract-ocr5/$(lsb_release -cs)/ $(lsb_release -cs) main" | tee /etc/apt/sources.list.d/notesalexp.list > /dev/null && \ + apt-get update -oAcquire::AllowInsecureRepositories=true && \ + apt-get install -y --allow-unauthenticated notesalexp-keyring && \ + apt-get update && \ + apt-get install -y --allow-unauthenticated tesseract-ocr libtesseract-dev \ + libmagic1 ocrmypdf tesseract-ocr-eng tesseract-ocr-deu \ + tesseract-ocr-por tesseract-ocr-spa tesseract-ocr-rus \ + tesseract-ocr-fra tesseract-ocr-chi-sim tesseract-ocr-jpn \ + tesseract-ocr-kor tesseract-ocr-hin + +RUN pip install --no-cache-dir --upgrade pip +RUN pip install --no-cache-dir --upgrade setuptools wheel +RUN pip install --no-cache-dir poetry + + +# Disable virtual env creation by poetry (not needed in Docker) +# and install dependencies based on the lock file without updating +RUN poetry config virtualenvs.create false \ + && poetry lock --no-update \ + && poetry install --no-dev # Exclude development dependencies + +RUN poetry remove torch + +RUN mkdir -p /app/static + +RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + + + +# Set the tesseract data folder path for Ubuntu 22.04 with tesseract 5 +ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata + +# Copy the entrypoint script +COPY entrypoint.sh /entrypoint.sh + +# Set the entrypoint +ENTRYPOINT ["/entrypoint.sh"] + +# Set the default command +CMD ["bash"] \ No newline at end of file diff --git a/entrypoint.sh b/entrypoint.sh new file mode 100644 index 00000000..a4b01646 --- /dev/null +++ b/entrypoint.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +# Check if the correct number of arguments is provided +if [ "$#" -lt 2 ]; then + echo "Usage: docker run -v /path/to/input:/input -v /path/to/output:/output -v 
/path/to/cache:/app/.cache image_name [COMMAND] [ARGS]" +    echo "" +    echo "Commands:" +    echo "  single /input/file.pdf /output/file.md [OPTIONS]" +    echo "    Convert a single file" +    echo "    Usage: docker run -v /path/to/input:/input -v /path/to/output:/output -v /path/to/cache:/app/.cache image_name single /input/file.pdf /output/file.md [--parallel_factor N] [--max_pages N]" +    echo "    Options:" +    echo "      --parallel_factor N   Increase batch size and parallel OCR workers by N (default: 1)" +    echo "      --max_pages N         Maximum number of pages to process (default: all)" +    echo "" +    echo "  multi /input /output [OPTIONS]" +    echo "    Convert multiple files" +    echo "    Usage: docker run -v /path/to/input:/input -v /path/to/output:/output -v /path/to/cache:/app/.cache image_name multi /input /output [--workers N] [--max N] [--metadata_file FILE] [--min_length N]" +    echo "    Options:" +    echo "      --workers N           Number of PDFs to convert in parallel (default: 1)" +    echo "      --max N               Maximum number of PDFs to convert (default: all)" +    echo "      --metadata_file FILE  Path to JSON file with per-PDF metadata (default: none)" +    echo "      --min_length N        Minimum number of characters to extract before processing (default: 0)" +    exit 1 +fi + +# Get the command +COMMAND=$1 +shift + +# NOTE(review): `poetry shell` removed — it spawns an interactive subshell and hangs a non-interactive entrypoint; `poetry run` below is sufficient +# poetry shell + +# Run the specified command with the provided arguments +case $COMMAND in +    single) +        # Check if the correct number of arguments is provided +        if [ "$#" -lt 2 ]; then +            echo "Usage: docker run -v /path/to/input:/input -v /path/to/output:/output -v /path/to/cache:/app/.cache image_name single /input/file.pdf /output/file.md [--parallel_factor N] [--max_pages N]" +            exit 1 +        fi + +        # Set the input file and output file from the arguments +        INPUT_FILE=$1 +        OUTPUT_FILE=$2 +        shift 2 + +        # Run the convert_single.py script with the provided arguments +        poetry run python /app/convert_single.py "$INPUT_FILE" "$OUTPUT_FILE" "$@" +        ;; + +    multi) +        # Check if the correct number of arguments is provided +        if [ 
"$#" -lt 2 ]; then + echo "Usage: docker run -v /path/to/input:/input -v /path/to/output:/output -v /path/to/cache:/app/.cache image_name multi /input /output [--workers N] [--max N] [--metadata_file FILE] [--min_length N]" + exit 1 + fi + + # Set the input and output directories from the arguments + INPUT_DIR=$1 + OUTPUT_DIR=$2 + shift 2 + + # Run the convert.py script with the provided arguments + poetry run python /app/convert.py "$INPUT_DIR" "$OUTPUT_DIR" "$@" + ;; + + *) + echo "Unknown command: $COMMAND" + exit 1 + ;; +esac diff --git a/gpu.Dockerfile b/gpu.Dockerfile new file mode 100644 index 00000000..dc311f86 --- /dev/null +++ b/gpu.Dockerfile @@ -0,0 +1,74 @@ +FROM python:3.9 + + + +# Set the working directory +WORKDIR /app + +# set environment variables for poetry +ENV LC_ALL=C.UTF-8 +ENV LANG=C.UTF-8 +ENV LANGUAGE=C.UTF-8 + +# Set the device to GPU +ENV TORCH_DEVICE=cuda + + + +# Install system requirements +RUN apt-get update && \ + apt-get install -y git curl wget unzip apt-transport-https \ + ghostscript lsb-release + +# Clone the marker repository +RUN git clone https://github.com/VikParuchuri/marker.git . 
+ +# create a directory for the app and .cache +RUN mkdir -p /app/.cache + +# Set the cache directory +ENV CACHE_DIR=/app/.cache + +# Install tesseract 5 (optional) +RUN echo "deb https://notesalexp.org/tesseract-ocr5/$(lsb_release -cs)/ $(lsb_release -cs) main" | tee /etc/apt/sources.list.d/notesalexp.list > /dev/null && \ + apt-get update -oAcquire::AllowInsecureRepositories=true && \ + apt-get install -y --allow-unauthenticated notesalexp-keyring && \ + apt-get update && \ + apt-get install -y --allow-unauthenticated tesseract-ocr libtesseract-dev \ + libmagic1 ocrmypdf tesseract-ocr-eng tesseract-ocr-deu \ + tesseract-ocr-por tesseract-ocr-spa tesseract-ocr-rus \ + tesseract-ocr-fra tesseract-ocr-chi-sim tesseract-ocr-jpn \ + tesseract-ocr-kor tesseract-ocr-hin + +# Upgrade pip, setuptools, and wheel +RUN pip install --no-cache-dir --upgrade pip +RUN pip install --no-cache-dir --upgrade setuptools wheel + +# Install poetry +RUN pip install --no-cache-dir poetry + + +# Disable virtual env creation by poetry (not needed in Docker) +# and install dependencies based on the lock file without updating +RUN poetry config virtualenvs.create false \ + && poetry lock --no-update \ + && poetry install --no-dev # Exclude development dependencies + +RUN poetry remove torch + +RUN mkdir -p /app/static + +# Install torch with GPU support +RUN pip install torch torchvision torchaudio + +# Set the tesseract data folder path for Ubuntu 22.04 with tesseract 5 +ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata + +# Copy the entrypoint script +COPY entrypoint.sh /entrypoint.sh + +# Set the entrypoint +ENTRYPOINT ["/entrypoint.sh"] + +# Set the default command +CMD ["bash"] \ No newline at end of file