diff --git a/Dockerfile b/Dockerfile index 8c9d83d62..d1f2c1069 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,78 +1,71 @@ -########################################### -# Build PyMuPDF - -FROM alpine:latest as pymupdf-build -ARG ARCH -ARG REQUIREMENTS_TXT - -# Install PyMuPDF via hash-checked requirements file -COPY ${REQUIREMENTS_TXT} /tmp/requirements.txt - -# PyMuPDF provides non-arm musl wheels only. -# Only install build-dependencies if we are actually building the wheel -RUN case "$ARCH" in \ - "arm64") \ - # This is required for copying later, but is created only in the pre-built wheels - mkdir -p /usr/lib/python3.12/site-packages/PyMuPDF.libs/ \ - && apk --no-cache add linux-headers g++ linux-headers gcc make python3-dev py3-pip clang-dev ;; \ - *) \ - apk --no-cache add py3-pip ;; \ - esac -RUN pip install -vv --break-system-packages --require-hashes -r /tmp/requirements.txt - - -########################################### -# Download H2ORestart -FROM alpine:latest as h2orestart-dl -ARG H2ORESTART_CHECKSUM=d09bc5c93fe2483a7e4a57985d2a8d0e4efae2efb04375fe4b59a68afd7241e2 +# NOTE: Updating the packages to their latest versions requires bumping the +# Dockerfile args below. For more info about this file, read +# docs/developer/reproducibility.md. + +ARG DEBIAN_IMAGE_DATE=20250113 + +FROM debian:bookworm-${DEBIAN_IMAGE_DATE}-slim + +ARG GVISOR_ARCHIVE_DATE=20250106 +ARG DEBIAN_ARCHIVE_DATE=20250114 +ARG H2ORESTART_CHECKSUM=8a5be77359695c14faaf33891d3eca6c9d73c1224599aab50a9d2ccc04640580 +ARG H2ORESTART_VERSION=v0.6.8 + +ENV DEBIAN_FRONTEND=noninteractive + +# The following way of installing packages is taken from +# https://github.com/reproducible-containers/repro-sources-list.sh/blob/master/Dockerfile.debian-12, +# and adapted to allow installing gVisor from each own repo as well. +RUN \ + --mount=type=cache,target=/var/cache/apt,sharing=locked \ + --mount=type=cache,target=/var/lib/apt,sharing=locked \ + --mount=type=bind,source=./container/repro-sources-list.sh,target=/usr/local/bin/repro-sources-list.sh \ + --mount=type=bind,source=./container/gvisor.key,target=/tmp/gvisor.key \ + : "Hacky way to set a date for the Debian snapshot repos" && \ + touch -d ${DEBIAN_ARCHIVE_DATE} /etc/apt/sources.list.d/debian.sources && \ + touch -d ${DEBIAN_ARCHIVE_DATE} /etc/apt/sources.list && \ + repro-sources-list.sh && \ + : "Setup APT to install gVisor from its separate APT repo" && \ + apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y --no-install-recommends apt-transport-https ca-certificates gnupg && \ + gpg -o /usr/share/keyrings/gvisor-archive-keyring.gpg --dearmor /tmp/gvisor.key && \ + echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/gvisor-archive-keyring.gpg] https://storage.googleapis.com/gvisor/releases ${GVISOR_ARCHIVE_DATE} main" > /etc/apt/sources.list.d/gvisor.list && \ + : "Install the necessary gVisor and Dangerzone dependencies" && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + python3 python3-fitz libreoffice-nogui libreoffice-java-common \ + python3 python3-magic default-jre-headless fonts-noto-cjk fonts-dejavu \ + runsc unzip wget && \ + : "Clean up for improving reproducibility (optional)" && \ + rm -rf /var/cache/fontconfig/ && \ + rm -rf /etc/ssl/certs/java/cacerts && \ + rm -rf /var/log/* /var/cache/ldconfig/aux-cache + +# Download H2ORestart from GitHub using a pinned version and hash. Note that +# it's available in Debian repos, but not in Bookworm yet. RUN mkdir /libreoffice_ext && cd libreoffice_ext \ && H2ORESTART_FILENAME=h2orestart.oxt \ - && H2ORESTART_VERSION="v0.6.6" \ && wget https://github.com/ebandal/H2Orestart/releases/download/$H2ORESTART_VERSION/$H2ORESTART_FILENAME \ && echo "$H2ORESTART_CHECKSUM $H2ORESTART_FILENAME" | sha256sum -c \ - && install -dm777 "/usr/lib/libreoffice/share/extensions/" - - -########################################### -# Dangerzone image - -FROM alpine:latest - -# Install dependencies -RUN apk --no-cache -U upgrade && \ - apk --no-cache add \ - libreoffice \ - openjdk8 \ - python3 \ - py3-magic \ - font-noto-cjk - -COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/fitz/ /usr/lib/python3.12/site-packages/fitz -COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/pymupdf/ /usr/lib/python3.12/site-packages/pymupdf -COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/PyMuPDF.libs/ /usr/lib/python3.12/site-packages/PyMuPDF.libs -COPY --from=h2orestart-dl /libreoffice_ext/ /libreoffice_ext - -RUN install -dm777 "/usr/lib/libreoffice/share/extensions/" - -RUN mkdir -p /opt/dangerzone/dangerzone -RUN touch /opt/dangerzone/dangerzone/__init__.py -COPY conversion /opt/dangerzone/dangerzone/conversion - -# Add the unprivileged user. Set the UID/GID of the dangerzone user/group to -# 1000, since we will point to it from the OCI config. -# -# NOTE: A tmpfs will be mounted over /home/dangerzone directory, -# so nothing within it from the image will be persisted. -RUN addgroup -g 1000 dangerzone && \ - adduser -u 1000 -s /bin/true -G dangerzone -h /home/dangerzone -D dangerzone - -RUN GVISOR_URL="https://storage.googleapis.com/gvisor/releases/release/latest/$(uname -m)"; \ - wget "${GVISOR_URL}/runsc" "${GVISOR_URL}/runsc.sha512" && \ - sha512sum -c runsc.sha512 && \ - rm -f runsc.sha512 && \ - chmod 555 runsc && \ - mv runsc /usr/bin/ - + && install -dm777 "/usr/lib/libreoffice/share/extensions/" \ + && rm /root/.wget-hsts + +# Create an unprivileged user both for gVisor and for running Dangerzone. +RUN mkdir -p /opt/dangerzone/dangerzone && \ + touch /opt/dangerzone/dangerzone/__init__.py && \ + addgroup --gid 1000 dangerzone && \ + adduser --uid 1000 --ingroup dangerzone --shell /bin/true \ + --disabled-password --home /home/dangerzone dangerzone + +COPY conversion/doc_to_pixels.py \ + conversion/common.py \ + conversion/errors.py \ + conversion/__init__.py \ + /opt/dangerzone/dangerzone/conversion + +# Let the entrypoint script write the OCI config for the inner container under +# /config.json. RUN touch /config.json RUN chown dangerzone:dangerzone /config.json diff --git a/dangerzone/conversion/doc_to_pixels.py b/dangerzone/conversion/doc_to_pixels.py index 673760701..a7a777636 100644 --- a/dangerzone/conversion/doc_to_pixels.py +++ b/dangerzone/conversion/doc_to_pixels.py @@ -129,6 +129,8 @@ async def convert(self) -> None: # At least .odt, .docx, .odg, .odp, .ods, and .pptx "application/zip": { "type": "libreoffice", + # NOTE: Older `file` command cannot detect hwpx files properly. + "libreoffice_ext": "h2orestart.oxt", }, # At least .doc, .docx, .odg, .odp, .odt, .pdf, .ppt, .pptx, .xls, and .xlsx "application/octet-stream": {