diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 09d7fc632..0eb8805ee 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -66,8 +66,32 @@ jobs: sudo apt-get install -y python3-poetry python3 ./install/common/build-image.py + download-tessdata: + name: Download and cache Tesseract data + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Cache Tessdata + id: cache-tessdata + uses: actions/cache@v4 + with: + path: share/tessdata/ + key: v1-tessdata-${{ hashFiles('./install/common/download-tessdata.py') }} + enableCrossOsArchive: true + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + - name: Download Tessdata + run: |- + if [ -f "share/tessdata" ]; then + echo "Already cached, skipping" + else + python3 ./install/common/download-tessdata.py + fi + windows: runs-on: windows-latest + needs: download-tessdata env: DUMMY_CONVERSION: 1 steps: @@ -77,6 +101,13 @@ jobs: python-version: "3.12" - run: pip install poetry - run: poetry install + - name: Restore cached tessdata + uses: actions/cache/restore@v4 + with: + path: share/tessdata/ + enableCrossOsArchive: true + fail-on-cache-miss: true + key: v1-tessdata-${{ hashFiles('./install/common/download-tessdata.py') }} - name: Run CLI tests run: poetry run make test # Taken from: https://github.com/orgs/community/discussions/27149#discussioncomment-3254829 @@ -90,6 +121,7 @@ jobs: macOS: name: "macOS (${{ matrix.arch }})" runs-on: ${{ matrix.runner }} + needs: download-tessdata strategy: matrix: include: @@ -104,6 +136,13 @@ jobs: - uses: actions/setup-python@v5 with: python-version: "3.12" + - name: Restore cached tessdata + uses: actions/cache/restore@v4 + with: + path: share/tessdata/ + enableCrossOsArchive: true + fail-on-cache-miss: true + key: v1-tessdata-${{ hashFiles('./install/common/download-tessdata.py') }} - run: pip install poetry - run: poetry install - name: Run CLI tests @@ -174,7 +213,7 @@ jobs: if: matrix.distro == 'debian' && matrix.version == 'bookworm' uses: actions/upload-artifact@v4 with: - name: dangerzone.deb + name: dangerzone-${{ matrix.distro }}-${{ matrix.version }}.deb path: "deb_dist/dangerzone_*_*.deb" if-no-files-found: error compression-level: 0 @@ -214,7 +253,7 @@ jobs: - name: Download Dangerzone .deb uses: actions/download-artifact@v4 with: - name: dangerzone.deb + name: dangerzone-debian-bookworm.deb path: "deb_dist/" - name: Build end-user environment @@ -227,7 +266,7 @@ jobs: run: | ./dev_scripts/env.py --distro ${{ matrix.distro }} \ --version ${{ matrix.version }} \ - run dangerzone-cli dangerzone/tests/test_docs/sample-pdf.pdf + run dangerzone-cli dangerzone/tests/test_docs/sample-pdf.pdf --ocr-lang eng - name: Check that the Dangerzone GUI imports work run: | @@ -291,7 +330,7 @@ jobs: - name: Run a test command run: | ./dev_scripts/env.py --distro ${{ matrix.distro }} --version ${{ matrix.version }} \ - run dangerzone-cli dangerzone/tests/test_docs/sample-pdf.pdf + run dangerzone-cli dangerzone/tests/test_docs/sample-pdf.pdf --ocr-lang eng - name: Check that the Dangerzone GUI imports work run: | @@ -301,7 +340,9 @@ jobs: run-tests: name: "run tests (${{ matrix.distro }} ${{ matrix.version }})" runs-on: ubuntu-latest - needs: build-container-image + needs: + - build-container-image + - download-tessdata strategy: matrix: include: @@ -360,6 +401,22 @@ jobs: share/image-id.txt fail-on-cache-miss: true + - name: Restore cached tessdata + uses: actions/cache/restore@v4 + with: + path: share/tessdata/ + enableCrossOsArchive: true + 
fail-on-cache-miss: true + key: v1-tessdata-${{ hashFiles('./install/common/download-tessdata.py') }} + + - name: Setup xvfb (Linux) + run: | + # Stuff copied wildly from several stackoverflow posts + sudo apt-get install -y xvfb libxkbcommon-x11-0 libxcb-icccm4 libxcb-image0 libxcb-keysyms1 libxcb-randr0 libxcb-render-util0 libxcb-xinerama0 libxcb-xinput0 libxcb-xfixes0 libxcb-shape0 libglib2.0-0 libgl1-mesa-dev '^libxcb.*-dev' libx11-xcb-dev libglu1-mesa-dev libxrender-dev libxi-dev libxkbcommon-dev libxkbcommon-x11-dev + + # start xvfb in the background + sudo /usr/bin/Xvfb $DISPLAY -screen 0 1280x1024x24 & + - name: Run CI tests run: |- # Pass the -ac Xserver flag, to disable host-based access controls. diff --git a/.gitignore b/.gitignore index 5bcf49785..f45a78bae 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,7 @@ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ +share/tessdata/ *.egg-info/ .installed.cfg *.egg diff --git a/BUILD.md b/BUILD.md index f7e63eb94..c395f504a 100644 --- a/BUILD.md +++ b/BUILD.md @@ -97,6 +97,12 @@ Build the latest container: python3 ./install/common/build-image.py ``` +Download the OCR language data: + +```sh +python3 ./install/common/download-tessdata.py +``` + Run from source tree: ```sh @@ -174,6 +180,12 @@ Build the latest container: python3 ./install/common/build-image.py ``` +Download the OCR language data: + +```sh +python3 ./install/common/download-tessdata.py +``` + Run from source tree: ```sh @@ -278,10 +290,7 @@ test it. cd dangerzone ``` -2. Follow the Fedora instructions for setting up the development environment with the particularity of running the following instead of `poetry install`: - ``` - poetry install --with qubes - ``` +2. Follow the Fedora instructions for setting up the development environment. 3. Build a dangerzone `.rpm` for qubes with the command @@ -379,6 +388,12 @@ Build the dangerzone container image: python3 ./install/common/build-image.py ``` +Download the OCR language data: + +```sh +python3 ./install/common/download-tessdata.py +``` + Run from source tree: ```sh @@ -440,6 +455,12 @@ Build the dangerzone container image: python3 .\install\common\build-image.py ``` +Download the OCR language data: + +```sh +python3 .\install\common\download-tessdata.py +``` + After that you can launch dangerzone during development with: ``` diff --git a/Dockerfile b/Dockerfile index 3b5c565a7..83896ebec 100644 --- a/Dockerfile +++ b/Dockerfile @@ -21,30 +21,6 @@ RUN case "$ARCH" in \ RUN pip install -vv --break-system-packages --require-hashes -r /tmp/requirements.txt -########################################### -# Download Tesseract data - -FROM alpine:latest as tessdata-dl -ARG TESSDATA_CHECKSUM=d0e3bb6f3b4e75748680524a1d116f2bfb145618f8ceed55b279d15098a530f9 - -# Download the trained models from the latest GitHub release of Tesseract, and -# store them under /usr/share/tessdata. This is basically what distro packages -# do under the hood. -# -# Because the GitHub release contains more files than just the trained models, -# we use `find` to fetch only the '*.traineddata' files in the top directory. -# -# Before we untar the models, we also check if the checksum is the expected one. 
-RUN mkdir /usr/share/tessdata/ && mkdir tessdata && cd tessdata \ - && TESSDATA_VERSION=$(wget -O- -nv https://api.github.com/repos/tesseract-ocr/tessdata_fast/releases/latest \ - | sed -n 's/^.*"tag_name": "\([0-9.]\+\)".*$/\1/p') \ - && wget https://github.com/tesseract-ocr/tessdata_fast/archive/$TESSDATA_VERSION/tessdata_fast-$TESSDATA_VERSION.tar.gz \ - && echo "$TESSDATA_CHECKSUM tessdata_fast-$TESSDATA_VERSION.tar.gz" | sha256sum -c \ - && tar -xzvf tessdata_fast-$TESSDATA_VERSION.tar.gz -C . \ - && find . -name '*.traineddata' -maxdepth 2 -exec cp {} /usr/share/tessdata/ \; \ - && cd .. && rm -r tessdata - - ########################################### # Download H2ORestart FROM alpine:latest as h2orestart-dl @@ -74,7 +50,6 @@ RUN apk --no-cache -U upgrade && \ COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/fitz/ /usr/lib/python3.12/site-packages/fitz COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/pymupdf/ /usr/lib/python3.12/site-packages/pymupdf COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/PyMuPDF.libs/ /usr/lib/python3.12/site-packages/PyMuPDF.libs -COPY --from=tessdata-dl /usr/share/tessdata/ /usr/share/tessdata COPY --from=h2orestart-dl /libreoffice_ext/ /libreoffice_ext RUN install -dm777 "/usr/lib/libreoffice/share/extensions/" diff --git a/Makefile b/Makefile index f32369a7a..43c51b255 100644 --- a/Makefile +++ b/Makefile @@ -45,7 +45,7 @@ test: # Make each GUI test run as a separate process, to avoid segfaults due to # shared state. # See more in https://github.com/freedomofpress/dangerzone/issues/493 - pytest --co -q tests/gui | grep -v ' collected' | xargs -n 1 pytest -v + pytest --co -q tests/gui | grep -e '^tests/' | xargs -n 1 pytest -v pytest -v --cov --ignore dev_scripts --ignore tests/gui --ignore tests/test_large_set.py diff --git a/README.md b/README.md index 478091e4a..137f0c4a7 100644 --- a/README.md +++ b/README.md @@ -92,19 +92,3 @@ Dangerzone gets updates to improve its features _and_ to fix problems. So, updat 1. Check which version of Dangerzone you are currently using: run Dangerzone, then look for a series of numbers to the right of the logo within the app. The format of the numbers will look similar to `0.4.1` 2. Now find the latest available version of Dangerzone: go to the [download page](https://dangerzone.rocks/#downloads). Look for the version number displayed. The number will be using the same format as in Step 1. 3. Is the version on the Dangerzone download page higher than the version of your installed app? Go ahead and update. - -### "I get `invalid json returned from container` on MacOS Big Sur or newer (MacOS 11.x.x or higher)" - -Are you using the latest version of Dangerzone? See the FAQ for: "I'm experiencing an issue while using Dangerzone." - -You _may_ be attempting to convert a file in a directory to which Docker Desktop does not have access. Dangerzone for Mac requires Docker Desktop for conversion. Docker Desktop, in turn, requires permission from MacOS to access the directory in which your target file is located. - -To grant this permission: - -1. On MacOS 13, choose Apple menu > System Settings. On lower versions, choose System Preferences. -2. Tap into Privacy & Security in the sidebar. (You may need to scroll down.) -3. In the Privacy section, tap into Files & Folders. (Again, you may need to scroll down.) -4. Scroll to the entry for Docker. Tap the > to expand the entry. -5. Enable the toggle beside the directory where your file is present. 
For example, if the file to be converted is in the Downloads folder, enable the toggle beside Downloads. - -(Full Disk Access permission has a similar effect, but it's enough to give Docker access to _only_ the directory containing the intended file(s) to be converted. Full Disk is unnecessary. As of 2023.04.28, granting one of these permissions continues to be required for successful conversion. Apologies for the extra steps. Dangerzone depends on Docker, and the fix for this issue needs to come from upstream. Read more on [#371](https://github.com/freedomofpress/dangerzone/issues/371#issuecomment-1516863056).) diff --git a/dangerzone/conversion/common.py b/dangerzone/conversion/common.py index 882b1ec27..0940a5661 100644 --- a/dangerzone/conversion/common.py +++ b/dangerzone/conversion/common.py @@ -13,15 +13,6 @@ def running_on_qubes() -> bool: return os.path.exists("/usr/share/qubes/marker-vm") -def get_tessdata_dir() -> str: - if os.environ.get("TESSDATA_PREFIX"): - return os.environ["TESSDATA_PREFIX"] - elif running_on_qubes(): - return "/usr/share/tesseract/tessdata/" - else: - return "/usr/share/tessdata/" - - class DangerzoneConverter: def __init__(self, progress_callback: Optional[Callable] = None) -> None: self.percentage: float = 0.0 diff --git a/dangerzone/conversion/pixels_to_pdf.py b/dangerzone/conversion/pixels_to_pdf.py deleted file mode 100644 index a5a5ba82d..000000000 --- a/dangerzone/conversion/pixels_to_pdf.py +++ /dev/null @@ -1,161 +0,0 @@ -""" -Here are the steps, with progress bar percentages: - -- 50%-95%: Convert each page of pixels into a PDF (each page takes 45/n%, where n is the number of pages) -- 95%-100%: Compress the final PDF -""" - -import asyncio -import contextlib -import glob -import io -import json -import os -import sys -from typing import Optional - -from .common import DEFAULT_DPI, DangerzoneConverter, get_tessdata_dir, running_on_qubes - -# XXX: PyMUPDF logs to stdout by default [1]. The PyMuPDF devs provide a way [2] to log to -# stderr, but it's based on environment variables. These envvars are consulted at import -# time [3], so we have to set them here, before we import `fitz`. -# -# [1] https://github.com/freedomofpress/dangerzone/issues/877 -# [2] https://github.com/pymupdf/PyMuPDF/issues/3135#issuecomment-1992625724 -# [3] https://github.com/pymupdf/PyMuPDF/blob/9717935eeb2d50d15440d62575878214226795f9/src/__init__.py#L62-L63 -os.environ["PYMUPDF_MESSAGE"] = "fd:2" -os.environ["PYMUPDF_LOG"] = "fd:2" - - -class PixelsToPDF(DangerzoneConverter): - async def convert( - self, ocr_lang: Optional[str] = None, tempdir: Optional[str] = None - ) -> None: - self.percentage = 50.0 - if tempdir is None: - tempdir = "/safezone" - - # XXX lazy loading of fitz module to avoid import issues on non-Qubes systems - import fitz - - num_pages = len(glob.glob(f"{tempdir}/pixels/page-*.rgb")) - total_size = 0.0 - - safe_doc = fitz.Document() - - # Convert RGB files to PDF files - percentage_per_page = 45.0 / num_pages - for page_num in range(1, num_pages + 1): - filename_base = f"{tempdir}/pixels/page-{page_num}" - rgb_filename = f"{filename_base}.rgb" - width_filename = f"{filename_base}.width" - height_filename = f"{filename_base}.height" - - with open(width_filename) as f: - width = int(f.read().strip()) - with open(height_filename) as f: - height = int(f.read().strip()) - with open(rgb_filename, "rb") as rgb_f: - untrusted_rgb_data = rgb_f.read() - # The first few operations happen on a per-page basis. 
- page_size = len(untrusted_rgb_data) - total_size += page_size - pixmap = fitz.Pixmap( - fitz.Colorspace(fitz.CS_RGB), - width, - height, - untrusted_rgb_data, - False, - ) - pixmap.set_dpi(DEFAULT_DPI, DEFAULT_DPI) - if ocr_lang: # OCR the document - self.update_progress( - f"Converting page {page_num}/{num_pages} from pixels to searchable PDF" - ) - if int(fitz.version[2]) >= 20230621000001: - page_pdf_bytes = pixmap.pdfocr_tobytes( - compress=True, - language=ocr_lang, - tessdata=get_tessdata_dir(), - ) - else: - # XXX: In PyMuPDF v1.22.5, the function signature of - # `pdfocr_tobytes()` / `pdfocr_save()` was extended with an argument - # to explicitly set the Tesseract data dir [1]. - # - # In earlier versions, the PyMuPDF developers recommend setting this - # path via the TESSDATA_PREFIX environment variable. In practice, - # this environment variable is read at import time, so subsequent - # changes to the environment variable are not tracked [2]. - # - # To make things worse, any attempt to alter the internal attribute - # (`fitz.TESSDATA_PREFIX`) makes no difference as well, when using - # the OCR functions. That's due to the way imports work in `fitz`, - # where somehow the internal `fitz.fitz` module is shadowed. - # - # A hacky solution is to grab the `fitz.fitz` module from - # `sys.modules`, and set there the TESSDATA_PREFIX variable. We can - # get away with this hack because we have a proper solution for - # subsequent PyMuPDF versions, and we know that nothing will change - # in older versions. - # - # TODO: Remove after oldest distro has PyMuPDF >= v1.22.5 - # - # [1]: https://pymupdf.readthedocs.io/en/latest/pixmap.html#Pixmap.pdfocr_save - # [2]: https://github.com/pymupdf/PyMuPDF/blob/0368e56cfa6afb55bcf6c726e7f51a2a16a5ccba/fitz/fitz.i#L308 - sys.modules["fitz.fitz"].TESSDATA_PREFIX = get_tessdata_dir() # type: ignore [attr-defined] - - page_pdf_bytes = pixmap.pdfocr_tobytes( - compress=True, - language=ocr_lang, - ) - ocr_pdf = fitz.open("pdf", page_pdf_bytes) - else: # Don't OCR - self.update_progress( - f"Converting page {page_num}/{num_pages} from pixels to PDF" - ) - page_doc = fitz.Document() - page_doc.insert_file(pixmap) - page_pdf_bytes = page_doc.tobytes(deflate_images=True) - - safe_doc.insert_pdf(fitz.open("pdf", page_pdf_bytes)) - self.percentage += percentage_per_page - - self.percentage = 100.0 - self.update_progress("Safe PDF created") - - # Move converted files into /safezone - if running_on_qubes(): - safe_pdf_path = f"{tempdir}/safe-output-compressed.pdf" - else: - safe_pdf_path = f"/safezone/safe-output-compressed.pdf" - - safe_doc.save(safe_pdf_path, deflate_images=True) - - def update_progress(self, text: str, *, error: bool = False) -> None: - if running_on_qubes(): - if self.progress_callback: - self.progress_callback(error, text, self.percentage) - else: - print( - json.dumps( - {"error": error, "text": text, "percentage": self.percentage} - ) - ) - sys.stdout.flush() - - -async def main() -> int: - ocr_lang = os.environ.get("OCR_LANGUAGE") if os.environ.get("OCR") == "1" else None - converter = PixelsToPDF() - - try: - await converter.convert(ocr_lang) - return 0 - except (RuntimeError, ValueError) as e: - converter.update_progress(str(e), error=True) - return 1 - - -if __name__ == "__main__": - sys.exit(asyncio.run(main())) diff --git a/dangerzone/document.py b/dangerzone/document.py index 0b3e3f4f6..8af46f0be 100644 --- a/dangerzone/document.py +++ b/dangerzone/document.py @@ -123,6 +123,10 @@ def output_filename(self, filename: str) -> 
None: self.validate_output_filename(filename) self._output_filename = filename + @property + def sanitized_output_filename(self) -> str: + return util.replace_control_chars(self.output_filename) + @property def suffix(self) -> str: return self._suffix diff --git a/dangerzone/isolation_provider/base.py b/dangerzone/isolation_provider/base.py index 0a46ce09a..9404cee25 100644 --- a/dangerzone/isolation_provider/base.py +++ b/dangerzone/isolation_provider/base.py @@ -5,25 +5,23 @@ import signal import subprocess import sys -import tempfile from abc import ABC, abstractmethod from pathlib import Path from typing import IO, Callable, Iterator, Optional +import fitz from colorama import Fore, Style from ..conversion import errors -from ..conversion.common import INT_BYTES +from ..conversion.common import DEFAULT_DPI, INT_BYTES from ..document import Document -from ..util import replace_control_chars +from ..util import get_tessdata_dir, replace_control_chars log = logging.getLogger(__name__) MAX_CONVERSION_LOG_CHARS = 150 * 50 # up to ~150 lines of 50 characters DOC_TO_PIXELS_LOG_START = "----- DOC TO PIXELS LOG START -----" DOC_TO_PIXELS_LOG_END = "----- DOC TO PIXELS LOG END -----" -PIXELS_TO_PDF_LOG_START = "----- PIXELS TO PDF LOG START -----" -PIXELS_TO_PDF_LOG_END = "----- PIXELS TO PDF LOG END -----" TIMEOUT_EXCEPTION = 15 TIMEOUT_GRACE = 15 @@ -108,11 +106,8 @@ def convert( self.progress_callback = progress_callback document.mark_as_converting() try: - with tempfile.TemporaryDirectory() as t: - Path(f"{t}/pixels").mkdir() - with self.doc_to_pixels_proc(document) as conversion_proc: - self.doc_to_pixels(document, t, conversion_proc) - self.pixels_to_pdf(document, t, ocr_lang) + with self.doc_to_pixels_proc(document) as conversion_proc: + self.convert_with_proc(document, ocr_lang, conversion_proc) document.mark_as_safe() if document.archive_after_conversion: document.archive() @@ -126,8 +121,45 @@ def convert( self.print_progress(document, True, str(e), 0) document.mark_as_failed() - def doc_to_pixels( - self, document: Document, tempdir: str, p: subprocess.Popen + def ocr_page(self, pixmap: fitz.Pixmap, ocr_lang: str) -> bytes: + """Get a single page as pixels, OCR it, and return a PDF as bytes.""" + return pixmap.pdfocr_tobytes( + compress=True, + language=ocr_lang, + tessdata=str(get_tessdata_dir()), + ) + + def pixels_to_pdf_page( + self, + untrusted_data: bytes, + untrusted_width: int, + untrusted_height: int, + ocr_lang: Optional[str], + ) -> fitz.Document: + """Convert a byte array of RGB pixels into a PDF page, optionally with OCR.""" + pixmap = fitz.Pixmap( + fitz.Colorspace(fitz.CS_RGB), + untrusted_width, + untrusted_height, + untrusted_data, + False, + ) + pixmap.set_dpi(DEFAULT_DPI, DEFAULT_DPI) + + if ocr_lang: # OCR the document + page_pdf_bytes = self.ocr_page(pixmap, ocr_lang) + else: # Don't OCR + page_doc = fitz.Document() + page_doc.insert_file(pixmap) + page_pdf_bytes = page_doc.tobytes(deflate_images=True) + + return fitz.open("pdf", page_pdf_bytes) + + def convert_with_proc( + self, + document: Document, + ocr_lang: Optional[str], + p: subprocess.Popen, ) -> None: percentage = 0.0 with open(document.input_filename, "rb") as f: @@ -142,10 +174,15 @@ def doc_to_pixels( n_pages = read_int(p.stdout) if n_pages == 0 or n_pages > errors.MAX_PAGES: raise errors.MaxPagesException() - percentage_per_page = 49.0 / n_pages + step = 100 / n_pages + + safe_doc = fitz.Document() for page in range(1, n_pages + 1): - text = f"Converting page {page}/{n_pages} to pixels" + searchable 
= "searchable " if ocr_lang else "" + text = ( + f"Converting page {page}/{n_pages} from pixels to {searchable}PDF" + ) self.print_progress(document, False, text, percentage) width = read_int(p.stdout) @@ -161,39 +198,27 @@ def doc_to_pixels( num_pixels, ) - # Wrapper code - with open(f"{tempdir}/pixels/page-{page}.width", "w") as f_width: - f_width.write(str(width)) - with open(f"{tempdir}/pixels/page-{page}.height", "w") as f_height: - f_height.write(str(height)) - with open(f"{tempdir}/pixels/page-{page}.rgb", "wb") as f_rgb: - f_rgb.write(untrusted_pixels) + page_pdf = self.pixels_to_pdf_page( + untrusted_pixels, + width, + height, + ocr_lang, + ) + safe_doc.insert_pdf(page_pdf) - percentage += percentage_per_page + percentage += step # Ensure nothing else is read after all bitmaps are obtained p.stdout.close() - # TODO handle leftover code input - text = "Converted document to pixels" - self.print_progress(document, False, text, percentage) - - if getattr(sys, "dangerzone_dev", False): - assert p.stderr - debug_log = read_debug_text(p.stderr, MAX_CONVERSION_LOG_CHARS) - p.stderr.close() - log.info( - "Conversion output (doc to pixels)\n" - f"{DOC_TO_PIXELS_LOG_START}\n" - f"{debug_log}" # no need for an extra newline here - f"{DOC_TO_PIXELS_LOG_END}" - ) + # Saving it with a different name first, because PyMuPDF cannot handle + # non-Unicode chars. + safe_doc.save(document.sanitized_output_filename) + os.replace(document.sanitized_output_filename, document.output_filename) - @abstractmethod - def pixels_to_pdf( - self, document: Document, tempdir: str, ocr_lang: Optional[str] - ) -> None: - pass + # TODO handle leftover code input + text = "Successfully converted document" + self.print_progress(document, False, text, 100) def print_progress( self, document: Document, error: bool, text: str, percentage: float @@ -309,74 +334,15 @@ def doc_to_pixels_proc( document, p, timeout_grace=timeout_grace, timeout_force=timeout_force ) - -# From global_common: - -# def validate_convert_to_pixel_output(self, common, output): -# """ -# Take the output from the convert to pixels tasks and validate it. Returns -# a tuple like: (success (boolean), error_message (str)) -# """ -# max_image_width = 10000 -# max_image_height = 10000 - -# # Did we hit an error? -# for line in output.split("\n"): -# if ( -# "failed:" in line -# or "The document format is not supported" in line -# or "Error" in line -# ): -# return False, output - -# # How many pages was that? 
-# num_pages = None -# for line in output.split("\n"): -# if line.startswith("Document has "): -# num_pages = line.split(" ")[2] -# break -# if not num_pages or not num_pages.isdigit() or int(num_pages) <= 0: -# return False, "Invalid number of pages returned" -# num_pages = int(num_pages) - -# # Make sure we have the files we expect -# expected_filenames = [] -# for i in range(1, num_pages + 1): -# expected_filenames += [ -# f"page-{i}.rgb", -# f"page-{i}.width", -# f"page-{i}.height", -# ] -# expected_filenames.sort() -# actual_filenames = os.listdir(common.pixel_dir.name) -# actual_filenames.sort() - -# if expected_filenames != actual_filenames: -# return ( -# False, -# f"We expected these files:\n{expected_filenames}\n\nBut we got these files:\n{actual_filenames}", -# ) - -# # Make sure the files are the correct sizes -# for i in range(1, num_pages + 1): -# with open(f"{common.pixel_dir.name}/page-{i}.width") as f: -# w_str = f.read().strip() -# with open(f"{common.pixel_dir.name}/page-{i}.height") as f: -# h_str = f.read().strip() -# w = int(w_str) -# h = int(h_str) -# if ( -# not w_str.isdigit() -# or not h_str.isdigit() -# or w <= 0 -# or w > max_image_width -# or h <= 0 -# or h > max_image_height -# ): -# return False, f"Page {i} has invalid geometry" - -# # Make sure the RGB file is the correct size -# if os.path.getsize(f"{common.pixel_dir.name}/page-{i}.rgb") != w * h * 3: -# return False, f"Page {i} has an invalid RGB file size" - -# return True, True + # Read the stderr of the process only if: + # * Dev mode is enabled. + # * The process has exited (else we risk hanging). + if getattr(sys, "dangerzone_dev", False) and p.poll() is not None: + assert p.stderr + debug_log = read_debug_text(p.stderr, MAX_CONVERSION_LOG_CHARS) + log.info( + "Conversion output (doc to pixels)\n" + f"{DOC_TO_PIXELS_LOG_START}\n" + f"{debug_log}" # no need for an extra newline here + f"{DOC_TO_PIXELS_LOG_END}" + ) diff --git a/dangerzone/isolation_provider/container.py b/dangerzone/isolation_provider/container.py index fe6626c17..b0f488049 100644 --- a/dangerzone/isolation_provider/container.py +++ b/dangerzone/isolation_provider/container.py @@ -1,24 +1,15 @@ import gzip -import json import logging import os import platform import shlex import shutil import subprocess -import sys -from typing import Any, List, Optional, Tuple +from typing import List, Tuple -from ..conversion import errors from ..document import Document -from ..util import get_tmp_dir # NOQA : required for mocking in our tests. from ..util import get_resource_path, get_subprocess_startupinfo -from .base import ( - PIXELS_TO_PDF_LOG_END, - PIXELS_TO_PDF_LOG_START, - IsolationProvider, - terminate_process_group, -) +from .base import IsolationProvider, terminate_process_group TIMEOUT_KILL = 5 # Timeout in seconds until the kill command returns. @@ -234,31 +225,6 @@ def pixels_to_pdf_container_name(self, document: Document) -> str: """Unique container name for the pixels-to-pdf phase.""" return f"dangerzone-pixels-to-pdf-{document.id}" - def assert_field_type(self, val: Any, _type: object) -> None: - # XXX: Use a stricter check than isinstance because `bool` is a subclass of - # `int`. - # - # See https://stackoverflow.com/a/37888668 - if type(val) is not _type: - raise ValueError("Status field has incorrect type") - - def parse_progress_trusted(self, document: Document, line: str) -> None: - """ - Parses a line returned by the container. 
- """ - try: - status = json.loads(line) - text = status["text"] - self.assert_field_type(text, str) - error = status["error"] - self.assert_field_type(error, bool) - percentage = status["percentage"] - self.assert_field_type(percentage, float) - self.print_progress(document, error, text, percentage) - except Exception: - error_message = f"Invalid JSON returned from container:\n\n\t {line}" - self.print_progress(document, True, error_message, -1) - def exec( self, args: List[str], @@ -337,84 +303,6 @@ def kill_container(self, name: str) -> None: f"Unexpected error occurred while killing container '{name}': {str(e)}" ) - def pixels_to_pdf( - self, document: Document, tempdir: str, ocr_lang: Optional[str] - ) -> None: - # Convert pixels to safe PDF - command = [ - "/usr/bin/python3", - "-m", - "dangerzone.conversion.pixels_to_pdf", - ] - extra_args = [ - "-v", - f"{tempdir}:/safezone:Z", - "-e", - f"OCR={0 if ocr_lang is None else 1}", - "-e", - f"OCR_LANGUAGE={ocr_lang}", - ] - # XXX: Until #748 gets merged, we have to run our pixels to PDF phase in a - # container, which involves mounting two temp dirs. This does not bode well with - # gVisor for two reasons: - # - # 1. Our gVisor integration chroot()s into /home/dangerzone/dangerzone-image/rootfs, - # meaning that the location of the temp dirs must be relevant to that path. - # 2. Reading and writing to these temp dirs requires permissions which are not - # available to the user within gVisor's user namespace. - # - # For these reasons, and because the pixels to PDF phase is more trusted (and - # will soon stop being containerized), we circumvent gVisor support by doing the - # following: - # - # 1. Override our entrypoint script with a no-op command (/usr/bin/env). - # 2. Set the PYTHONPATH so that we can import the Python code within - # /home/dangerzone/dangerzone-image/rootfs - # 3. Run the container as the root user, so that it can always write to the - # mounted directories. This container is trusted, so running as root has no - # impact to the security of Dangerzone. - img_root = "/home/dangerzone/dangerzone-image/rootfs" - extra_args += [ - "--entrypoint", - "/usr/bin/env", - "-e", - f"PYTHONPATH={img_root}/opt/dangerzone:{img_root}/usr/lib/python3.12/site-packages", - "-e", - f"TESSDATA_PREFIX={img_root}/usr/share/tessdata", - "-u", - "root", - ] - - name = self.pixels_to_pdf_container_name(document) - pixels_to_pdf_proc = self.exec_container(command, name, extra_args) - if pixels_to_pdf_proc.stdout: - for line in pixels_to_pdf_proc.stdout: - self.parse_progress_trusted(document, line.decode()) - error_code = pixels_to_pdf_proc.wait() - - # In case of a dev run, log everything from the second container. 
- if getattr(sys, "dangerzone_dev", False): - assert pixels_to_pdf_proc.stderr - out = pixels_to_pdf_proc.stderr.read().decode() - text = ( - f"Conversion output: (pixels to PDF)\n" - f"{PIXELS_TO_PDF_LOG_START}\n{out}\n{PIXELS_TO_PDF_LOG_END}" - ) - log.info(text) - - if error_code != 0: - log.error("pixels-to-pdf failed") - raise errors.exception_from_error_code(error_code) - else: - # Move the final file to the right place - if os.path.exists(document.output_filename): - os.remove(document.output_filename) - - container_output_filename = os.path.join( - tempdir, "safe-output-compressed.pdf" - ) - shutil.move(container_output_filename, document.output_filename) - def start_doc_to_pixels_proc(self, document: Document) -> subprocess.Popen: # Convert document to pixels command = [ diff --git a/dangerzone/isolation_provider/dummy.py b/dangerzone/isolation_provider/dummy.py index 80dd17fcf..9ebc345fa 100644 --- a/dangerzone/isolation_provider/dummy.py +++ b/dangerzone/isolation_provider/dummy.py @@ -1,18 +1,25 @@ import logging -import os -import shutil import subprocess import sys -import time -from typing import Callable, Optional +from ..conversion.common import DangerzoneConverter from ..document import Document -from ..util import get_resource_path from .base import IsolationProvider, terminate_process_group log = logging.getLogger(__name__) +def dummy_script() -> None: + sys.stdin.buffer.read() + pages = 2 + width = height = 9 + DangerzoneConverter._write_int(pages) + for page in range(pages): + DangerzoneConverter._write_int(width) + DangerzoneConverter._write_int(height) + DangerzoneConverter._write_bytes(width * height * 3 * b"A") + + class Dummy(IsolationProvider): """Dummy Isolation Provider (FOR TESTING ONLY) @@ -32,51 +39,15 @@ def __init__(self) -> None: def install(self) -> bool: return True - def convert( - self, - document: Document, - ocr_lang: Optional[str], - progress_callback: Optional[Callable] = None, - ) -> None: - self.progress_callback = None - log.debug("Dummy converter started:") - log.debug( - f" - document: {os.path.basename(document.input_filename)} ({document.id})" - ) - log.debug(f" - ocr : {ocr_lang}") - log.debug("\n(simulating conversion)") - success = True - progress = [ - [False, "Converting to PDF using GraphicsMagick", 0.0], - [False, "Separating document into pages", 3.0], - [False, "Converting page 1/1 to pixels", 5.0], - [False, "Converted document to pixels", 50.0], - [False, "Converting page 1/1 from pixels to PDF", 50.0], - [False, "Merging 1 pages into a single PDF", 95.0], - [False, "Compressing PDF", 97.0], - [False, "Safe PDF created", 100.0], - ] - for error, text, percentage in progress: - self.print_progress(document, error, text, percentage) # type: ignore [arg-type] - if error: - success = False - if success: - shutil.copy( - get_resource_path("dummy_document.pdf"), document.output_filename - ) - document.mark_as_safe() - if document.archive_after_conversion: - document.archive() - - def pixels_to_pdf( - self, document: Document, tempdir: str, ocr_lang: Optional[str] - ) -> None: - pass - def start_doc_to_pixels_proc(self, document: Document) -> subprocess.Popen: - dummy_cmd = ["python3", "-c", "print('The cake is a lie')"] + cmd = [ + sys.executable, + "-c", + "from dangerzone.isolation_provider.dummy import dummy_script;" + " dummy_script()", + ] return subprocess.Popen( - dummy_cmd, + cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=self.proc_stderr, diff --git a/dangerzone/isolation_provider/qubes.py 
b/dangerzone/isolation_provider/qubes.py index 34cc9004a..dd1f18169 100644 --- a/dangerzone/isolation_provider/qubes.py +++ b/dangerzone/isolation_provider/qubes.py @@ -1,20 +1,16 @@ -import asyncio import io import logging import os -import shutil import subprocess import sys import zipfile from pathlib import Path -from typing import IO, Optional +from typing import IO -from ..conversion import errors from ..conversion.common import running_on_qubes -from ..conversion.pixels_to_pdf import PixelsToPDF from ..document import Document from ..util import get_resource_path -from .base import PIXELS_TO_PDF_LOG_END, PIXELS_TO_PDF_LOG_START, IsolationProvider +from .base import IsolationProvider log = logging.getLogger(__name__) @@ -25,28 +21,6 @@ class Qubes(IsolationProvider): def install(self) -> bool: return True - def pixels_to_pdf( - self, document: Document, tempdir: str, ocr_lang: Optional[str] - ) -> None: - def print_progress_wrapper(error: bool, text: str, percentage: float) -> None: - self.print_progress(document, error, text, percentage) - - converter = PixelsToPDF(progress_callback=print_progress_wrapper) - try: - asyncio.run(converter.convert(ocr_lang, tempdir)) - except (RuntimeError, ValueError) as e: - raise errors.UnexpectedConversionError(str(e)) - finally: - if getattr(sys, "dangerzone_dev", False): - out = converter.captured_output.decode() - text = ( - f"Conversion output: (pixels to PDF)\n" - f"{PIXELS_TO_PDF_LOG_START}\n{out}{PIXELS_TO_PDF_LOG_END}" - ) - log.info(text) - - shutil.move(f"{tempdir}/safe-output-compressed.pdf", document.output_filename) - def get_max_parallel_conversions(self) -> int: return 1 diff --git a/dangerzone/util.py b/dangerzone/util.py index 311288c7a..5b2878b1c 100644 --- a/dangerzone/util.py +++ b/dangerzone/util.py @@ -3,7 +3,6 @@ import subprocess import sys import unicodedata -from typing import Optional import appdirs @@ -12,17 +11,6 @@ def get_config_dir() -> str: return appdirs.user_config_dir("dangerzone") -def get_tmp_dir() -> Optional[str]: - """Get the parent dir for the Dangerzone temporary dirs. - - This function returns the parent directory where Dangerzone will store its temporary - directories. The default behavior is to let Python choose for us (e.g., in `/tmp` - for Linux), which is why we return None. However, we still need to define this - function in order to be able to set this dir via mocking in our tests. - """ - return None - - def get_resource_path(filename: str) -> str: if getattr(sys, "dangerzone_dev", False): # Look for resources directory relative to python file @@ -45,6 +33,38 @@ def get_resource_path(filename: str) -> str: return str(resource_path) +def get_tessdata_dir() -> pathlib.Path: + if getattr(sys, "dangerzone_dev", False) or platform.system() in ( + "Windows", + "Darwin", + ): + # Always use the tessdata path from the Dangerzone ./share directory, for + # development builds, or in Windows/macOS platforms. + return pathlib.Path(get_resource_path("tessdata")) + + # In case of Linux systems, grab the Tesseract data from any of the following + # locations. We have found some of the locations through trial and error, whereas + # others are taken from the docs: + # + # [...] Possibilities are /usr/share/tesseract-ocr/tessdata or + # /usr/share/tessdata or /usr/share/tesseract-ocr/4.00/tessdata. 
[1] + # + # [1] https://tesseract-ocr.github.io/tessdoc/Installation.html + tessdata_dirs = [ + pathlib.Path("/usr/share/tessdata/"), # on some Debian + pathlib.Path("/usr/share/tesseract/tessdata/"), # on Fedora + pathlib.Path("/usr/share/tesseract-ocr/tessdata/"), # ? (documented) + pathlib.Path("/usr/share/tesseract-ocr/4.00/tessdata/"), # on Ubuntu Focal + pathlib.Path("/usr/share/tesseract-ocr/5/tessdata/"), # on Debian Trixie + ] + + for dir in tessdata_dirs: + if dir.is_dir(): + return dir + + raise RuntimeError("Tesseract language data are not installed in the system") + + def get_version() -> str: try: with open(get_resource_path("version.txt")) as f: diff --git a/debian/control b/debian/control index 133883b00..da9eb4a25 100644 --- a/debian/control +++ b/debian/control @@ -9,7 +9,7 @@ Rules-Requires-Root: no Package: dangerzone Architecture: any -Depends: ${misc:Depends}, ${python3:Depends}, podman, python3, python3-pyside2.qtcore, python3-pyside2.qtgui, python3-pyside2.qtwidgets, python3-pyside2.qtsvg, python3-appdirs, python3-click, python3-xdg, python3-colorama, python3-requests, python3-markdown, python3-packaging +Depends: ${misc:Depends}, ${python3:Depends}, podman, python3, python3-pyside2.qtcore, python3-pyside2.qtgui, python3-pyside2.qtwidgets, python3-pyside2.qtsvg, python3-appdirs, python3-click, python3-xdg, python3-colorama, python3-requests, python3-markdown, python3-packaging, tesseract-ocr-all Description: Take potentially dangerous PDFs, office documents, or images Dangerzone is an open source desktop application that takes potentially dangerous PDFs, office documents, or images and converts them to safe PDFs. It uses disposable VMs on Qubes OS, or container technology in other OSes, to convert the documents within a secure sandbox. . diff --git a/debian/source/options b/debian/source/options index 57156b104..b394f06f1 100644 --- a/debian/source/options +++ b/debian/source/options @@ -1,4 +1,7 @@ compression = "gzip" tar-ignore = "dev_scripts" tar-ignore = ".*" -tar-ignore = "__pycache__" \ No newline at end of file +tar-ignore = "__pycache__" +# Ignore the 'share/tessdata' dir, since it slows down the process, and we +# install Tesseract data via Debian packages anyway. 
+tar-ignore = "share/tessdata" diff --git a/dev_scripts/qa.py b/dev_scripts/qa.py index 41fcb06e9..5790e296c 100755 --- a/dev_scripts/qa.py +++ b/dev_scripts/qa.py @@ -289,6 +289,12 @@ python3 ./install/common/build-image.py ``` +Download the OCR language data: + +```sh +python3 ./install/common/download-tessdata.py +``` + Run from source tree: ```sh @@ -367,6 +373,12 @@ python3 ./install/common/build-image.py ``` +Download the OCR language data: + +```sh +python3 ./install/common/download-tessdata.py +``` + Run from source tree: ```sh @@ -425,6 +437,12 @@ python3 .\install\common\build-image.py ``` +Download the OCR language data: + +```sh +python3 .\install\common\download-tessdata.py +``` + After that you can launch dangerzone during development with: ``` diff --git a/install/common/download-tessdata.py b/install/common/download-tessdata.py new file mode 100644 index 000000000..cf2b3660f --- /dev/null +++ b/install/common/download-tessdata.py @@ -0,0 +1,92 @@ +import hashlib +import io +import json +import logging +import pathlib +import subprocess +import sys +import tarfile +import urllib.request + +logger = logging.getLogger(__name__) + +TESSDATA_RELEASES_URL = ( + "https://api.github.com/repos/tesseract-ocr/tessdata_fast/releases/latest" +) +TESSDATA_ARCHIVE_URL = "https://github.com/tesseract-ocr/tessdata_fast/archive/{tessdata_version}/tessdata_fast-{tessdata_version}.tar.gz" +TESSDATA_CHECKSUM = "d0e3bb6f3b4e75748680524a1d116f2bfb145618f8ceed55b279d15098a530f9" + + +def git_root(): + """Get the root directory of the Git repo.""" + # FIXME: Use a Git Python binding for this. + # FIXME: Make this work if called outside the repo. + cmd = ["git", "rev-parse", "--show-toplevel"] + path = ( + subprocess.run(cmd, check=True, stdout=subprocess.PIPE) + .stdout.decode() + .strip("\n") + ) + return pathlib.Path(path) + + +def main(): + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + + share_dir = git_root() / "share" + tessdata_dir = share_dir / "tessdata" + + # Get the list of OCR languages that Dangerzone supports. + with open(share_dir / "ocr-languages.json") as f: + langs_short = sorted(json.loads(f.read()).values()) + + # Check if these languages have already been downloaded. + if tessdata_dir.exists(): + expected_files = {f"{lang}.traineddata" for lang in langs_short} + files = {f.name for f in tessdata_dir.iterdir()} + if files == expected_files: + logger.info("Skipping tessdata download, language data already exists") + return + else: + logger.info(f"Found {tessdata_dir} but contents do not match") + return 1 + + # Get latest release of Tesseract data. + logger.info("Getting latest tessdata release") + with urllib.request.urlopen(TESSDATA_RELEASES_URL) as f: + resp = f.read() + releases = json.loads(resp) + tag = releases["tag_name"] + + # Get latest release of Tesseract data. + logger.info(f"Downloading tessdata release {tag}") + archive_url = TESSDATA_ARCHIVE_URL.format(tessdata_version=tag) + with urllib.request.urlopen(archive_url) as f: + archive = f.read() + digest = hashlib.sha256(archive).hexdigest() + if digest != TESSDATA_CHECKSUM: + raise RuntimeError(f"Checksum mismatch {digest} != {TESSDATA_CHECKSUM}") + + # Extract the languages models from the tessdata archive. 
+ logger.info(f"Extracting tessdata archive into {tessdata_dir}") + with tarfile.open(fileobj=io.BytesIO(archive)) as t: + for lang in langs_short: + member = f"tessdata_fast-{tag}/{lang}.traineddata" + logger.info(f"Extracting {member}") + # NOTE: We want `filter="data"` because it ignores ownership info, as + # recorded in the tarfile. This filter will become the default in Python + # 3.14. See: + # + # https://docs.python.org/3/library/tarfile.html#tarfile-extraction-filter + t.extract(member=member, path=share_dir, filter="data") + + tessdata_dl_dir = share_dir / f"tessdata_fast-{tag}" + tessdata_dl_dir.rename(tessdata_dir) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/install/linux/build-rpm.py b/install/linux/build-rpm.py index 56f5ab224..1cec7d2c2 100755 --- a/install/linux/build-rpm.py +++ b/install/linux/build-rpm.py @@ -64,10 +64,15 @@ def build(build_dir, qubes=False): os.symlink(dist_path, srpm_dir) print("* Creating a Python sdist") + tessdata = root / "share" / "tessdata" + tessdata_bak = root / "tessdata.bak" container_tar_gz = root / "share" / "container.tar.gz" container_tar_gz_bak = root / "container.tar.gz.bak" + + if tessdata.exists(): + tessdata.rename(tessdata_bak) stash_container = qubes and container_tar_gz.exists() - if stash_container: + if stash_container and container_tar_gz.exists(): container_tar_gz.rename(container_tar_gz_bak) try: subprocess.run(["poetry", "build", "-f", "sdist"], cwd=root, check=True) @@ -77,7 +82,9 @@ def build(build_dir, qubes=False): shutil.copy2(sdist_path, build_dir / "SOURCES" / sdist_name) sdist_path.unlink() finally: - if stash_container: + if tessdata_bak.exists(): + tessdata_bak.rename(tessdata) + if stash_container and container_tar_gz_bak.exists(): container_tar_gz_bak.rename(container_tar_gz) print("* Building RPM package") diff --git a/install/linux/dangerzone.spec b/install/linux/dangerzone.spec index fca05fe13..996f9a776 100644 --- a/install/linux/dangerzone.spec +++ b/install/linux/dangerzone.spec @@ -72,13 +72,12 @@ BuildRequires: python3-devel %if 0%{?_qubes} # Qubes-only requirements (server-side) Requires: python3-magic -Requires: python3-PyMuPDF Requires: libreoffice -# Qubes-only requirements (client-side) -Requires: GraphicsMagick -Requires: ghostscript -Requires: poppler-utils -Requires: tesseract +%else +# Container-only requirements +Requires: podman +%endif + # Explicitly require every tesseract model: # See: https://github.com/freedomofpress/dangerzone/issues/431 Requires: tesseract-langpack-afr @@ -204,10 +203,6 @@ Requires: tesseract-langpack-uzb_cyrl Requires: tesseract-langpack-vie Requires: tesseract-langpack-yid Requires: tesseract-langpack-yor -%else -# Container-only requirements -Requires: podman -%endif %description Dangerzone is an open source desktop application that takes potentially diff --git a/poetry.lock b/poetry.lock index 8d3e49c99..ec22a9ab4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1046,4 +1046,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "9f1c256ac7a768845d519e58206bb3021be7fca94a55c29534cb7a157609e4e8" +content-hash = "9b4083a41b94d03d7688fb6fcc82a0af9839f4dfc769204548c44bf35bcda60b" diff --git a/pyproject.toml b/pyproject.toml index d240ceb9c..d050f103c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ python = ">=3.9,<3.13" click = "*" appdirs = "*" PySide6 = "^6.7.1" +PyMuPDF = "^1.23.3" # The version in Fedora 39 colorama = "*" pyxdg = {version = "*", platform = "linux"} requests = 
"*" @@ -51,9 +52,6 @@ pytest-qt = "^4.2.0" pytest-cov = "^5.0.0" strip-ansi = "*" -[tool.poetry.group.qubes.dependencies] -pymupdf = "^1.23.6" - [tool.poetry.group.container.dependencies] pymupdf = "^1.24.10" diff --git a/tests/isolation_provider/base.py b/tests/isolation_provider/base.py index abf488961..6ba76ceee 100644 --- a/tests/isolation_provider/base.py +++ b/tests/isolation_provider/base.py @@ -29,7 +29,7 @@ def test_max_pages_server_enforcement( p = provider.start_doc_to_pixels_proc(doc) with pytest.raises(errors.ConverterProcException): - provider.doc_to_pixels(doc, tmpdir, p) + provider.convert_with_proc(doc, None, p) assert provider.get_proc_exception(p) == errors.MaxPagesException def test_max_pages_client_enforcement( @@ -46,7 +46,7 @@ def test_max_pages_client_enforcement( doc = Document(sample_doc) p = provider.start_doc_to_pixels_proc(doc) with pytest.raises(errors.MaxPagesException): - provider.doc_to_pixels(doc, tmpdir, p) + provider.convert_with_proc(doc, None, p) def test_max_dimensions( self, @@ -60,12 +60,12 @@ def test_max_dimensions( doc = Document(sample_bad_width) p = provider.start_doc_to_pixels_proc(doc) with pytest.raises(errors.MaxPageWidthException): - provider.doc_to_pixels(doc, tmpdir, p) + provider.convert_with_proc(doc, None, p) doc = Document(sample_bad_height) p = provider.start_doc_to_pixels_proc(doc) with pytest.raises(errors.MaxPageHeightException): - provider.doc_to_pixels(doc, tmpdir, p) + provider.convert_with_proc(doc, None, p) class IsolationProviderTermination: diff --git a/tests/isolation_provider/test_container.py b/tests/isolation_provider/test_container.py index 860cf9f54..48f8d68fb 100644 --- a/tests/isolation_provider/test_container.py +++ b/tests/isolation_provider/test_container.py @@ -21,31 +21,6 @@ def provider() -> Container: return Container() -class ContainerWait(Container): - """Container isolation provider that blocks until the container has started.""" - - def exec_container(self, *args, **kwargs): # type: ignore [no-untyped-def] - # Check every 100ms if a container with the expected name has showed up. - # Else, closing the file descriptors may not work. 
- name = kwargs["name"] - runtime = self.get_runtime() - p = super().exec_container(*args, **kwargs) - for i in range(50): - containers = subprocess.run( - [runtime, "ps"], capture_output=True - ).stdout.decode() - if name in containers: - return p - time.sleep(0.1) - - raise RuntimeError(f"Container {name} did not start within 5 seconds") - - -@pytest.fixture -def provider_wait() -> ContainerWait: - return ContainerWait() - - class TestContainer(IsolationProviderTest): pass diff --git a/tests/isolation_provider/test_dummy.py b/tests/isolation_provider/test_dummy.py index 100268769..198a6afb1 100644 --- a/tests/isolation_provider/test_dummy.py +++ b/tests/isolation_provider/test_dummy.py @@ -1,11 +1,9 @@ import os -import subprocess import pytest from pytest_mock import MockerFixture from dangerzone.conversion import errors -from dangerzone.document import Document from dangerzone.isolation_provider.base import IsolationProvider from dangerzone.isolation_provider.dummy import Dummy @@ -16,24 +14,6 @@ pytest.skip("Dummy conversion is not enabled", allow_module_level=True) -class DummyWait(Dummy): - """Dummy isolation provider that spawns a blocking process.""" - - def start_doc_to_pixels_proc(self, document: Document) -> subprocess.Popen: - return subprocess.Popen( - ["python3"], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - start_new_session=True, - ) - - -@pytest.fixture -def provider_wait() -> DummyWait: - return DummyWait() - - @pytest.fixture def provider() -> Dummy: return Dummy() @@ -42,21 +22,12 @@ def provider() -> Dummy: class TestDummyTermination(IsolationProviderTermination): def test_failed( self, - provider_wait: IsolationProvider, + provider: IsolationProvider, mocker: MockerFixture, ) -> None: mocker.patch.object( - provider_wait, + provider, "get_proc_exception", return_value=errors.DocFormatUnsupported(), ) - super().test_failed(provider_wait, mocker) - - def test_linger_unkillable( - self, - provider_wait: IsolationProvider, - mocker: MockerFixture, - ) -> None: - # We have to spawn a blocking process here, else we can't imitate an - # "unkillable" process. 
- super().test_linger_unkillable(provider_wait, mocker) + super().test_failed(provider, mocker) diff --git a/tests/isolation_provider/test_qubes.py b/tests/isolation_provider/test_qubes.py index 77ea93962..06ca112d2 100644 --- a/tests/isolation_provider/test_qubes.py +++ b/tests/isolation_provider/test_qubes.py @@ -20,11 +20,6 @@ pytest.skip("Dummy conversion is enabled", allow_module_level=True) -@pytest.fixture -def provider() -> Qubes: - return Qubes() - - class QubesWait(Qubes): """Qubes isolation provider that blocks until the disposable qube has started.""" @@ -53,7 +48,7 @@ def start_doc_to_pixels_proc(self, document: Document) -> subprocess.Popen: @pytest.fixture -def provider_wait() -> QubesWait: +def provider() -> QubesWait: return QubesWait() @@ -79,7 +74,7 @@ def test_out_of_ram( ) with pytest.raises(errors.ConverterProcException): doc = Document(sample_doc) - provider.doc_to_pixels(doc, tmpdir, proc) + provider.convert_with_proc(doc, None, proc) assert provider.get_proc_exception(proc) == errors.QubesQrexecFailed diff --git a/tests/test_cli.py b/tests/test_cli.py index f79a8cf93..df4fe676f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -10,16 +10,15 @@ import traceback from pathlib import Path from typing import Optional, Sequence -from unittest import mock import pytest from click.testing import CliRunner, Result +from pytest_mock import MockerFixture from strip_ansi import strip_ansi from dangerzone.cli import cli_main, display_banner from dangerzone.document import ARCHIVE_SUBDIR, SAFE_EXTENSION from dangerzone.isolation_provider.qubes import is_qubes_native_conversion -from dangerzone.util import get_resource_path from .conftest import for_each_doc, for_each_external_doc @@ -134,29 +133,19 @@ def run_cli( if os.environ.get("DUMMY_CONVERSION", False): args = ("--unsafe-dummy-conversion", *args) - with tempfile.TemporaryDirectory() as t: - tmp_dir = Path(t) - # TODO: Replace this with `contextlib.chdir()` [1], which was added in - # Python 3.11. - # - # [1]: https://docs.python.org/3/library/contextlib.html#contextlib.chdir - try: - if tmp_path is not None: - cwd = os.getcwd() - os.chdir(tmp_path) - - with mock.patch( - "dangerzone.isolation_provider.container.get_tmp_dir", - return_value=t, - ): - result = CliRunner().invoke(cli_main, args) - finally: - if tmp_path is not None: - os.chdir(cwd) - - if tmp_dir.exists(): - stale_files = list(tmp_dir.iterdir()) - assert not stale_files + # TODO: Replace this with `contextlib.chdir()` [1], which was added in + # Python 3.11. + # + # [1]: https://docs.python.org/3/library/contextlib.html#contextlib.chdir + try: + if tmp_path is not None: + cwd = os.getcwd() + os.chdir(tmp_path) + + result = CliRunner().invoke(cli_main, args) + finally: + if tmp_path is not None: + os.chdir(cwd) # XXX Print stdout so that junitXML exports with output capturing # actually include the stdout + stderr (they are combined into stdout) @@ -221,17 +210,20 @@ def test_output_filename_uncommon( result.assert_success() ### Test method for swallowed exception - def test_output_filename_same_file_dummy_fails(self) -> None: - resource_path = get_resource_path("dummy_document.pdf") - # Using the same filename for both input and output should fail. 
- result = self.run_cli( - [ - resource_path, - "--output-filename", - resource_path, - "--unsafe-dummy-conversion", - ] + def test_output_filename_pokemon_handler( + self, + sample_pdf: str, + mocker: MockerFixture, + ) -> None: + """Ensure that we catch top-level errors.""" + mock_conv = mocker.patch( + "dangerzone.isolation_provider.base.IsolationProvider.convert" ) + mock_conv.side_effect = Exception("It happens") + result = self.run_cli([sample_pdf]) + # FIXME: The following does not work, because the log is somehow not captured by + # Click's CliRunner. + # result.assert_failure(message="It happens") result.assert_failure() def test_output_filename_new_dir(self, sample_pdf: str) -> None: diff --git a/tests/test_ocr.py b/tests/test_ocr.py index 29d50f803..2b8836fa1 100644 --- a/tests/test_ocr.py +++ b/tests/test_ocr.py @@ -1,59 +1,16 @@ -import platform -import subprocess -from pathlib import Path - -import pytest - -from dangerzone.isolation_provider.container import Container -from dangerzone.isolation_provider.qubes import is_qubes_native_conversion +from dangerzone.isolation_provider.dummy import Dummy from dangerzone.logic import DangerzoneCore +from dangerzone.util import get_tessdata_dir -# TODO: Perform an equivalent test on Qubes. -# NOTE: We skip running this test on Windows/MacOS, because our current CI cannot run -# Docker in these platforms. It's not a problem anyways, because the result should be -# the same in all container-based platforms. -@pytest.mark.skipif( - platform.system() != "Linux" or is_qubes_native_conversion(), - reason="Container-specific", -) -def test_ocr_omissions() -> None: - # Create the command that will list all the installed languages in the container - # image. - command = [Container.get_runtime(), "run"] - command += Container.get_runtime_security_args() - command += [ - Container.CONTAINER_NAME, - "find", - "/usr/share/tessdata/", - "-name", - "*.traineddata", - ] - - # Run the command, strip any extra whitespace, and remove the following first line - # from the result: - # - # List of available languages in "/usr/share/tessdata/" ... - installed_langs_filenames = ( - subprocess.run(command, text=True, check=True, stdout=subprocess.PIPE) - .stdout.strip() - .split("\n") - ) - installed_langs = set( - [ - Path(filename).name.split(".traineddata")[0] - for filename in installed_langs_filenames - ] - ) - - # Remove the "osd" and "equ" languages from the list of installed languages, since - # they are not an actual language. Read more in: - # https://pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/ - installed_langs -= {"osd", "equ"} +def test_ocr_ommisions() -> None: + # Grab the languages that are available in the Tesseract data dir. + suffix_len = len(".traineddata") + available_langs = {f.name[:-suffix_len] for f in get_tessdata_dir().iterdir()} # Grab the languages that Dangerzone offers to the user through the GUI/CLI. - offered_langs = set(DangerzoneCore(Container()).ocr_languages.values()) + offered_langs = set(DangerzoneCore(Dummy()).ocr_languages.values()) - # Ensure that both the installed languages and the ones we offer to the user are the + # Ensure that both the available languages and the ones we offer to the user are the # same. - assert installed_langs == offered_langs + assert available_langs == offered_langs
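
Note: as a quick sanity check of the tessdata flow introduced above, here is a small standalone sketch (not part of the patch). It simply mirrors the logic of the updated `tests/test_ocr.py` and of `./install/common/download-tessdata.py`, and it assumes it is run from the repository root with the expected layout (`share/ocr-languages.json` and `share/tessdata/`); names such as the script filename below are taken from the diff, everything else is illustrative.

```python
#!/usr/bin/env python3
"""Illustrative check: do the downloaded Tesseract models match the OCR
languages that Dangerzone offers? Assumes the repo layout used in the patch:
share/ocr-languages.json (language name -> short code) and share/tessdata/
populated by ./install/common/download-tessdata.py."""

import json
import pathlib
import sys


def main() -> int:
    share_dir = pathlib.Path("share")
    tessdata_dir = share_dir / "tessdata"

    # Languages Dangerzone offers to users; the JSON values are the Tesseract
    # short codes (e.g. "eng"), matching what download-tessdata.py extracts.
    with open(share_dir / "ocr-languages.json") as f:
        offered = set(json.load(f).values())

    # Languages actually present as *.traineddata files in share/tessdata/.
    available = {
        p.name.removesuffix(".traineddata")
        for p in tessdata_dir.glob("*.traineddata")
    }

    missing = offered - available
    extra = available - offered
    if missing or extra:
        print(f"Missing: {sorted(missing)}")
        print(f"Unexpected: {sorted(extra)}")
        return 1

    print(f"OK: {len(available)} languages present in {tessdata_dir}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
```

Running it after `python3 ./install/common/download-tessdata.py` should print the number of languages found; a non-zero exit indicates a mismatch between `share/tessdata/` and the languages offered in the GUI/CLI, which is the same condition the new `tests/test_ocr.py` asserts against.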