From 1ab3aab08e28240122f3a87d9609fa1760d1e2e6 Mon Sep 17 00:00:00 2001 From: Alex Pyrgiotis Date: Thu, 14 Mar 2024 10:46:10 +0200 Subject: [PATCH] Remove dead code --- Dockerfile | 25 ---- dangerzone/conversion/common.py | 9 -- dangerzone/conversion/pixels_to_pdf.py | 161 --------------------- dangerzone/isolation_provider/base.py | 77 ---------- dangerzone/isolation_provider/container.py | 116 +-------------- dangerzone/isolation_provider/qubes.py | 30 +--- dangerzone/util.py | 12 -- tests/test_cli.py | 36 ++--- 8 files changed, 17 insertions(+), 449 deletions(-) delete mode 100644 dangerzone/conversion/pixels_to_pdf.py diff --git a/Dockerfile b/Dockerfile index 3b5c565a7..83896ebec 100644 --- a/Dockerfile +++ b/Dockerfile @@ -21,30 +21,6 @@ RUN case "$ARCH" in \ RUN pip install -vv --break-system-packages --require-hashes -r /tmp/requirements.txt -########################################### -# Download Tesseract data - -FROM alpine:latest as tessdata-dl -ARG TESSDATA_CHECKSUM=d0e3bb6f3b4e75748680524a1d116f2bfb145618f8ceed55b279d15098a530f9 - -# Download the trained models from the latest GitHub release of Tesseract, and -# store them under /usr/share/tessdata. This is basically what distro packages -# do under the hood. -# -# Because the GitHub release contains more files than just the trained models, -# we use `find` to fetch only the '*.traineddata' files in the top directory. -# -# Before we untar the models, we also check if the checksum is the expected one. -RUN mkdir /usr/share/tessdata/ && mkdir tessdata && cd tessdata \ - && TESSDATA_VERSION=$(wget -O- -nv https://api.github.com/repos/tesseract-ocr/tessdata_fast/releases/latest \ - | sed -n 's/^.*"tag_name": "\([0-9.]\+\)".*$/\1/p') \ - && wget https://github.com/tesseract-ocr/tessdata_fast/archive/$TESSDATA_VERSION/tessdata_fast-$TESSDATA_VERSION.tar.gz \ - && echo "$TESSDATA_CHECKSUM tessdata_fast-$TESSDATA_VERSION.tar.gz" | sha256sum -c \ - && tar -xzvf tessdata_fast-$TESSDATA_VERSION.tar.gz -C . \ - && find . -name '*.traineddata' -maxdepth 2 -exec cp {} /usr/share/tessdata/ \; \ - && cd .. && rm -r tessdata - - ########################################### # Download H2ORestart FROM alpine:latest as h2orestart-dl @@ -74,7 +50,6 @@ RUN apk --no-cache -U upgrade && \ COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/fitz/ /usr/lib/python3.12/site-packages/fitz COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/pymupdf/ /usr/lib/python3.12/site-packages/pymupdf COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/PyMuPDF.libs/ /usr/lib/python3.12/site-packages/PyMuPDF.libs -COPY --from=tessdata-dl /usr/share/tessdata/ /usr/share/tessdata COPY --from=h2orestart-dl /libreoffice_ext/ /libreoffice_ext RUN install -dm777 "/usr/lib/libreoffice/share/extensions/" diff --git a/dangerzone/conversion/common.py b/dangerzone/conversion/common.py index 882b1ec27..0940a5661 100644 --- a/dangerzone/conversion/common.py +++ b/dangerzone/conversion/common.py @@ -13,15 +13,6 @@ def running_on_qubes() -> bool: return os.path.exists("/usr/share/qubes/marker-vm") -def get_tessdata_dir() -> str: - if os.environ.get("TESSDATA_PREFIX"): - return os.environ["TESSDATA_PREFIX"] - elif running_on_qubes(): - return "/usr/share/tesseract/tessdata/" - else: - return "/usr/share/tessdata/" - - class DangerzoneConverter: def __init__(self, progress_callback: Optional[Callable] = None) -> None: self.percentage: float = 0.0 diff --git a/dangerzone/conversion/pixels_to_pdf.py b/dangerzone/conversion/pixels_to_pdf.py deleted file mode 100644 index a5a5ba82d..000000000 --- a/dangerzone/conversion/pixels_to_pdf.py +++ /dev/null @@ -1,161 +0,0 @@ -""" -Here are the steps, with progress bar percentages: - -- 50%-95%: Convert each page of pixels into a PDF (each page takes 45/n%, where n is the number of pages) -- 95%-100%: Compress the final PDF -""" - -import asyncio -import contextlib -import glob -import io -import json -import os -import sys -from typing import Optional - -from .common import DEFAULT_DPI, DangerzoneConverter, get_tessdata_dir, running_on_qubes - -# XXX: PyMUPDF logs to stdout by default [1]. The PyMuPDF devs provide a way [2] to log to -# stderr, but it's based on environment variables. These envvars are consulted at import -# time [3], so we have to set them here, before we import `fitz`. -# -# [1] https://github.com/freedomofpress/dangerzone/issues/877 -# [2] https://github.com/pymupdf/PyMuPDF/issues/3135#issuecomment-1992625724 -# [3] https://github.com/pymupdf/PyMuPDF/blob/9717935eeb2d50d15440d62575878214226795f9/src/__init__.py#L62-L63 -os.environ["PYMUPDF_MESSAGE"] = "fd:2" -os.environ["PYMUPDF_LOG"] = "fd:2" - - -class PixelsToPDF(DangerzoneConverter): - async def convert( - self, ocr_lang: Optional[str] = None, tempdir: Optional[str] = None - ) -> None: - self.percentage = 50.0 - if tempdir is None: - tempdir = "/safezone" - - # XXX lazy loading of fitz module to avoid import issues on non-Qubes systems - import fitz - - num_pages = len(glob.glob(f"{tempdir}/pixels/page-*.rgb")) - total_size = 0.0 - - safe_doc = fitz.Document() - - # Convert RGB files to PDF files - percentage_per_page = 45.0 / num_pages - for page_num in range(1, num_pages + 1): - filename_base = f"{tempdir}/pixels/page-{page_num}" - rgb_filename = f"{filename_base}.rgb" - width_filename = f"{filename_base}.width" - height_filename = f"{filename_base}.height" - - with open(width_filename) as f: - width = int(f.read().strip()) - with open(height_filename) as f: - height = int(f.read().strip()) - with open(rgb_filename, "rb") as rgb_f: - untrusted_rgb_data = rgb_f.read() - # The first few operations happen on a per-page basis. - page_size = len(untrusted_rgb_data) - total_size += page_size - pixmap = fitz.Pixmap( - fitz.Colorspace(fitz.CS_RGB), - width, - height, - untrusted_rgb_data, - False, - ) - pixmap.set_dpi(DEFAULT_DPI, DEFAULT_DPI) - if ocr_lang: # OCR the document - self.update_progress( - f"Converting page {page_num}/{num_pages} from pixels to searchable PDF" - ) - if int(fitz.version[2]) >= 20230621000001: - page_pdf_bytes = pixmap.pdfocr_tobytes( - compress=True, - language=ocr_lang, - tessdata=get_tessdata_dir(), - ) - else: - # XXX: In PyMuPDF v1.22.5, the function signature of - # `pdfocr_tobytes()` / `pdfocr_save()` was extended with an argument - # to explicitly set the Tesseract data dir [1]. - # - # In earlier versions, the PyMuPDF developers recommend setting this - # path via the TESSDATA_PREFIX environment variable. In practice, - # this environment variable is read at import time, so subsequent - # changes to the environment variable are not tracked [2]. - # - # To make things worse, any attempt to alter the internal attribute - # (`fitz.TESSDATA_PREFIX`) makes no difference as well, when using - # the OCR functions. That's due to the way imports work in `fitz`, - # where somehow the internal `fitz.fitz` module is shadowed. - # - # A hacky solution is to grab the `fitz.fitz` module from - # `sys.modules`, and set there the TESSDATA_PREFIX variable. We can - # get away with this hack because we have a proper solution for - # subsequent PyMuPDF versions, and we know that nothing will change - # in older versions. - # - # TODO: Remove after oldest distro has PyMuPDF >= v1.22.5 - # - # [1]: https://pymupdf.readthedocs.io/en/latest/pixmap.html#Pixmap.pdfocr_save - # [2]: https://github.com/pymupdf/PyMuPDF/blob/0368e56cfa6afb55bcf6c726e7f51a2a16a5ccba/fitz/fitz.i#L308 - sys.modules["fitz.fitz"].TESSDATA_PREFIX = get_tessdata_dir() # type: ignore [attr-defined] - - page_pdf_bytes = pixmap.pdfocr_tobytes( - compress=True, - language=ocr_lang, - ) - ocr_pdf = fitz.open("pdf", page_pdf_bytes) - else: # Don't OCR - self.update_progress( - f"Converting page {page_num}/{num_pages} from pixels to PDF" - ) - page_doc = fitz.Document() - page_doc.insert_file(pixmap) - page_pdf_bytes = page_doc.tobytes(deflate_images=True) - - safe_doc.insert_pdf(fitz.open("pdf", page_pdf_bytes)) - self.percentage += percentage_per_page - - self.percentage = 100.0 - self.update_progress("Safe PDF created") - - # Move converted files into /safezone - if running_on_qubes(): - safe_pdf_path = f"{tempdir}/safe-output-compressed.pdf" - else: - safe_pdf_path = f"/safezone/safe-output-compressed.pdf" - - safe_doc.save(safe_pdf_path, deflate_images=True) - - def update_progress(self, text: str, *, error: bool = False) -> None: - if running_on_qubes(): - if self.progress_callback: - self.progress_callback(error, text, self.percentage) - else: - print( - json.dumps( - {"error": error, "text": text, "percentage": self.percentage} - ) - ) - sys.stdout.flush() - - -async def main() -> int: - ocr_lang = os.environ.get("OCR_LANGUAGE") if os.environ.get("OCR") == "1" else None - converter = PixelsToPDF() - - try: - await converter.convert(ocr_lang) - return 0 - except (RuntimeError, ValueError) as e: - converter.update_progress(str(e), error=True) - return 1 - - -if __name__ == "__main__": - sys.exit(asyncio.run(main())) diff --git a/dangerzone/isolation_provider/base.py b/dangerzone/isolation_provider/base.py index 43431eaec..d58587202 100644 --- a/dangerzone/isolation_provider/base.py +++ b/dangerzone/isolation_provider/base.py @@ -221,12 +221,6 @@ def _convert( text = "Converted document" self.print_progress(document, False, text, percentage) - @abstractmethod - def pixels_to_pdf( - self, document: Document, tempdir: str, ocr_lang: Optional[str] - ) -> None: - pass - def print_progress( self, document: Document, error: bool, text: str, percentage: float ) -> None: @@ -353,74 +347,3 @@ def doc_to_pixels_proc( f"{debug_log}" # no need for an extra newline here f"{DOC_TO_PIXELS_LOG_END}" ) - -# From global_common: - -# def validate_convert_to_pixel_output(self, common, output): -# """ -# Take the output from the convert to pixels tasks and validate it. Returns -# a tuple like: (success (boolean), error_message (str)) -# """ -# max_image_width = 10000 -# max_image_height = 10000 - -# # Did we hit an error? -# for line in output.split("\n"): -# if ( -# "failed:" in line -# or "The document format is not supported" in line -# or "Error" in line -# ): -# return False, output - -# # How many pages was that? -# num_pages = None -# for line in output.split("\n"): -# if line.startswith("Document has "): -# num_pages = line.split(" ")[2] -# break -# if not num_pages or not num_pages.isdigit() or int(num_pages) <= 0: -# return False, "Invalid number of pages returned" -# num_pages = int(num_pages) - -# # Make sure we have the files we expect -# expected_filenames = [] -# for i in range(1, num_pages + 1): -# expected_filenames += [ -# f"page-{i}.rgb", -# f"page-{i}.width", -# f"page-{i}.height", -# ] -# expected_filenames.sort() -# actual_filenames = os.listdir(common.pixel_dir.name) -# actual_filenames.sort() - -# if expected_filenames != actual_filenames: -# return ( -# False, -# f"We expected these files:\n{expected_filenames}\n\nBut we got these files:\n{actual_filenames}", -# ) - -# # Make sure the files are the correct sizes -# for i in range(1, num_pages + 1): -# with open(f"{common.pixel_dir.name}/page-{i}.width") as f: -# w_str = f.read().strip() -# with open(f"{common.pixel_dir.name}/page-{i}.height") as f: -# h_str = f.read().strip() -# w = int(w_str) -# h = int(h_str) -# if ( -# not w_str.isdigit() -# or not h_str.isdigit() -# or w <= 0 -# or w > max_image_width -# or h <= 0 -# or h > max_image_height -# ): -# return False, f"Page {i} has invalid geometry" - -# # Make sure the RGB file is the correct size -# if os.path.getsize(f"{common.pixel_dir.name}/page-{i}.rgb") != w * h * 3: -# return False, f"Page {i} has an invalid RGB file size" - -# return True, True diff --git a/dangerzone/isolation_provider/container.py b/dangerzone/isolation_provider/container.py index fe6626c17..b0f488049 100644 --- a/dangerzone/isolation_provider/container.py +++ b/dangerzone/isolation_provider/container.py @@ -1,24 +1,15 @@ import gzip -import json import logging import os import platform import shlex import shutil import subprocess -import sys -from typing import Any, List, Optional, Tuple +from typing import List, Tuple -from ..conversion import errors from ..document import Document -from ..util import get_tmp_dir # NOQA : required for mocking in our tests. from ..util import get_resource_path, get_subprocess_startupinfo -from .base import ( - PIXELS_TO_PDF_LOG_END, - PIXELS_TO_PDF_LOG_START, - IsolationProvider, - terminate_process_group, -) +from .base import IsolationProvider, terminate_process_group TIMEOUT_KILL = 5 # Timeout in seconds until the kill command returns. @@ -234,31 +225,6 @@ def pixels_to_pdf_container_name(self, document: Document) -> str: """Unique container name for the pixels-to-pdf phase.""" return f"dangerzone-pixels-to-pdf-{document.id}" - def assert_field_type(self, val: Any, _type: object) -> None: - # XXX: Use a stricter check than isinstance because `bool` is a subclass of - # `int`. - # - # See https://stackoverflow.com/a/37888668 - if type(val) is not _type: - raise ValueError("Status field has incorrect type") - - def parse_progress_trusted(self, document: Document, line: str) -> None: - """ - Parses a line returned by the container. - """ - try: - status = json.loads(line) - text = status["text"] - self.assert_field_type(text, str) - error = status["error"] - self.assert_field_type(error, bool) - percentage = status["percentage"] - self.assert_field_type(percentage, float) - self.print_progress(document, error, text, percentage) - except Exception: - error_message = f"Invalid JSON returned from container:\n\n\t {line}" - self.print_progress(document, True, error_message, -1) - def exec( self, args: List[str], @@ -337,84 +303,6 @@ def kill_container(self, name: str) -> None: f"Unexpected error occurred while killing container '{name}': {str(e)}" ) - def pixels_to_pdf( - self, document: Document, tempdir: str, ocr_lang: Optional[str] - ) -> None: - # Convert pixels to safe PDF - command = [ - "/usr/bin/python3", - "-m", - "dangerzone.conversion.pixels_to_pdf", - ] - extra_args = [ - "-v", - f"{tempdir}:/safezone:Z", - "-e", - f"OCR={0 if ocr_lang is None else 1}", - "-e", - f"OCR_LANGUAGE={ocr_lang}", - ] - # XXX: Until #748 gets merged, we have to run our pixels to PDF phase in a - # container, which involves mounting two temp dirs. This does not bode well with - # gVisor for two reasons: - # - # 1. Our gVisor integration chroot()s into /home/dangerzone/dangerzone-image/rootfs, - # meaning that the location of the temp dirs must be relevant to that path. - # 2. Reading and writing to these temp dirs requires permissions which are not - # available to the user within gVisor's user namespace. - # - # For these reasons, and because the pixels to PDF phase is more trusted (and - # will soon stop being containerized), we circumvent gVisor support by doing the - # following: - # - # 1. Override our entrypoint script with a no-op command (/usr/bin/env). - # 2. Set the PYTHONPATH so that we can import the Python code within - # /home/dangerzone/dangerzone-image/rootfs - # 3. Run the container as the root user, so that it can always write to the - # mounted directories. This container is trusted, so running as root has no - # impact to the security of Dangerzone. - img_root = "/home/dangerzone/dangerzone-image/rootfs" - extra_args += [ - "--entrypoint", - "/usr/bin/env", - "-e", - f"PYTHONPATH={img_root}/opt/dangerzone:{img_root}/usr/lib/python3.12/site-packages", - "-e", - f"TESSDATA_PREFIX={img_root}/usr/share/tessdata", - "-u", - "root", - ] - - name = self.pixels_to_pdf_container_name(document) - pixels_to_pdf_proc = self.exec_container(command, name, extra_args) - if pixels_to_pdf_proc.stdout: - for line in pixels_to_pdf_proc.stdout: - self.parse_progress_trusted(document, line.decode()) - error_code = pixels_to_pdf_proc.wait() - - # In case of a dev run, log everything from the second container. - if getattr(sys, "dangerzone_dev", False): - assert pixels_to_pdf_proc.stderr - out = pixels_to_pdf_proc.stderr.read().decode() - text = ( - f"Conversion output: (pixels to PDF)\n" - f"{PIXELS_TO_PDF_LOG_START}\n{out}\n{PIXELS_TO_PDF_LOG_END}" - ) - log.info(text) - - if error_code != 0: - log.error("pixels-to-pdf failed") - raise errors.exception_from_error_code(error_code) - else: - # Move the final file to the right place - if os.path.exists(document.output_filename): - os.remove(document.output_filename) - - container_output_filename = os.path.join( - tempdir, "safe-output-compressed.pdf" - ) - shutil.move(container_output_filename, document.output_filename) - def start_doc_to_pixels_proc(self, document: Document) -> subprocess.Popen: # Convert document to pixels command = [ diff --git a/dangerzone/isolation_provider/qubes.py b/dangerzone/isolation_provider/qubes.py index 34cc9004a..dd1f18169 100644 --- a/dangerzone/isolation_provider/qubes.py +++ b/dangerzone/isolation_provider/qubes.py @@ -1,20 +1,16 @@ -import asyncio import io import logging import os -import shutil import subprocess import sys import zipfile from pathlib import Path -from typing import IO, Optional +from typing import IO -from ..conversion import errors from ..conversion.common import running_on_qubes -from ..conversion.pixels_to_pdf import PixelsToPDF from ..document import Document from ..util import get_resource_path -from .base import PIXELS_TO_PDF_LOG_END, PIXELS_TO_PDF_LOG_START, IsolationProvider +from .base import IsolationProvider log = logging.getLogger(__name__) @@ -25,28 +21,6 @@ class Qubes(IsolationProvider): def install(self) -> bool: return True - def pixels_to_pdf( - self, document: Document, tempdir: str, ocr_lang: Optional[str] - ) -> None: - def print_progress_wrapper(error: bool, text: str, percentage: float) -> None: - self.print_progress(document, error, text, percentage) - - converter = PixelsToPDF(progress_callback=print_progress_wrapper) - try: - asyncio.run(converter.convert(ocr_lang, tempdir)) - except (RuntimeError, ValueError) as e: - raise errors.UnexpectedConversionError(str(e)) - finally: - if getattr(sys, "dangerzone_dev", False): - out = converter.captured_output.decode() - text = ( - f"Conversion output: (pixels to PDF)\n" - f"{PIXELS_TO_PDF_LOG_START}\n{out}{PIXELS_TO_PDF_LOG_END}" - ) - log.info(text) - - shutil.move(f"{tempdir}/safe-output-compressed.pdf", document.output_filename) - def get_max_parallel_conversions(self) -> int: return 1 diff --git a/dangerzone/util.py b/dangerzone/util.py index bbedfdbde..f53ac4bc7 100644 --- a/dangerzone/util.py +++ b/dangerzone/util.py @@ -4,7 +4,6 @@ import subprocess import sys import unicodedata -from typing import Optional import appdirs @@ -13,17 +12,6 @@ def get_config_dir() -> str: return appdirs.user_config_dir("dangerzone") -def get_tmp_dir() -> Optional[str]: - """Get the parent dir for the Dangerzone temporary dirs. - - This function returns the parent directory where Dangerzone will store its temporary - directories. The default behavior is to let Python choose for us (e.g., in `/tmp` - for Linux), which is why we return None. However, we still need to define this - function in order to be able to set this dir via mocking in our tests. - """ - return None - - def get_resource_path(filename: str) -> str: if getattr(sys, "dangerzone_dev", False): # Look for resources directory relative to python file diff --git a/tests/test_cli.py b/tests/test_cli.py index f79a8cf93..21f934f90 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -134,29 +134,19 @@ def run_cli( if os.environ.get("DUMMY_CONVERSION", False): args = ("--unsafe-dummy-conversion", *args) - with tempfile.TemporaryDirectory() as t: - tmp_dir = Path(t) - # TODO: Replace this with `contextlib.chdir()` [1], which was added in - # Python 3.11. - # - # [1]: https://docs.python.org/3/library/contextlib.html#contextlib.chdir - try: - if tmp_path is not None: - cwd = os.getcwd() - os.chdir(tmp_path) - - with mock.patch( - "dangerzone.isolation_provider.container.get_tmp_dir", - return_value=t, - ): - result = CliRunner().invoke(cli_main, args) - finally: - if tmp_path is not None: - os.chdir(cwd) - - if tmp_dir.exists(): - stale_files = list(tmp_dir.iterdir()) - assert not stale_files + # TODO: Replace this with `contextlib.chdir()` [1], which was added in + # Python 3.11. + # + # [1]: https://docs.python.org/3/library/contextlib.html#contextlib.chdir + try: + if tmp_path is not None: + cwd = os.getcwd() + os.chdir(tmp_path) + + result = CliRunner().invoke(cli_main, args) + finally: + if tmp_path is not None: + os.chdir(cwd) # XXX Print stdout so that junitXML exports with output capturing # actually include the stdout + stderr (they are combined into stdout)