From 3f86e7b4654e0aea458b54248288eb9e2b1e86f8 Mon Sep 17 00:00:00 2001 From: Alex Pyrgiotis Date: Thu, 25 Jul 2024 12:41:23 +0300 Subject: [PATCH] Make PyMuPDF always log to stderr PyMuPDF logs to stdout by default, which is problematic because we use the stdout of the conversion process to read the pixel stream of a document. Make PyMuPDF always log to stderr, by setting the following environment variables: PYMUPDF_MESSAGE and PYMUPDF_LOG. Fixes #877 --- dangerzone/conversion/doc_to_pixels.py | 11 +++++++++++ dangerzone/conversion/pixels_to_pdf.py | 25 +++++++++++++++++-------- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/dangerzone/conversion/doc_to_pixels.py b/dangerzone/conversion/doc_to_pixels.py index 4780e37cf..673760701 100644 --- a/dangerzone/conversion/doc_to_pixels.py +++ b/dangerzone/conversion/doc_to_pixels.py @@ -3,6 +3,17 @@ import sys from typing import Dict, Optional +# XXX: PyMUPDF logs to stdout by default [1]. The PyMuPDF devs provide a way [2] to log to +# stderr, but it's based on environment variables. These envvars are consulted at import +# time [3], so we have to set them here, before we import `fitz`. +# +# [1] https://github.com/freedomofpress/dangerzone/issues/877 +# [2] https://github.com/pymupdf/PyMuPDF/issues/3135#issuecomment-1992625724 +# [3] https://github.com/pymupdf/PyMuPDF/blob/9717935eeb2d50d15440d62575878214226795f9/src/__init__.py#L62-L63 +os.environ["PYMUPDF_MESSAGE"] = "fd:2" +os.environ["PYMUPDF_LOG"] = "fd:2" + + import fitz import magic diff --git a/dangerzone/conversion/pixels_to_pdf.py b/dangerzone/conversion/pixels_to_pdf.py index 55271c30d..a5a5ba82d 100644 --- a/dangerzone/conversion/pixels_to_pdf.py +++ b/dangerzone/conversion/pixels_to_pdf.py @@ -16,6 +16,16 @@ from .common import DEFAULT_DPI, DangerzoneConverter, get_tessdata_dir, running_on_qubes +# XXX: PyMUPDF logs to stdout by default [1]. The PyMuPDF devs provide a way [2] to log to +# stderr, but it's based on environment variables. 
These envvars are consulted at import +# time [3], so we have to set them here, before we import `fitz`. +# +# [1] https://github.com/freedomofpress/dangerzone/issues/877 +# [2] https://github.com/pymupdf/PyMuPDF/issues/3135#issuecomment-1992625724 +# [3] https://github.com/pymupdf/PyMuPDF/blob/9717935eeb2d50d15440d62575878214226795f9/src/__init__.py#L62-L63 +os.environ["PYMUPDF_MESSAGE"] = "fd:2" +os.environ["PYMUPDF_LOG"] = "fd:2" + class PixelsToPDF(DangerzoneConverter): async def convert( @@ -50,14 +60,13 @@ async def convert( # The first few operations happen on a per-page basis. page_size = len(untrusted_rgb_data) total_size += page_size - with contextlib.redirect_stdout(io.StringIO()): - pixmap = fitz.Pixmap( - fitz.Colorspace(fitz.CS_RGB), - width, - height, - untrusted_rgb_data, - False, - ) + pixmap = fitz.Pixmap( + fitz.Colorspace(fitz.CS_RGB), + width, + height, + untrusted_rgb_data, + False, + ) pixmap.set_dpi(DEFAULT_DPI, DEFAULT_DPI) if ocr_lang: # OCR the document self.update_progress(