From 3f86e7b4654e0aea458b54248288eb9e2b1e86f8 Mon Sep 17 00:00:00 2001
From: Alex Pyrgiotis <alex.p@freedom.press>
Date: Thu, 25 Jul 2024 12:41:23 +0300
Subject: [PATCH] Make PyMuPDF always log to stderr

PyMuPDF logs to stdout by default, which is problematic because we use
the stdout of the conversion process to read the pixel stream of a
document.

Make PyMuPDF always log to stderr, by setting the following environment
variables: PYMUPDF_MESSAGE and PYMUPDF_LOG.

Fixes #877
---
 dangerzone/conversion/doc_to_pixels.py | 11 +++++++++++
 dangerzone/conversion/pixels_to_pdf.py | 25 +++++++++++++++++--------
 2 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/dangerzone/conversion/doc_to_pixels.py b/dangerzone/conversion/doc_to_pixels.py
index 4780e37cf..673760701 100644
--- a/dangerzone/conversion/doc_to_pixels.py
+++ b/dangerzone/conversion/doc_to_pixels.py
@@ -3,6 +3,17 @@
 import sys
 from typing import Dict, Optional
 
+# XXX: PyMuPDF logs to stdout by default [1]. The PyMuPDF devs provide a way [2] to log to
+# stderr, but it's based on environment variables. These envvars are consulted at import
+# time [3], so we have to set them here, before we import `fitz`.
+#
+# [1] https://github.com/freedomofpress/dangerzone/issues/877
+# [2] https://github.com/pymupdf/PyMuPDF/issues/3135#issuecomment-1992625724
+# [3] https://github.com/pymupdf/PyMuPDF/blob/9717935eeb2d50d15440d62575878214226795f9/src/__init__.py#L62-L63
+os.environ["PYMUPDF_MESSAGE"] = "fd:2"
+os.environ["PYMUPDF_LOG"] = "fd:2"
+
+
 import fitz
 import magic
 
diff --git a/dangerzone/conversion/pixels_to_pdf.py b/dangerzone/conversion/pixels_to_pdf.py
index 55271c30d..a5a5ba82d 100644
--- a/dangerzone/conversion/pixels_to_pdf.py
+++ b/dangerzone/conversion/pixels_to_pdf.py
@@ -16,6 +16,16 @@
 
 from .common import DEFAULT_DPI, DangerzoneConverter, get_tessdata_dir, running_on_qubes
 
+# XXX: PyMuPDF logs to stdout by default [1]. The PyMuPDF devs provide a way [2] to log to
+# stderr, but it's based on environment variables. These envvars are consulted at import
+# time [3], so we have to set them here, before we import `fitz`.
+#
+# [1] https://github.com/freedomofpress/dangerzone/issues/877
+# [2] https://github.com/pymupdf/PyMuPDF/issues/3135#issuecomment-1992625724
+# [3] https://github.com/pymupdf/PyMuPDF/blob/9717935eeb2d50d15440d62575878214226795f9/src/__init__.py#L62-L63
+os.environ["PYMUPDF_MESSAGE"] = "fd:2"
+os.environ["PYMUPDF_LOG"] = "fd:2"
+
 
 class PixelsToPDF(DangerzoneConverter):
     async def convert(
@@ -50,14 +60,13 @@ async def convert(
             # The first few operations happen on a per-page basis.
             page_size = len(untrusted_rgb_data)
             total_size += page_size
-            with contextlib.redirect_stdout(io.StringIO()):
-                pixmap = fitz.Pixmap(
-                    fitz.Colorspace(fitz.CS_RGB),
-                    width,
-                    height,
-                    untrusted_rgb_data,
-                    False,
-                )
+            pixmap = fitz.Pixmap(
+                fitz.Colorspace(fitz.CS_RGB),
+                width,
+                height,
+                untrusted_rgb_data,
+                False,
+            )
             pixmap.set_dpi(DEFAULT_DPI, DEFAULT_DPI)
             if ocr_lang:  # OCR the document
                 self.update_progress(