Remove dead code

freedomofpress · Oct 8, 2024 · 1ab3aab
1 parent 62c3267
commit 1ab3aab
Show file tree

Hide file tree

Showing 8 changed files with 17 additions and 449 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -21,30 +21,6 @@ RUN case "$ARCH" in \
 RUN pip install -vv --break-system-packages --require-hashes -r /tmp/requirements.txt
 
 
-###########################################
-# Download Tesseract data
-
-FROM alpine:latest as tessdata-dl
-ARG TESSDATA_CHECKSUM=d0e3bb6f3b4e75748680524a1d116f2bfb145618f8ceed55b279d15098a530f9
-
-# Download the trained models from the latest GitHub release of Tesseract, and
-# store them under /usr/share/tessdata. This is basically what distro packages
-# do under the hood.
-#
-# Because the GitHub release contains more files than just the trained models,
-# we use `find` to fetch only the '*.traineddata' files in the top directory.
-#
-# Before we untar the models, we also check if the checksum is the expected one.
-RUN mkdir /usr/share/tessdata/ && mkdir tessdata && cd tessdata \
-    && TESSDATA_VERSION=$(wget -O- -nv https://api.github.com/repos/tesseract-ocr/tessdata_fast/releases/latest \
-        | sed -n 's/^.*"tag_name": "\([0-9.]\+\)".*$/\1/p') \
-    && wget https://github.com/tesseract-ocr/tessdata_fast/archive/$TESSDATA_VERSION/tessdata_fast-$TESSDATA_VERSION.tar.gz \
-    && echo "$TESSDATA_CHECKSUM  tessdata_fast-$TESSDATA_VERSION.tar.gz" | sha256sum -c \
-    && tar -xzvf tessdata_fast-$TESSDATA_VERSION.tar.gz -C . \
-    && find . -name '*.traineddata' -maxdepth 2 -exec cp {} /usr/share/tessdata/ \; \
-    && cd .. && rm -r tessdata
-
-
 ###########################################
 # Download H2ORestart
 FROM alpine:latest as h2orestart-dl
@@ -74,7 +50,6 @@ RUN apk --no-cache -U upgrade && \
 COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/fitz/ /usr/lib/python3.12/site-packages/fitz
 COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/pymupdf/ /usr/lib/python3.12/site-packages/pymupdf
 COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/PyMuPDF.libs/ /usr/lib/python3.12/site-packages/PyMuPDF.libs
-COPY --from=tessdata-dl /usr/share/tessdata/ /usr/share/tessdata
 COPY --from=h2orestart-dl /libreoffice_ext/ /libreoffice_ext
 
 RUN install -dm777 "/usr/lib/libreoffice/share/extensions/"

diff --git a/dangerzone/conversion/common.py b/dangerzone/conversion/common.py
@@ -13,15 +13,6 @@ def running_on_qubes() -> bool:
     return os.path.exists("/usr/share/qubes/marker-vm")
 
 
-def get_tessdata_dir() -> str:
-    if os.environ.get("TESSDATA_PREFIX"):
-        return os.environ["TESSDATA_PREFIX"]
-    elif running_on_qubes():
-        return "/usr/share/tesseract/tessdata/"
-    else:
-        return "/usr/share/tessdata/"
-
-
 class DangerzoneConverter:
     def __init__(self, progress_callback: Optional[Callable] = None) -> None:
         self.percentage: float = 0.0

diff --git a/dangerzone/conversion/pixels_to_pdf.py b/dangerzone/conversion/pixels_to_pdf.py
diff --git a/dangerzone/isolation_provider/base.py b/dangerzone/isolation_provider/base.py
@@ -221,12 +221,6 @@ def _convert(
         text = "Converted document"
         self.print_progress(document, False, text, percentage)
 
-    @abstractmethod
-    def pixels_to_pdf(
-        self, document: Document, tempdir: str, ocr_lang: Optional[str]
-    ) -> None:
-        pass
-
     def print_progress(
         self, document: Document, error: bool, text: str, percentage: float
     ) -> None:
@@ -353,74 +347,3 @@ def doc_to_pixels_proc(
                     f"{debug_log}"  # no need for an extra newline here
                     f"{DOC_TO_PIXELS_LOG_END}"
                 )
-
-# From global_common:
-
-# def validate_convert_to_pixel_output(self, common, output):
-#     """
-#     Take the output from the convert to pixels tasks and validate it. Returns
-#     a tuple like: (success (boolean), error_message (str))
-#     """
-#     max_image_width = 10000
-#     max_image_height = 10000
-
-#     # Did we hit an error?
-#     for line in output.split("\n"):
-#         if (
-#             "failed:" in line
-#             or "The document format is not supported" in line
-#             or "Error" in line
-#         ):
-#             return False, output
-
-#     # How many pages was that?
-#     num_pages = None
-#     for line in output.split("\n"):
-#         if line.startswith("Document has "):
-#             num_pages = line.split(" ")[2]
-#             break
-#     if not num_pages or not num_pages.isdigit() or int(num_pages) <= 0:
-#         return False, "Invalid number of pages returned"
-#     num_pages = int(num_pages)
-
-#     # Make sure we have the files we expect
-#     expected_filenames = []
-#     for i in range(1, num_pages + 1):
-#         expected_filenames += [
-#             f"page-{i}.rgb",
-#             f"page-{i}.width",
-#             f"page-{i}.height",
-#         ]
-#     expected_filenames.sort()
-#     actual_filenames = os.listdir(common.pixel_dir.name)
-#     actual_filenames.sort()
-
-#     if expected_filenames != actual_filenames:
-#         return (
-#             False,
-#             f"We expected these files:\n{expected_filenames}\n\nBut we got these files:\n{actual_filenames}",
-#         )
-
-#     # Make sure the files are the correct sizes
-#     for i in range(1, num_pages + 1):
-#         with open(f"{common.pixel_dir.name}/page-{i}.width") as f:
-#             w_str = f.read().strip()
-#         with open(f"{common.pixel_dir.name}/page-{i}.height") as f:
-#             h_str = f.read().strip()
-#         w = int(w_str)
-#         h = int(h_str)
-#         if (
-#             not w_str.isdigit()
-#             or not h_str.isdigit()
-#             or w <= 0
-#             or w > max_image_width
-#             or h <= 0
-#             or h > max_image_height
-#         ):
-#             return False, f"Page {i} has invalid geometry"
-
-#         # Make sure the RGB file is the correct size
-#         if os.path.getsize(f"{common.pixel_dir.name}/page-{i}.rgb") != w * h * 3:
-#             return False, f"Page {i} has an invalid RGB file size"
-
-#     return True, True