Skip to content

Commit

Permalink
Remove dead code
Browse files Browse the repository at this point in the history
  • Loading branch information
apyrgio committed Oct 8, 2024
1 parent 62c3267 commit 1ab3aab
Show file tree
Hide file tree
Showing 8 changed files with 17 additions and 449 deletions.
25 changes: 0 additions & 25 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,30 +21,6 @@ RUN case "$ARCH" in \
RUN pip install -vv --break-system-packages --require-hashes -r /tmp/requirements.txt


###########################################
# Download Tesseract data

FROM alpine:latest as tessdata-dl
ARG TESSDATA_CHECKSUM=d0e3bb6f3b4e75748680524a1d116f2bfb145618f8ceed55b279d15098a530f9

# Download the trained models from the latest GitHub release of Tesseract, and
# store them under /usr/share/tessdata. This is basically what distro packages
# do under the hood.
#
# Because the GitHub release contains more files than just the trained models,
# we use `find` to fetch only the '*.traineddata' files in the top directory.
#
# Before we untar the models, we also check if the checksum is the expected one.
RUN mkdir /usr/share/tessdata/ && mkdir tessdata && cd tessdata \
&& TESSDATA_VERSION=$(wget -O- -nv https://api.github.com/repos/tesseract-ocr/tessdata_fast/releases/latest \
| sed -n 's/^.*"tag_name": "\([0-9.]\+\)".*$/\1/p') \
&& wget https://github.com/tesseract-ocr/tessdata_fast/archive/$TESSDATA_VERSION/tessdata_fast-$TESSDATA_VERSION.tar.gz \
&& echo "$TESSDATA_CHECKSUM tessdata_fast-$TESSDATA_VERSION.tar.gz" | sha256sum -c \
&& tar -xzvf tessdata_fast-$TESSDATA_VERSION.tar.gz -C . \
&& find . -name '*.traineddata' -maxdepth 2 -exec cp {} /usr/share/tessdata/ \; \
&& cd .. && rm -r tessdata


###########################################
# Download H2ORestart
FROM alpine:latest as h2orestart-dl
Expand Down Expand Up @@ -74,7 +50,6 @@ RUN apk --no-cache -U upgrade && \
COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/fitz/ /usr/lib/python3.12/site-packages/fitz
COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/pymupdf/ /usr/lib/python3.12/site-packages/pymupdf
COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/PyMuPDF.libs/ /usr/lib/python3.12/site-packages/PyMuPDF.libs
COPY --from=tessdata-dl /usr/share/tessdata/ /usr/share/tessdata
COPY --from=h2orestart-dl /libreoffice_ext/ /libreoffice_ext

RUN install -dm777 "/usr/lib/libreoffice/share/extensions/"
Expand Down
9 changes: 0 additions & 9 deletions dangerzone/conversion/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,6 @@ def running_on_qubes() -> bool:
return os.path.exists("/usr/share/qubes/marker-vm")


def get_tessdata_dir() -> str:
if os.environ.get("TESSDATA_PREFIX"):
return os.environ["TESSDATA_PREFIX"]
elif running_on_qubes():
return "/usr/share/tesseract/tessdata/"
else:
return "/usr/share/tessdata/"


class DangerzoneConverter:
def __init__(self, progress_callback: Optional[Callable] = None) -> None:
self.percentage: float = 0.0
Expand Down
161 changes: 0 additions & 161 deletions dangerzone/conversion/pixels_to_pdf.py

This file was deleted.

77 changes: 0 additions & 77 deletions dangerzone/isolation_provider/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,12 +221,6 @@ def _convert(
text = "Converted document"
self.print_progress(document, False, text, percentage)

@abstractmethod
def pixels_to_pdf(
self, document: Document, tempdir: str, ocr_lang: Optional[str]
) -> None:
pass

def print_progress(
self, document: Document, error: bool, text: str, percentage: float
) -> None:
Expand Down Expand Up @@ -353,74 +347,3 @@ def doc_to_pixels_proc(
f"{debug_log}" # no need for an extra newline here
f"{DOC_TO_PIXELS_LOG_END}"
)

# From global_common:

# def validate_convert_to_pixel_output(self, common, output):
# """
# Take the output from the convert to pixels tasks and validate it. Returns
# a tuple like: (success (boolean), error_message (str))
# """
# max_image_width = 10000
# max_image_height = 10000

# # Did we hit an error?
# for line in output.split("\n"):
# if (
# "failed:" in line
# or "The document format is not supported" in line
# or "Error" in line
# ):
# return False, output

# # How many pages was that?
# num_pages = None
# for line in output.split("\n"):
# if line.startswith("Document has "):
# num_pages = line.split(" ")[2]
# break
# if not num_pages or not num_pages.isdigit() or int(num_pages) <= 0:
# return False, "Invalid number of pages returned"
# num_pages = int(num_pages)

# # Make sure we have the files we expect
# expected_filenames = []
# for i in range(1, num_pages + 1):
# expected_filenames += [
# f"page-{i}.rgb",
# f"page-{i}.width",
# f"page-{i}.height",
# ]
# expected_filenames.sort()
# actual_filenames = os.listdir(common.pixel_dir.name)
# actual_filenames.sort()

# if expected_filenames != actual_filenames:
# return (
# False,
# f"We expected these files:\n{expected_filenames}\n\nBut we got these files:\n{actual_filenames}",
# )

# # Make sure the files are the correct sizes
# for i in range(1, num_pages + 1):
# with open(f"{common.pixel_dir.name}/page-{i}.width") as f:
# w_str = f.read().strip()
# with open(f"{common.pixel_dir.name}/page-{i}.height") as f:
# h_str = f.read().strip()
# w = int(w_str)
# h = int(h_str)
# if (
# not w_str.isdigit()
# or not h_str.isdigit()
# or w <= 0
# or w > max_image_width
# or h <= 0
# or h > max_image_height
# ):
# return False, f"Page {i} has invalid geometry"

# # Make sure the RGB file is the correct size
# if os.path.getsize(f"{common.pixel_dir.name}/page-{i}.rgb") != w * h * 3:
# return False, f"Page {i} has an invalid RGB file size"

# return True, True
Loading

0 comments on commit 1ab3aab

Please sign in to comment.