Skip to content

Commit

Permalink
feat (cli): add support for non-pdf/a document
Browse files Browse the repository at this point in the history
  • Loading branch information
reycn committed Dec 19, 2024
1 parent 2592343 commit 6b293ab
Show file tree
Hide file tree
Showing 7 changed files with 92 additions and 21 deletions.
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ Feel free to provide feedback in [GitHub Issues](https://github.com/Byaidu/PDFMa

<h2 id="updates">Updates</h2>

- [Nov. 26 2024] CLI now supports online file(s) _(by [@reycn](https://github.com/reycn))_
- [Nov. 24 2024] [ONNX](https://github.com/onnx/onnx) support to reduce dependency sizes _(by [@Wybxc](https://github.com/Wybxc))_
- [Nov. 23 2024] 🌟 [Public Service](#demo) online! _(by [@Byaidu](https://github.com/Byaidu))_
- [Dec. 19 2024] Non-PDF/A documents are now supported using `-cp` _(by [@reycn](https://github.com/reycn))_
- [Dec. 13 2024] Additional support for backend _(by [@YadominJinta](https://github.com/YadominJinta))_
- [Dec. 10 2024] The translator now supports OpenAI models on Azure _(by [@yidasanqian](https://github.com/yidasanqian))_

<h2 id="preview">Preview</h2>

Expand Down Expand Up @@ -184,6 +184,7 @@ In the following table, we list all advanced options for reference:
| `-t` | [Multi-threads](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#threads) | `pdf2zh example.pdf -t 1` |
| `-o` | Output dir | `pdf2zh example.pdf -o output` |
| `-f`, `-c` | [Exceptions](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#exceptions) | `pdf2zh example.pdf -f "(MS.*)"` |
| `-cp` | Compatibility Mode | `pdf2zh example.pdf --compatible` |
| `--share` | Public link | `pdf2zh -i --share` |
| `--authorized` | Authorization | `pdf2zh -i --authorized users.txt [auth.html]` |
| `--prompt` | [Custom Prompt](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#prompt) | `pdf2zh --prompt [prompt.txt]` |
Expand Down
93 changes: 77 additions & 16 deletions pdf2zh/high_level.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,28 @@
"""Functions that can be used for the most common use-cases for pdf2zh.six"""

import asyncio
import io
import os
import sys
import tempfile
import urllib.request
from asyncio import CancelledError
from typing import BinaryIO
from pathlib import Path
from typing import Any, BinaryIO, List, Optional

import numpy as np
import requests
import tqdm
import sys
from pymupdf import Font, Document
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfexceptions import PDFValueError
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pymupdf import Document, Font

from pdf2zh.converter import TranslateConverter
from pdf2zh.pdfinterp import PDFPageInterpreterEx
from pdf2zh.doclayout import DocLayoutModel
from pathlib import Path
from typing import Any, List, Optional
import urllib.request
import requests
import tempfile
import os
import io
from pdf2zh.pdfinterp import PDFPageInterpreterEx

model = DocLayoutModel.load_available()

Expand Down Expand Up @@ -136,7 +137,7 @@ def translate_patch(
h, w = box.shape
vcls = ["abandon", "figure", "table", "isolate_formula", "formula_caption"]
for i, d in enumerate(page_layout.boxes):
if not page_layout.names[int(d.cls)] in vcls:
if page_layout.names[int(d.cls)] not in vcls:
x0, y0, x1, y1 = d.xyxy.squeeze()
x0, y0, x1, y1 = (
np.clip(int(x0 - 1), 0, w - 1),
Expand Down Expand Up @@ -246,6 +247,56 @@ def translate_stream(
return doc_zh.write(deflate=1), doc_en.write(deflate=1)


def convert_to_pdfa(input_path, output_path):
    """Tag a PDF with PDF/A-2B identification and save it to ``output_path``.

    The document's XMP metadata is seeded from its DocInfo dictionary and
    given ``pdfaid:part = "2"`` and ``pdfaid:conformance = "B"``. An sRGB
    ``/OutputIntent`` dictionary is then added to the document catalog and
    the result is written linearized.

    NOTE(review): no ICC profile (``/DestOutputProfile``) is embedded and
    page content is not modified here, so the output may not pass a strict
    PDF/A validator -- confirm whether that matters for callers.

    Args:
        input_path: Path to source PDF file.
        output_path: Path to save PDF/A file.
    """
    # Function-scope import: pikepdf is only loaded when this function runs.
    from pikepdf import Dictionary, Name, Pdf

    pdfa_part = "2"
    pdfa_conformance = "B"

    # The context manager closes the file handle even if tagging or saving
    # raises; previously close() was only reached on the success path.
    with Pdf.open(input_path) as pdf:
        # Add PDF/A conformance metadata
        with pdf.open_metadata() as meta:
            meta.load_from_docinfo(pdf.docinfo)
            meta["pdfaid:part"] = pdfa_part
            meta["pdfaid:conformance"] = pdfa_conformance

        # Create OutputIntent dictionary
        output_intent = Dictionary(
            {
                "/Type": Name("/OutputIntent"),
                "/S": Name("/GTS_PDFA1"),
                "/OutputConditionIdentifier": "sRGB IEC61966-2.1",
                "/RegistryName": "http://www.color.org",
                "/Info": "sRGB IEC61966-2.1",
            }
        )

        # Add output intent to PDF root, preserving any existing entries
        if "/OutputIntents" not in pdf.Root:
            pdf.Root.OutputIntents = [output_intent]
        else:
            pdf.Root.OutputIntents.append(output_intent)

        # Save as PDF/A
        pdf.save(output_path, linearize=True)


def translate(
files: list[str],
output: str = "",
Expand All @@ -257,6 +308,7 @@ def translate(
vfont: str = "",
vchar: str = "",
callback: object = None,
compatible: bool = False,
cancellation_event: asyncio.Event = None,
**kwarg: Any,
):
Expand Down Expand Up @@ -294,7 +346,15 @@ def translate(
)
filename = os.path.splitext(os.path.basename(file))[0]

doc_raw = open(file, "rb")
# If the commandline has specified converting to PDF/A format
## --compatible / -cp
if compatible:
file_pdfa = file.replace(".pdf", "-pdfa.pdf")
print(f"Converting {file} to PDF/A format...")
convert_to_pdfa(file, file_pdfa)
doc_raw = open(file_pdfa, "rb")
else:
doc_raw = open(file, "rb")
s_raw = doc_raw.read()
s_mono, s_dual = translate_stream(
s_raw,
Expand All @@ -311,3 +371,4 @@ def translate(
result_files.append((str(file_mono), str(file_dual)))

return result_files
return result_files
12 changes: 10 additions & 2 deletions pdf2zh/pdf2zh.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@
from __future__ import annotations

import argparse
import sys
import logging
import sys
from string import Template
from typing import List, Optional

from pdf2zh import __version__, log
from pdf2zh.high_level import translate
from string import Template


def create_parser() -> argparse.ArgumentParser:
Expand Down Expand Up @@ -128,6 +129,13 @@ def create_parser() -> argparse.ArgumentParser:
help="user custom prompt.",
)

parse_params.add_argument(
"--compatible",
"-cp",
action="store_true",
help="Convert the PDF file into PDF/A format to improve compatibility.",
)

return parser


Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ dependencies = [
"tencentcloud-sdk-python",
"pdfminer.six>=20240706",
"gradio_pdf",
"pikepdf",
]

[project.optional-dependencies]
Expand Down
Binary file added test/file/translate.cli.font.unknown.pdf
Binary file not shown.
Binary file modified test/file/translate.cli.plain.text.pdf
Binary file not shown.
Binary file added test/file/translate.cli.text.with.figure.pdf
Binary file not shown.

0 comments on commit 6b293ab

Please sign in to comment.