Skip to content

Commit

Permalink
refactor: translate stream
Browse files Browse the repository at this point in the history
  • Loading branch information
Byaidu committed Dec 12, 2024
1 parent f1c1965 commit 8ebaaa9
Show file tree
Hide file tree
Showing 8 changed files with 136 additions and 132 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ For docker deployment on cloud service:

<h2 id="usage">Advanced Options</h2>

Execute the translation command in the command line to generate the translated document `example-zh.pdf` and the bilingual document `example-dual.pdf` in the current working directory. Use Google as the default translation service.
Execute the translation command in the command line to generate the translated document `example-mono.pdf` and the bilingual document `example-dual.pdf` in the current working directory. Use Google as the default translation service.

<img src="./docs/images/cmd.explained.png" width="580px" alt="cmd"/>

Expand Down
2 changes: 1 addition & 1 deletion README_zh-CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@

<h2 id="usage">高级选项</h2>

在命令行中执行翻译命令,在当前工作目录下生成译文文档 `example-zh.pdf` 和双语对照文档 `example-dual.pdf`,默认使用 Google 翻译服务
在命令行中执行翻译命令,在当前工作目录下生成译文文档 `example-mono.pdf` 和双语对照文档 `example-dual.pdf`,默认使用 Google 翻译服务

<img src="./docs/images/cmd.explained.png" width="580px" alt="cmd"/>

Expand Down
2 changes: 2 additions & 0 deletions pdf2zh/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import logging
from pdf2zh.high_level import translate, translate_stream

# Package-level logger; pdf2zh/pdf2zh.py imports it as `log`.
log = logging.getLogger(__name__)

# Package metadata.
__version__ = "1.8.7"
__author__ = "Byaidu"
# Public API, imported above from pdf2zh.high_level.
__all__ = ["translate", "translate_stream"]
12 changes: 6 additions & 6 deletions pdf2zh/gui.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,8 +146,8 @@ def translate_file(
)

filename = os.path.splitext(os.path.basename(file_path))[0]
file_en = output / f"{filename}.pdf"
file_zh = output / f"{filename}-zh.pdf"
file_raw = output / f"{filename}.pdf"
file_mono = output / f"{filename}-mono.pdf"
file_dual = output / f"{filename}-dual.pdf"

translator = service_map[service]
Expand All @@ -164,7 +164,7 @@ def progress_bar(t: tqdm.tqdm):
progress(t.n / t.total, desc="Translating...")

param = {
"files": [file_en],
"files": [file_raw],
"pages": selected_page,
"lang_in": lang_from,
"lang_out": lang_to,
Expand All @@ -177,18 +177,18 @@ def progress_bar(t: tqdm.tqdm):
translate(**param)
print(f"Files after translation: {os.listdir(output)}")

if not file_zh.exists() or not file_dual.exists():
if not file_mono.exists() or not file_dual.exists():
raise gr.Error("No output")

try:
translated_preview = pdf_preview(str(file_zh))
translated_preview = pdf_preview(str(file_mono))
except Exception:
raise gr.Error("No preview")

progress(1.0, desc="Translation complete!")

return (
str(file_zh),
str(file_mono),
translated_preview,
str(file_dual),
gr.update(visible=True),
Expand Down
166 changes: 92 additions & 74 deletions pdf2zh/high_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from pdf2zh.pdfinterp import PDFPageInterpreterEx
from pdf2zh.doclayout import DocLayoutModel
from pathlib import Path
from typing import Any, Container, Iterable, List, Optional
from typing import Any, Iterable, List
import urllib.request
import requests
import tempfile
Expand Down Expand Up @@ -75,7 +75,6 @@ def translate_patch(
inf: BinaryIO,
pages=None,
password: str = "",
page_count: int = 0,
vfont: str = "",
vchar: str = "",
thread: int = 0,
Expand All @@ -86,7 +85,7 @@ def translate_patch(
resfont: str = "",
noto: Font = None,
callback: object = None,
**kwarg,
**kwarg: Any,
) -> None:
rsrcmgr = PDFResourceManager()
layout = {}
Expand All @@ -100,7 +99,7 @@ def translate_patch(
if pages:
total_pages = len(pages)
else:
total_pages = page_count
total_pages = doc_zh.page_count

parser = PDFParser(inf)
doc = PDFDocument(parser, password=password)
Expand Down Expand Up @@ -153,9 +152,89 @@ def translate_patch(
return obj_patch


def translate_stream(
    stream,
    pages=None,
    password: str = "",
    vfont: str = "",
    vchar: str = "",
    thread: int = 0,
    doc_zh: Document = None,
    lang_in: str = "",
    lang_out: str = "",
    service: str = "",
    callback: object = None,
    **kwarg: Any,
):
    """Translate an in-memory PDF and return the mono and dual documents.

    Args:
        stream: PDF content handed to ``Document(stream=...)`` (callers in
            this project pass the raw file bytes).
        pages: Page selection, forwarded unchanged to ``translate_patch``.
        password: Password used to authenticate an encrypted document.
        vfont, vchar, thread, lang_in, service, callback: Not used here;
            forwarded to ``translate_patch``.
        doc_zh: Ignored on input; the name is rebound to the working copy
            that receives the translated content.
        lang_out: Target language; selects the font that is embedded.
        **kwarg: Extra options, forwarded to ``translate_patch``.

    Returns:
        A ``(mono, dual)`` tuple of serialized PDFs: ``mono`` is the
        translated document, ``dual`` interleaves every original page with
        its translated counterpart.
    """
    # NOTE: every local defined before the translate_patch() call below is
    # forwarded to it by name through **locals(). Do not rename or add locals
    # above that call without checking translate_patch's signature.
    font_list = [("tiro", None)]
    noto = None
    if lang_out.lower() in resfont_map:  # CJK
        resfont = resfont_map[lang_out.lower()]
        font_list.append((resfont, None))
    elif lang_out.lower() in noto_list:  # noto
        resfont = "noto"
        ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
        if not os.path.exists(ttf_path):
            # The font is cached in the temp directory and fetched only once.
            print("Downloading Noto font...")
            urllib.request.urlretrieve(
                "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
                ttf_path,
            )
        font_list.append(("noto", ttf_path))
        noto = Font("noto", ttf_path)
    else:  # fallback
        resfont = "china-ss"
        font_list.append(("china-ss", None))

    doc_en = Document(stream=stream)
    if doc_en.is_encrypted:
        doc_en.authenticate(password)
    doc_zh = Document(stream=stream)
    # The working copy is opened from the same raw stream, so it needs its
    # own authentication; unlocking doc_en does not unlock it.
    if doc_zh.is_encrypted:
        doc_zh.authenticate(password)
    page_count = doc_zh.page_count

    # Register every output font on every page and remember its xref number.
    font_id = {}
    for page in doc_zh:
        for font in font_list:
            font_id[font[0]] = page.insert_font(font[0], font[1])

    # Also expose the fonts in every other font dictionary in the file.
    xreflen = doc_zh.xref_length()
    for xref in range(1, xreflen):
        # The font dict may sit under Resources/ or directly on the object
        # (e.g. resources attached to an XObject).
        for label in ["Resources/", ""]:
            try:  # xref reads/writes can fail on some objects
                font_res = doc_zh.xref_get_key(xref, f"{label}Font")
                if font_res[0] == "dict":
                    for font in font_list:
                        font_exist = doc_zh.xref_get_key(xref, f"{label}Font/{font[0]}")
                        if font_exist[0] == "null":
                            doc_zh.xref_set_key(
                                xref,
                                f"{label}Font/{font[0]}",
                                f"{font_id[font[0]]} 0 R",
                            )
            except Exception:
                # Best effort: an object that cannot be patched is skipped.
                pass

    fp = io.BytesIO()
    doc_zh.save(fp)
    obj_patch: dict = translate_patch(fp, **locals())

    # Replace each patched content stream with its translated operators.
    for obj_id, ops_new in obj_patch.items():
        doc_zh.update_stream(obj_id, ops_new.encode())

    # Build the bilingual document: append the translated pages, then move
    # translated page i so that it directly follows original page i.
    doc_en.insert_file(doc_zh)
    for page_index in range(page_count):
        doc_en.move_page(page_count + page_index, page_index * 2 + 1)

    try:
        return doc_zh.write(deflate=1), doc_en.write(deflate=1)
    finally:
        # Release both documents once they have been serialized.
        doc_zh.close()
        doc_en.close()


def translate(
files: Iterable[str] = [],
pages: Optional[Container[int]] = None,
output: str = "",
pages=None,
password: str = "",
vfont: str = "",
vchar: str = "",
Expand All @@ -164,8 +243,7 @@ def translate(
lang_out: str = "",
service: str = "",
callback: object = None,
output: str = "",
**kwargs: Any,
**kwarg: Any,
):
if not files:
raise PDFValueError("No files to process.")
Expand Down Expand Up @@ -199,72 +277,12 @@ def translate(
)
filename = os.path.splitext(os.path.basename(file))[0]

font_list = [("tiro", None)]
noto = None
if lang_out.lower() in resfont_map: # CJK
resfont = resfont_map[lang_out.lower()]
font_list.append((resfont, None))
elif lang_out.lower() in noto_list: # noto
resfont = "noto"
ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
if not os.path.exists(ttf_path):
print("Downloading Noto font...")
urllib.request.urlretrieve(
"https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
ttf_path,
)
font_list.append(("noto", ttf_path))
noto = Font("noto", ttf_path)
else: # fallback
resfont = "china-ss"
font_list.append(("china-ss", None))

doc_en = Document(file)
if doc_en.is_encrypted:
doc_en.authenticate(password)
doc_zh = Document(doc_en)
page_count = doc_zh.page_count
# font_list = [("china-ss", None), ("tiro", None)]
font_id = {}
for page in doc_zh:
for font in font_list:
font_id[font[0]] = page.insert_font(font[0], font[1])
xreflen = doc_zh.xref_length()
for xref in range(1, xreflen):
for label in ["Resources/", ""]: # 可能是基于 xobj 的 res
try: # xref 读写可能出错
font_res = doc_zh.xref_get_key(xref, f"{label}Font")
if font_res[0] == "dict":
for font in font_list:
font_exist = doc_zh.xref_get_key(
xref, f"{label}Font/{font[0]}"
)
if font_exist[0] == "null":
doc_zh.xref_set_key(
xref,
f"{label}Font/{font[0]}",
f"{font_id[font[0]]} 0 R",
)
except Exception:
pass

fp = io.BytesIO()
doc_zh.save(fp)
obj_patch: dict = translate_patch(fp, **locals())

for obj_id, ops_new in obj_patch.items():
# ops_old=doc_en.xref_stream(obj_id)
# print(obj_id)
# print(ops_old)
# print(ops_new.encode())
doc_zh.update_stream(obj_id, ops_new.encode())

doc_en.insert_file(doc_zh)
for id in range(page_count):
doc_en.move_page(page_count + id, id * 2 + 1)
doc_zh.save(Path(output) / f"{filename}-zh.pdf", deflate=1)
doc_en.save(Path(output) / f"{filename}-dual.pdf", deflate=1)
doc_zh.close()
doc_en.close()
doc_raw = open(file, "rb")
s_raw = doc_raw.read()
s_mono, s_dual = translate_stream(s_raw, **locals())
doc_mono = open(Path(output) / f"{filename}-mono.pdf", "wb")
doc_dual = open(Path(output) / f"{filename}-dual.pdf", "wb")
doc_mono.write(s_mono)
doc_dual.write(s_dual)

return
4 changes: 2 additions & 2 deletions pdf2zh/pdf2zh.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
from pdf2zh import __version__, log
from pdf2zh.high_level import translate

logging.basicConfig()


def create_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description=__doc__, add_help=True)
Expand Down Expand Up @@ -135,6 +133,8 @@ def parse_args(args: Optional[List[str]]) -> argparse.Namespace:


def main(args: Optional[List[str]] = None) -> int:
logging.basicConfig()

parsed_args = parse_args(args)

if parsed_args.debug:
Expand Down
54 changes: 32 additions & 22 deletions tools/backend.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
import os
import tempfile

from flask import Flask, request, send_file
from celery import Celery, Task
from celery.result import AsyncResult
from pathlib import Path
from tasks import translate_task

from pdf2zh import translate_stream
import tqdm

app = Flask("pdf2zh")
app.config.from_mapping(
Expand Down Expand Up @@ -36,28 +32,42 @@ def __call__(self, *args, **kwargs):
celery_app = celery_init_app(app)


# Registered on the Celery app: the Flask `app` object has no `task` decorator.
@celery_app.task(bind=True)
def translate_task(
    self,
    stream: bytes,
    lang_in: str = "",
    lang_out: str = "",
    service: str = "",
):
    """Celery task that translates a PDF held in memory.

    Args:
        self: The bound task instance (supplied by Celery because of
            ``bind=True``); used to publish progress updates.
        stream: Content of the source PDF.
        lang_in: Source language, passed to ``translate_stream``.
        lang_out: Target language, passed to ``translate_stream``.
        service: Translation service name, passed to ``translate_stream``.

    Returns:
        The ``(doc_mono, doc_dual)`` pair returned by ``translate_stream``.
    """

    def progress_bar(t: tqdm.tqdm):
        # Publish the progress counters as task state so clients can poll them.
        self.update_state(state="PROGRESS", meta={"n": t.n, "total": t.total})
        print(f"Translating {t.n} / {t.total} pages")

    doc_mono, doc_dual = translate_stream(
        stream,
        lang_in=lang_in,
        lang_out=lang_out,
        service=service,
        thread=4,
        callback=progress_bar,
    )
    return doc_mono, doc_dual


@app.route("/api/translate", methods=["POST"])
def create_translate_tasks():
f = request.files["source"]
output_dir = Path(tempfile.mkdtemp())
file_basename = ".".join(f.filename.split(".")[:-1])
if len(file_basename) == 0:
file_basename = "input"
origin_pdf = output_dir / f"{file_basename}.pdf"
f.save(origin_pdf)
lang_in = request.args.get("lang_in", "auto")
stream = request.files["file"]
lang_in = request.args.get("lang_in", "en")
lang_out = request.args.get("lang_out", "zh")
service = request.args.get("service", "google")
task = translate_task.delay(
str(output_dir), file_basename, lang_in, lang_out, service
)
return {"result_id": task.id}
task = translate_task.delay(stream, lang_in, lang_out, service)
return {"id": task.id}


@app.route("/api/results/<id>", methods=["GET"])
def check_translate_result(id: str):
result = AsyncResult(id)
return {"ready": result.ready(), "successful": result.successful()}
result = celery_app.AsyncResult(id)
return {"state": result.state, "info": result.info}


@app.route("/api/results/<id>/<format>")
Expand All @@ -67,8 +77,8 @@ def get_translate_result(id: str, format: str):
return {"error": "task not finished"}, 400
if not result.successful():
return {"error": "task failed"}, 400
translated_pdf, dual_pdf = result.get()
to_send = translated_pdf if format == "translated" else dual_pdf
doc_mono, doc_dual = result.get()
to_send = doc_mono if format == "mono" else doc_dual
return send_file(to_send, "application/pdf")


Expand Down
Loading

0 comments on commit 8ebaaa9

Please sign in to comment.