Skip to content

Commit

Permalink
Merge pull request Byaidu#337 from timelic/dev/SourceHanSerif
Browse files Browse the repository at this point in the history
  • Loading branch information
Byaidu authored Jan 7, 2025
2 parents 592ec8a + 7ed13c2 commit d22bbc6
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 46 deletions.
11 changes: 9 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,14 @@ WORKDIR /app
EXPOSE 7860

ENV PYTHONUNBUFFERED=1
ADD "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf" /app

# Download all required fonts
ADD "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf" /app/
ADD "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerifCN-Regular.ttf" /app/
ADD "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerifTW-Regular.ttf" /app/
ADD "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerifJP-Regular.ttf" /app/
ADD "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerifKR-Regular.ttf" /app/

RUN apt-get update && \
apt-get install --no-install-recommends -y libgl1 && \
rm -rf /var/lib/apt/lists/* && uv pip install --system --no-cache huggingface-hub && \
Expand All @@ -16,4 +23,4 @@ COPY . .

RUN uv pip install --system --no-cache .

CMD ["pdf2zh", "-i"]
CMD ["pdf2zh", "-i"]
10 changes: 5 additions & 5 deletions pdf2zh/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def __init__(
lang_in: str = "",
lang_out: str = "",
service: str = "",
resfont: str = "",
noto_name: str = "",
noto: Font = None,
envs: Dict = None,
prompt: List = None,
Expand All @@ -148,7 +148,7 @@ def __init__(
self.vchar = vchar
self.thread = thread
self.layout = layout
self.resfont = resfont
self.noto_name = noto_name
self.noto = noto
self.translator: BaseTranslator = None
param = service.split(":", 1)
Expand Down Expand Up @@ -359,7 +359,7 @@ def worker(s: str): # 多线程翻译
############################################################
# C. Typesetting the new document
def raw_string(fcur: str, cstk: str): # 编码字符串
if fcur == 'noto':
if fcur == self.noto_name:
return "".join(["%04x" % self.noto.has_glyph(ord(c)) for c in cstk])
elif isinstance(self.fontmap[fcur], PDFCIDFont): # 判断编码长度
return "".join(["%04x" % ord(c) for c in cstk])
Expand Down Expand Up @@ -403,8 +403,8 @@ def raw_string(fcur: str, cstk: str): # 编码字符串
except Exception:
pass
if fcur_ is None:
fcur_ = self.resfont # 默认非拉丁字体
if fcur_ == 'noto':
fcur_ = self.noto_name # 默认非拉丁字体
if fcur_ == self.noto_name: # FIXME: change to CONST
adv = self.noto.char_lengths(ch, size)[0]
else:
adv = self.fontmap[fcur_].char_width(ord(ch)) * size
Expand Down
83 changes: 44 additions & 39 deletions pdf2zh/high_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,7 @@
from pdf2zh.doclayout import OnnxModel
from pdf2zh.pdfinterp import PDFPageInterpreterEx

resfont_map = {
"zh-cn": "china-ss",
"zh-tw": "china-ts",
"zh-hans": "china-ss",
"zh-hant": "china-ts",
"zh": "china-ss",
"ja": "japan-s",
"ko": "korea-s",
}
NOTO_NAME = "noto"

noto_list = [
"am", # Amharic
Expand All @@ -44,18 +36,14 @@
"gu", # Gujarati
"iw", # Hebrew
"hi", # Hindi
# "ja", # Japanese
"kn", # Kannada
# "ko", # Korean
"ml", # Malayalam
"mr", # Marathi
"ru", # Russian
"sr", # Serbian
# "zh-cn",# SC
"ta", # Tamil
"te", # Telugu
"th", # Thai
# "zh-tw",# TC
"ur", # Urdu
"uk", # Ukrainian
]
Expand All @@ -82,7 +70,7 @@ def translate_patch(
lang_in: str = "",
lang_out: str = "",
service: str = "",
resfont: str = "",
noto_name: str = "",
noto: Font = None,
callback: object = None,
cancellation_event: asyncio.Event = None,
Expand All @@ -102,7 +90,7 @@ def translate_patch(
lang_in,
lang_out,
service,
resfont,
noto_name,
noto,
envs,
prompt,
Expand Down Expand Up @@ -186,35 +174,18 @@ def translate_stream(
**kwarg: Any,
):
font_list = [("tiro", None)]
noto = None
if lang_out.lower() in resfont_map: # CJK
resfont = resfont_map[lang_out.lower()]
font_list.append((resfont, None))
elif lang_out.lower() in noto_list: # noto
resfont = "noto"
# docker
ttf_path = os.environ.get("NOTO_FONT_PATH", "/app/GoNotoKurrent-Regular.ttf")

if not os.path.exists(ttf_path):
ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
if not os.path.exists(ttf_path):
print("Downloading Noto font...")
urllib.request.urlretrieve(
"https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
ttf_path,
)
font_list.append(("noto", ttf_path))
noto = Font("noto", ttf_path)
else: # fallback
resfont = "china-ss"
font_list.append(("china-ss", None))

font_path = download_remote_fonts(lang_out.lower())
noto_name = NOTO_NAME
noto = Font(noto_name, font_path)
font_list.append((noto_name, font_path))

doc_en = Document(stream=stream)
stream = io.BytesIO()
doc_en.save(stream)
doc_zh = Document(stream=stream)
page_count = doc_zh.page_count
# font_list = [("china-ss", None), ("tiro", None)]
# font_list = [("GoNotoKurrent-Regular.ttf", font_path), ("tiro", None)]
font_id = {}
for page in doc_zh:
for font in font_list:
Expand All @@ -237,6 +208,7 @@ def translate_stream(
pass

fp = io.BytesIO()

doc_zh.save(fp)
obj_patch: dict = translate_patch(fp, **locals())

Expand All @@ -251,7 +223,12 @@ def translate_stream(
for id in range(page_count):
doc_en.move_page(page_count + id, id * 2 + 1)

return doc_zh.write(deflate=1), doc_en.write(deflate=1)
doc_zh.subset_fonts(fallback=True)
doc_en.subset_fonts(fallback=True)
return (
doc_zh.write(deflate=True, garbage=3, use_objstms=1),
doc_en.write(deflate=True, garbage=3, use_objstms=1),
)


def convert_to_pdfa(input_path, output_path):
Expand Down Expand Up @@ -386,3 +363,31 @@ def translate(
result_files.append((str(file_mono), str(file_dual)))

return result_files


def download_remote_fonts(lang: str):
    """Return a local path to the font used to render text in ``lang``.

    Chinese, Japanese and Korean targets use the matching regional Source Han
    Serif font; every other language (including all entries of ``noto_list``)
    uses GoNotoKurrent.

    Lookup order:

    1. The path in the ``NOTO_FONT_PATH`` environment variable, or
       ``/app/<font file>`` when the variable is unset (the location the
       Docker image places fonts in).
    2. ``<system temp dir>/<font file>``, downloading the font there first if
       it is not already cached.

    Args:
        lang: Target language code such as ``"zh-cn"`` or ``"ja"``. Matching
            is case-insensitive.

    Returns:
        The font file path as a string.

    Raises:
        urllib.error.URLError: If the font has to be downloaded and the
            request fails.
    """
    go_noto = "GoNotoKurrent-Regular.ttf"
    # Each font is fetched from the release that publishes it. GoNoto comes
    # from the same upstream release URL the Dockerfile uses.
    go_noto_url = (
        "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/"
    )
    source_han_url = (
        "https://github.com/timelic/source-han-serif/releases/download/main/"
    )

    region_langs = {
        "CN": ["zh-cn", "zh-hans", "zh"],
        "TW": ["zh-tw", "zh-hant"],
        "JP": ["ja"],
        "KR": ["ko"],
    }
    lang_name_map = {
        la: f"SourceHanSerif{region}-Regular.ttf"
        for region, langs in region_langs.items()
        for la in langs
    }
    # Anything that is not an explicit CJK code falls back to GoNoto.
    font_name = lang_name_map.get((lang or "").lower(), go_noto)
    url_prefix = go_noto_url if font_name == go_noto else source_han_url

    # docker / explicit override
    font_path = os.environ.get("NOTO_FONT_PATH", Path("/app", font_name).as_posix())
    if Path(font_path).exists():
        return font_path

    cache_dir = tempfile.gettempdir()
    font_path = Path(cache_dir, font_name).as_posix()
    if not Path(font_path).exists():
        print(f"Downloading {font_name}...")
        # Download to a unique sibling file and move it into place only when
        # the transfer has finished, so an interrupted download can never be
        # mistaken for a valid cached font on the next run.
        fd, partial = tempfile.mkstemp(suffix=".part", dir=cache_dir)
        os.close(fd)
        try:
            urllib.request.urlretrieve(f"{url_prefix}{font_name}", partial)
            os.replace(partial, font_path)
        finally:
            if os.path.exists(partial):
                os.remove(partial)

    return font_path
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ dependencies = [
"pikepdf",
"peewee>=3.17.8",
"argostranslate",
"fontTools"
]

[project.optional-dependencies]
Expand Down

0 comments on commit d22bbc6

Please sign in to comment.