Skip to content

Commit

Permalink
feat: support noto
Browse files Browse the repository at this point in the history
  • Loading branch information
Byaidu committed Dec 3, 2024
1 parent 2bf0834 commit 45ec823
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 10 deletions.
20 changes: 15 additions & 5 deletions pdf2zh/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
AzureTranslator,
TencentTranslator,
)
from pymupdf import Font

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -123,12 +124,16 @@ def __init__(
lang_in: str = "",
lang_out: str = "",
service: str = "",
resfont: str = "",
noto: Font = None,
) -> None:
super().__init__(rsrcmgr)
self.vfont = vfont
self.vchar = vchar
self.thread = thread
self.layout = layout
self.resfont = resfont
self.noto = noto
self.translator: BaseTranslator = None
param = service.split(":", 1)
if param[0] == "google":
Expand Down Expand Up @@ -343,7 +348,9 @@ def worker(s: str): # 多线程翻译
############################################################
# C. 新文档排版
def raw_string(fcur: str, cstk: str): # 编码字符串
if isinstance(self.fontmap[fcur], PDFCIDFont): # 判断编码长度
if fcur == 'noto':
return "".join(["%04x" % self.noto.has_glyph(ord(c)) for c in cstk])
elif isinstance(self.fontmap[fcur], PDFCIDFont): # 判断编码长度
return "".join(["%04x" % ord(c) for c in cstk])
else:
return "".join(["%02x" % ord(c) for c in cstk])
Expand Down Expand Up @@ -388,13 +395,16 @@ def raw_string(fcur: str, cstk: str): # 编码字符串
# pass
try:
if fcur_ is None and self.fontmap["tiro"].to_unichr(ord(ch)) == ch:
fcur_ = "tiro" # 默认英文字体
fcur_ = "tiro" # 默认拉丁字体
except Exception:
pass
if fcur_ is None:
fcur_ = "china-ss" # 默认中文字体
fcur_ = self.resfont # 默认非拉丁字体
# print(self.fontid[font],fcur_,ch,font.char_width(ord(ch)))
adv = self.fontmap[fcur_].char_width(ord(ch)) * size
if fcur_ == 'noto':
adv = self.noto.char_lengths(ch, size)[0]
else:
adv = self.fontmap[fcur_].char_width(ord(ch)) * size
ptr += 1
if ( # 输出文字缓冲区
fcur_ != fcur # 1. 字体更新
Expand All @@ -406,7 +416,7 @@ def raw_string(fcur: str, cstk: str): # 编码字符串
cstk = ""
if brk and x + adv > x1 + 0.1 * size: # 到达右边界且原文段落存在换行
x = x0
lang_space = {"zh-CN": 1.4, "zh-TW": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2} # CJK
lang_space = {"zh-CN": 1.4, "zh-TW": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8}
y -= size * lang_space.get(self.translator.lang_out, 1.1) # 小语种大多适配 1.1
if vy_regex: # 插入公式
fix = 0
Expand Down
5 changes: 4 additions & 1 deletion pdf2zh/high_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from pdfminer.pdfparser import PDFParser
from pdf2zh.converter import TranslateConverter
from pdf2zh.pdfinterp import PDFPageInterpreterEx
from pymupdf import Font


def extract_text_to_fp(
Expand All @@ -26,13 +27,15 @@ def extract_text_to_fp(
lang_in: str = "",
lang_out: str = "",
service: str = "",
resfont: str = "",
noto: Font = None,
callback: object = None,
**kwarg,
) -> None:
rsrcmgr = PDFResourceManager()
layout = {}
device = TranslateConverter(
rsrcmgr, vfont, vchar, thread, layout, lang_in, lang_out, service
rsrcmgr, vfont, vchar, thread, layout, lang_in, lang_out, service, resfont, noto
)

assert device is not None
Expand Down
64 changes: 60 additions & 4 deletions pdf2zh/pdf2zh.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@
import logging
from pathlib import Path
from typing import Any, Container, Iterable, List, Optional
import urllib.request
from pdfminer.pdfexceptions import PDFValueError

import pymupdf
import requests
import tempfile

from pdf2zh import __version__, log
from pdf2zh.high_level import extract_text_to_fp
Expand All @@ -24,6 +26,38 @@

model = DocLayoutModel.load_available()

# Output-language code -> name of the font used for non-Latin text.
# The font selection in extract_text checks this table first; a match is
# registered via page.insert_font(name, None), i.e. without a font file.
# NOTE(review): presumably these are PyMuPDF built-in CJK font names -- confirm.
resfont_map = {
"zh-CN": "china-ss",  # Chinese (PRC)
"zh-TW": "china-ts",  # Chinese (Taiwan)
"ja": "japan-s",  # Japanese
"ko": "korea-s",  # Korean
}
# Output-language codes rendered with the downloaded Noto font.
# When lang_out is in this list (and not in resfont_map), extract_text sets
# resfont to "noto" and loads GoNotoKurrent-Regular.ttf from the temp directory,
# downloading it first if the file does not exist.
# The commented-out CJK entries are served by resfont_map instead, which is
# checked before this list.
noto_list = [
"am", # Amharic
"ar", # Arabic
"bn", # Bengali
"bg", # Bulgarian
"chr", # Cherokee
"el", # Greek
"gu", # Gujarati
"iw", # Hebrew
"hi", # Hindi
# "ja", # Japanese
"kn", # Kannada
# "ko", # Korean
"ml", # Malayalam
"mr", # Marathi
"ru", # Russian
"sr", # Serbian
# "zh-CN",# Chinese (PRC)
"ta", # Tamil
"te", # Telugu
"th", # Thai
# "zh-TW",# Chinese (Taiwan)
"ur", # Urdu
"uk", # Ukrainian
]


def check_files(files: List[str]) -> List[str]:
files = [
Expand Down Expand Up @@ -78,13 +112,33 @@ def extract_text(
)
filename = os.path.splitext(os.path.basename(file))[0]

font_list = [("tiro", None)]
noto = None
if lang_out in resfont_map: # CJK
resfont = resfont_map[lang_out]
font_list.append((resfont, None))
elif lang_out in noto_list: # noto
resfont = "noto"
ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
if not os.path.exists(ttf_path):
print("Downloading Noto font...")
urllib.request.urlretrieve(
"https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
ttf_path,
)
font_list.append(("noto", ttf_path))
noto = pymupdf.Font("noto", ttf_path)
else: # auto
resfont = "china-ss"
font_list.append(("china-ss", None))

doc_en = pymupdf.open(file)
page_count = doc_en.page_count
font_list = ["china-ss", "tiro"]
# font_list = [("china-ss", None), ("tiro", None)]
font_id = {}
for page in doc_en:
for font in font_list:
font_id[font] = page.insert_font(font)
font_id[font[0]] = page.insert_font(font[0], font[1])
xreflen = doc_en.xref_length()
for xref in range(1, xreflen):
for label in ["Resources/", ""]: # 可能是基于 xobj 的 res
Expand All @@ -93,11 +147,13 @@ def extract_text(
if font_res[0] == "dict":
for font in font_list:
font_exist = doc_en.xref_get_key(
xref, f"{label}Font/{font}"
xref, f"{label}Font/{font[0]}"
)
if font_exist[0] == "null":
doc_en.xref_set_key(
xref, f"{label}Font/{font}", f"{font_id[font]} 0 R"
xref,
f"{label}Font/{font[0]}",
f"{font_id[font[0]]} 0 R",
)
except Exception:
pass
Expand Down

0 comments on commit 45ec823

Please sign in to comment.