feat: support noto

Byaidu · Dec 3, 2024 · 45ec823
1 parent 2bf0834
commit 45ec823
Show file tree

Hide file tree

Showing 3 changed files with 79 additions and 10 deletions.
diff --git a/pdf2zh/converter.py b/pdf2zh/converter.py
@@ -26,6 +26,7 @@
     AzureTranslator,
     TencentTranslator,
 )
+from pymupdf import Font
 
 log = logging.getLogger(__name__)
 
@@ -123,12 +124,16 @@ def __init__(
         lang_in: str = "",
         lang_out: str = "",
         service: str = "",
+        resfont: str = "",
+        noto: Font = None,
     ) -> None:
         super().__init__(rsrcmgr)
         self.vfont = vfont
         self.vchar = vchar
         self.thread = thread
         self.layout = layout
+        self.resfont = resfont
+        self.noto = noto
         self.translator: BaseTranslator = None
         param = service.split(":", 1)
         if param[0] == "google":
@@ -343,7 +348,9 @@ def worker(s: str):  # 多线程翻译
         ############################################################
         # C. 新文档排版
         def raw_string(fcur: str, cstk: str):  # 编码字符串
-            if isinstance(self.fontmap[fcur], PDFCIDFont):  # 判断编码长度
+            if fcur == 'noto':
+                return "".join(["%04x" % self.noto.has_glyph(ord(c)) for c in cstk])
+            elif isinstance(self.fontmap[fcur], PDFCIDFont):  # 判断编码长度
                 return "".join(["%04x" % ord(c) for c in cstk])
             else:
                 return "".join(["%02x" % ord(c) for c in cstk])
@@ -388,13 +395,16 @@ def raw_string(fcur: str, cstk: str):  # 编码字符串
                     #     pass
                     try:
                         if fcur_ is None and self.fontmap["tiro"].to_unichr(ord(ch)) == ch:
-                            fcur_ = "tiro"  # 默认英文字体
+                            fcur_ = "tiro"  # 默认拉丁字体
                     except Exception:
                         pass
                     if fcur_ is None:
-                        fcur_ = "china-ss"  # 默认中文字体
+                        fcur_ = self.resfont  # 默认非拉丁字体
                     # print(self.fontid[font],fcur_,ch,font.char_width(ord(ch)))
-                    adv = self.fontmap[fcur_].char_width(ord(ch)) * size
+                    if fcur_ == 'noto':
+                        adv = self.noto.char_lengths(ch, size)[0]
+                    else:
+                        adv = self.fontmap[fcur_].char_width(ord(ch)) * size
                     ptr += 1
                 if (                                # 输出文字缓冲区
                     fcur_ != fcur                   # 1. 字体更新
@@ -406,7 +416,7 @@ def raw_string(fcur: str, cstk: str):  # 编码字符串
                         cstk = ""
                 if brk and x + adv > x1 + 0.1 * size:  # 到达右边界且原文段落存在换行
                     x = x0
-                    lang_space = {"zh-CN": 1.4, "zh-TW": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2}  # CJK
+                    lang_space = {"zh-CN": 1.4, "zh-TW": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8}
                     y -= size * lang_space.get(self.translator.lang_out, 1.1)  # 小语种大多适配 1.1
                 if vy_regex:  # 插入公式
                     fix = 0

diff --git a/pdf2zh/high_level.py b/pdf2zh/high_level.py
@@ -10,6 +10,7 @@
 from pdfminer.pdfparser import PDFParser
 from pdf2zh.converter import TranslateConverter
 from pdf2zh.pdfinterp import PDFPageInterpreterEx
+from pymupdf import Font
 
 
 def extract_text_to_fp(
@@ -26,13 +27,15 @@ def extract_text_to_fp(
     lang_in: str = "",
     lang_out: str = "",
     service: str = "",
+    resfont: str = "",
+    noto: Font = None,
     callback: object = None,
     **kwarg,
 ) -> None:
     rsrcmgr = PDFResourceManager()
     layout = {}
     device = TranslateConverter(
-        rsrcmgr, vfont, vchar, thread, layout, lang_in, lang_out, service
+        rsrcmgr, vfont, vchar, thread, layout, lang_in, lang_out, service, resfont, noto
     )
 
     assert device is not None

diff --git a/pdf2zh/pdf2zh.py b/pdf2zh/pdf2zh.py
@@ -11,10 +11,12 @@
 import logging
 from pathlib import Path
 from typing import Any, Container, Iterable, List, Optional
+import urllib.request
 from pdfminer.pdfexceptions import PDFValueError
 
 import pymupdf
 import requests
+import tempfile
 
 from pdf2zh import __version__, log
 from pdf2zh.high_level import extract_text_to_fp
@@ -24,6 +26,38 @@
 
 model = DocLayoutModel.load_available()
 
+resfont_map = {
+    "zh-CN": "china-ss",
+    "zh-TW": "china-ts",
+    "ja": "japan-s",
+    "ko": "korea-s",
+}
+noto_list = [
+    "am",  # Amharic
+    "ar",  # Arabic
+    "bn",  # Bengali
+    "bg",  # Bulgarian
+    "chr",  # Cherokee
+    "el",  # Greek
+    "gu",  # Gujarati
+    "iw",  # Hebrew
+    "hi",  # Hindi
+    # "ja",  # Japanese
+    "kn",  # Kannada
+    # "ko",  # Korean
+    "ml",  # Malayalam
+    "mr",  # Marathi
+    "ru",  # Russian
+    "sr",  # Serbian
+    # "zh-CN",# Chinese (PRC)
+    "ta",  # Tamil
+    "te",  # Telugu
+    "th",  # Thai
+    # "zh-TW",# Chinese (Taiwan)
+    "ur",  # Urdu
+    "uk",  # Ukrainian
+]
+
 
 def check_files(files: List[str]) -> List[str]:
     files = [
@@ -78,13 +112,33 @@ def extract_text(
                 )
         filename = os.path.splitext(os.path.basename(file))[0]
 
+        font_list = [("tiro", None)]
+        noto = None
+        if lang_out in resfont_map:  # CJK
+            resfont = resfont_map[lang_out]
+            font_list.append((resfont, None))
+        elif lang_out in noto_list:  # noto
+            resfont = "noto"
+            ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
+            if not os.path.exists(ttf_path):
+                print("Downloading Noto font...")
+                urllib.request.urlretrieve(
+                    "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
+                    ttf_path,
+                )
+            font_list.append(("noto", ttf_path))
+            noto = pymupdf.Font("noto", ttf_path)
+        else:  # auto
+            resfont = "china-ss"
+            font_list.append(("china-ss", None))
+
         doc_en = pymupdf.open(file)
         page_count = doc_en.page_count
-        font_list = ["china-ss", "tiro"]
+        # font_list = [("china-ss", None), ("tiro", None)]
         font_id = {}
         for page in doc_en:
             for font in font_list:
-                font_id[font] = page.insert_font(font)
+                font_id[font[0]] = page.insert_font(font[0], font[1])
         xreflen = doc_en.xref_length()
         for xref in range(1, xreflen):
             for label in ["Resources/", ""]:  # 可能是基于 xobj 的 res
@@ -93,11 +147,13 @@ def extract_text(
                     if font_res[0] == "dict":
                         for font in font_list:
                             font_exist = doc_en.xref_get_key(
-                                xref, f"{label}Font/{font}"
+                                xref, f"{label}Font/{font[0]}"
                             )
                             if font_exist[0] == "null":
                                 doc_en.xref_set_key(
-                                    xref, f"{label}Font/{font}", f"{font_id[font]} 0 R"
+                                    xref,
+                                    f"{label}Font/{font[0]}",
+                                    f"{font_id[font[0]]} 0 R",
                                 )
                 except Exception:
                     pass