Skip to content

Commit

Permalink
fix: zh codes
Browse files: browse the repository at this point in the history.
  • Loading branch information
Byaidu committed Dec 6, 2024
1 parent 4e9a99d commit 6981951
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 33 deletions.
4 changes: 2 additions & 2 deletions pdf2zh/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,8 +414,8 @@ def raw_string(fcur: str, cstk: str): # 编码字符串
cstk = ""
if brk and x + adv > x1 + 0.1 * size: # 到达右边界且原文段落存在换行
x = x0
lang_space = {"zh-CN": 1.4, "zh-TW": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8}
y -= size * lang_space.get(self.translator.lang_out, 1.1) # 小语种大多适配 1.1
lang_space = {"zh-cn": 1.4, "zh-tw": 1.4, "zh-hans": 1.4, "zh-hant": 1.4, "zh": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8}
y -= size * lang_space.get(self.translator.lang_out.lower(), 1.1) # 小语种大多适配 1.1
if vy_regex: # 插入公式
fix = 0
if fcur is not None: # 段落内公式修正纵向偏移
Expand Down
23 changes: 13 additions & 10 deletions pdf2zh/pdf2zh.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,11 @@
model = DocLayoutModel.load_available()

resfont_map = {
"zh-CN": "china-ss",
"zh-TW": "china-ts",
"zh-cn": "china-ss",
"zh-tw": "china-ts",
"zh-hans": "china-ss",
"zh-hant": "china-ts",
"zh": "china-ss",
"ja": "japan-s",
"ko": "korea-s",
}
Expand All @@ -49,11 +52,11 @@
"mr", # Marathi
"ru", # Russian
"sr", # Serbian
# "zh-CN",# Chinese (PRC)
# "zh-cn",# SC
"ta", # Tamil
"te", # Telugu
"th", # Thai
# "zh-TW",# Chinese (Taiwan)
# "zh-tw",# TC
"ur", # Urdu
"uk", # Ukrainian
]
Expand Down Expand Up @@ -114,10 +117,10 @@ def extract_text(

font_list = [("tiro", None)]
noto = None
if lang_out in resfont_map: # CJK
resfont = resfont_map[lang_out]
if lang_out.lower() in resfont_map: # CJK
resfont = resfont_map[lang_out.lower()]
font_list.append((resfont, None))
elif lang_out in noto_list: # noto
elif lang_out.lower() in noto_list: # noto
resfont = "noto"
ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
if not os.path.exists(ttf_path):
Expand All @@ -128,7 +131,7 @@ def extract_text(
)
font_list.append(("noto", ttf_path))
noto = pymupdf.Font("noto", ttf_path)
else: # auto
else: # fallback
resfont = "china-ss"
font_list.append(("china-ss", None))

Expand Down Expand Up @@ -240,14 +243,14 @@ def create_parser() -> argparse.ArgumentParser:
"--lang-in",
"-li",
type=str,
default="auto",
default="en",
help="The code of source language.",
)
parse_params.add_argument(
"--lang-out",
"-lo",
type=str,
default="auto",
default="zh",
help="The code of target language.",
)
parse_params.add_argument(
Expand Down
24 changes: 3 additions & 21 deletions pdf2zh/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,9 @@ class BaseTranslator:
envs = {}
lang_map = {}

def __init__(self, service, lang_out, lang_in, model):
lang_out = "zh" if lang_out == "auto" else lang_out
lang_in = "en" if lang_in == "auto" else lang_in
lang_out = self.lang_map.get(lang_out, lang_out)
lang_in = self.lang_map.get(lang_in, lang_in)
def __init__(self, service, lang_out: str, lang_in: str, model):
lang_out = self.lang_map.get(lang_out.lower(), lang_out)
lang_in = self.lang_map.get(lang_in.lower(), lang_in)
self.service = service
self.lang_out = lang_out
self.lang_in = lang_in
Expand Down Expand Up @@ -59,8 +57,6 @@ class GoogleTranslator(BaseTranslator):
lang_map = {"zh": "zh-CN"}

def __init__(self, service, lang_out, lang_in, model):
lang_out = "zh-CN" if lang_out == "auto" else lang_out
lang_in = "en" if lang_in == "auto" else lang_in
super().__init__(service, lang_out, lang_in, model)
self.session = requests.Session()
self.endpoint = "http://translate.google.com/m"
Expand Down Expand Up @@ -92,8 +88,6 @@ class BingTranslator(BaseTranslator):
lang_map = {"zh": "zh-Hans"}

def __init__(self, service, lang_out, lang_in, model):
lang_out = "zh-Hans" if lang_out == "auto" else lang_out
lang_in = "en" if lang_in == "auto" else lang_in
super().__init__(service, lang_out, lang_in, model)
self.session = requests.Session()
self.endpoint = "https://www.bing.com/ttranslatev3"
Expand Down Expand Up @@ -136,8 +130,6 @@ class TencentTranslator(BaseTranslator):
}

def __init__(self, service, lang_out, lang_in, model):
lang_out = "zh" if lang_out == "auto" else lang_out
lang_in = "en" if lang_in == "auto" else lang_in
super().__init__(service, lang_out, lang_in, model)
cred = credential.DefaultCredentialProvider().get_credential()
self.client = TmtClient(cred, "ap-beijing")
Expand All @@ -162,8 +154,6 @@ class DeepLTranslator(BaseTranslator):
lang_map = {"zh": "zh-Hans"}

def __init__(self, service, lang_out, lang_in, model):
lang_out = "zh" if lang_out == "auto" else lang_out
lang_in = "en" if lang_in == "auto" else lang_in
super().__init__(service, lang_out, lang_in, model)
self.session = requests.Session()
server_url = os.getenv("DEEPL_SERVER_URL")
Expand All @@ -186,8 +176,6 @@ class DeepLXTranslator(BaseTranslator):
lang_map = {"zh": "zh-Hans"}

def __init__(self, service, lang_out, lang_in, model):
lang_out = "zh" if lang_out == "auto" else lang_out
lang_in = "en" if lang_in == "auto" else lang_in
super().__init__(service, lang_out, lang_in, model)
self.endpoint = os.getenv("DEEPLX_ENDPOINT")
self.session = requests.Session()
Expand All @@ -213,8 +201,6 @@ class OllamaTranslator(BaseTranslator):
}

def __init__(self, service, lang_out, lang_in, model):
lang_out = "zh-CN" if lang_out == "auto" else lang_out
lang_in = "en" if lang_in == "auto" else lang_in
if not model:
model = os.getenv("OLLAMA_MODEL", self.envs["OLLAMA_MODEL"])
super().__init__(service, lang_out, lang_in, model)
Expand All @@ -240,8 +226,6 @@ class OpenAITranslator(BaseTranslator):
}

def __init__(self, service, lang_out, lang_in, model):
lang_out = "zh-CN" if lang_out == "auto" else lang_out
lang_in = "en" if lang_in == "auto" else lang_in
if not model:
model = os.getenv("OPENAI_MODEL", self.envs["OPENAI_MODEL"])
super().__init__(service, lang_out, lang_in, model)
Expand All @@ -267,8 +251,6 @@ class AzureTranslator(BaseTranslator):
lang_map = {"zh": "zh-Hans"}

def __init__(self, service, lang_out, lang_in, model):
lang_out = "zh-Hans" if lang_out == "auto" else lang_out
lang_in = "en" if lang_in == "auto" else lang_in
super().__init__(service, lang_out, lang_in, model)
endpoint = os.environ["AZURE_ENDPOINT"]
api_key = os.environ["AZURE_APIKEY"]
Expand Down

0 comments on commit 6981951

Please sign in to comment.