Skip to content

Commit

Permalink
Merge branch 'main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
Byaidu authored Dec 28, 2024
2 parents 3e47b8d + 246e3a1 commit c9d9e67
Show file tree
Hide file tree
Showing 10 changed files with 522 additions and 256 deletions.
2 changes: 0 additions & 2 deletions .github/workflows/python-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@ name: Test and Build Python Package

on:
push:
branches:
- main
pull_request:

jobs:
Expand Down
34 changes: 19 additions & 15 deletions docs/README_ja-JP.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

[English](../README.md) | [简体中文](README_zh-CN.md) | 日本語

<img src="./docs/images/banner.png" width="320px" alt="PDF2ZH"/>
<img src="./images/banner.png" width="320px" alt="PDF2ZH"/>

<h2 id="title">PDFMathTranslate</h2>

Expand Down Expand Up @@ -56,7 +56,7 @@
<h2 id="preview">プレビュー</h2>

<div align="center">
<img src="./docs/images/preview.gif" width="80%"/>
<img src="./images/preview.gif" width="80%"/>
</div>

<h2 id="demo">公共サービス 🌟</h2>
Expand Down Expand Up @@ -122,9 +122,9 @@ Python環境を事前にインストールする必要はありません
http://localhost:7860/
```

<img src="./docs/images/gui.gif" width="500"/>
<img src="./images/gui.gif" width="500"/>

詳細については、[GUIのドキュメント](./docs/README_GUI.md) を参照してください。
詳細については、[GUIのドキュメント](./README_GUI.md) を参照してください。

<h3 id="docker">方法4. Docker</h3>

Expand Down Expand Up @@ -158,7 +158,7 @@ Python環境を事前にインストールする必要はありません

コマンドラインで翻訳コマンドを実行し、現在の作業ディレクトリに翻訳されたドキュメント `example-mono.pdf` とバイリンガルドキュメント `example-dual.pdf` を生成します。デフォルトではGoogle翻訳サービスを使用します。

<img src="./docs/images/cmd.explained.png" width="580px" alt="cmd"/>
<img src="./images/cmd.explained.png" width="580px" alt="cmd"/>

以下の表に、参考のためにすべての高度なオプションをリストしました:

Expand Down Expand Up @@ -263,16 +263,18 @@ pdf2zh example.pdf -f "(CM[^R]|(MS|XY|MT|BL|RM|EU|LA|RS)[A-Z]|LINE|LCIRCLE|TeX-|
pdf2zh example.pdf -t 1
```
<h3 id="prompt">custom prompt</h3>
(needs Japanese translation)
Use `--prompt` to specify which prompt to use in llm:
<h3 id="prompt">カスタム プロンプト</h3>
`--prompt`を使用して、LLMで使用するプロンプトを指定します:
```bash
pdf2zh example.pdf -pr prompt.txt
```
example prompt.txt
```
`prompt.txt`の例:
```txt
[
{
"role": "system",
Expand All @@ -286,12 +288,14 @@ example prompt.txt
```
In a custom prompt file, three variables can be used.
|**variables**|**comment**|
カスタムプロンプトファイルでは、以下の3つの変数が使用できます。
|**変数**|**内容**|
|-|-|
|`lang_in`|input language|
|`lang_out`|output language|
|`text`|text need to be translated|
|`lang_in`|ソース言語|
|`lang_out`|ターゲット言語|
|`text`|翻訳するテキスト|
<h2 id="todo">API</h2>
### Python
Expand Down
10 changes: 5 additions & 5 deletions docs/README_zh-CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

[English](../README.md) | 简体中文 | [日本語](README_ja-JP.md)

<img src="./docs/images/banner.png" width="320px" alt="PDF2ZH"/>
<img src="./images/banner.png" width="320px" alt="PDF2ZH"/>

<h2 id="title">PDFMathTranslate</h2>

Expand Down Expand Up @@ -56,7 +56,7 @@
<h2 id="preview">效果预览</h2>

<div align="center">
<img src="./docs/images/preview.gif" width="80%"/>
<img src="./images/preview.gif" width="80%"/>
</div>

<h2 id="demo">在线演示 🌟</h2>
Expand Down Expand Up @@ -121,9 +121,9 @@ set HF_ENDPOINT=https://hf-mirror.com
http://localhost:7860/
```

<img src="./docs/images/gui.gif" width="500"/>
<img src="./images/gui.gif" width="500"/>

查看 [documentation for GUI](./docs/README_GUI.md) 获取细节说明
查看 [documentation for GUI](./README_GUI.md) 获取细节说明

<h3 id="docker">方法四、容器化部署</h3>

Expand Down Expand Up @@ -157,7 +157,7 @@ set HF_ENDPOINT=https://hf-mirror.com

在命令行中执行翻译命令,在当前工作目录下生成译文文档 `example-mono.pdf` 和双语对照文档 `example-dual.pdf`,默认使用 Google 翻译服务

<img src="./docs/images/cmd.explained.png" width="580px" alt="cmd"/>
<img src="./images/cmd.explained.png" width="580px" alt="cmd"/>

我们在下表中列出了所有高级选项,以供参考:

Expand Down
230 changes: 140 additions & 90 deletions pdf2zh/cache.py
Original file line number Diff line number Diff line change
@@ -1,91 +1,141 @@
import tempfile
import os
import time
import hashlib
import shutil

# Root directory that holds every cache sub-directory; lives under the
# system temporary directory.
cache_dir = os.path.join(tempfile.gettempdir(), "cache")
# Make sure the root exists; an already-existing directory is not an error.
os.makedirs(cache_dir, exist_ok=True)
# Name of the file, inside each cache directory, that stores its timestamp
# (written by write_time(), read by get_time()).
time_filename = "update_time"
# Upper bound on the number of cache directories kept by remove_extra().
max_cache = 5


def deterministic_hash(obj):
    """Return a short, stable hex identifier for ``obj``.

    The object is converted with ``str()``, encoded to bytes and hashed with
    SHA-256; only the first 20 hexadecimal characters of the digest are kept.
    """
    digest = hashlib.sha256(str(obj).encode()).hexdigest()
    return digest[:20]


def get_dirs():
    """Return the full paths of all sub-directories directly under ``cache_dir``.

    Entries of ``cache_dir`` that are not directories are left out.
    """
    found = []
    for entry in os.listdir(cache_dir):
        candidate = os.path.join(cache_dir, entry)
        if os.path.isdir(candidate):
            found.append(candidate)
    return found


def get_time(dir):
    """Return the timestamp stored in a cache directory's time file.

    Args:
        dir: Path of the cache directory to inspect.

    Returns:
        The stored timestamp as a float, or ``float("inf")`` when the time
        file does not exist.

    Raises:
        ValueError: If the time file's content cannot be parsed as a float.
    """
    timefile = os.path.join(dir, time_filename)
    try:
        # Use a context manager so the handle is closed deterministically
        # instead of whenever the garbage collector gets to it.
        with open(timefile, encoding="utf-8") as f:
            return float(f.read())
    except FileNotFoundError:
        # NOTE(review): an earlier comment claimed this value makes the
        # directory the first to be removed, but remove_extra() evicts the
        # directory with the *smallest* time, so ``inf`` actually makes it the
        # last candidate. The value is kept to preserve behaviour; confirm
        # which of the two was intended.
        return float("inf")


def write_time(dir):
    """Record the current time as the timestamp of a cache directory.

    Args:
        dir: Path of the cache directory; its time file is created or
            overwritten with ``str(time.time())``.
    """
    timefile = os.path.join(dir, time_filename)
    # The context manager guarantees the data is flushed and the handle is
    # closed before returning.
    with open(timefile, "w", encoding="utf-8") as f:
        f.write(str(time.time()))


def argmin(iterable):
    """Return the index of the smallest element of ``iterable``.

    When several elements tie for the minimum, the index of the first one is
    returned. An empty iterable raises ``ValueError`` (from ``min``).
    """

    def value_of(pair):
        # ``pair`` is an (index, value) tuple produced by enumerate().
        return pair[1]

    best_index, _ = min(enumerate(iterable), key=value_of)
    return best_index


def remove_extra():
    """Prune the cache root so that at most ``max_cache`` directories remain.

    First, cache directories whose timestamp cannot be read (unreadable or
    malformed time file) are deleted. Then, if more than ``max_cache``
    directories are left, the ones with the smallest timestamps are deleted
    until the limit is met.
    """
    for path in get_dirs():
        if not os.path.isdir(path):
            # get_dirs() only returns directories, so this can only happen if
            # the entry changed after it was listed.
            os.remove(path)
            continue
        try:
            get_time(path)
        except (OSError, ValueError):
            # Only I/O and parse failures mark a cache as broken; do not
            # swallow KeyboardInterrupt / SystemExit here.
            shutil.rmtree(path)

    remaining = get_dirs()
    excess = len(remaining) - max_cache
    if excess > 0:
        # sorted() is stable, so ties are evicted in listing order, the same
        # order repeated arg-min selection would pick.
        for stale in sorted(remaining, key=get_time)[:excess]:
            shutil.rmtree(stale)


def is_cached(hash_key):
    """Report whether a cache entry named ``hash_key`` exists under ``cache_dir``."""
    return os.path.exists(os.path.join(cache_dir, hash_key))


def create_cache(hash_key):
    """Ensure the cache directory for ``hash_key`` exists and refresh its timestamp."""
    target = os.path.join(cache_dir, hash_key)
    # An already-existing directory is fine; only its timestamp gets renewed.
    os.makedirs(target, exist_ok=True)
    write_time(target)


def load_paragraph(hash_key, hash_key_paragraph):
    """Return a cached paragraph, or ``None`` when it is not cached.

    Args:
        hash_key: Name of the cache directory under ``cache_dir``.
        hash_key_paragraph: Name of the paragraph file inside that directory.

    Returns:
        The file's text content, or ``None`` if the file does not exist.
    """
    filename = os.path.join(cache_dir, hash_key, hash_key_paragraph)
    try:
        # Open directly instead of checking existence first: this avoids the
        # check-then-open race, and the context manager closes the handle.
        with open(filename, encoding="utf-8") as f:
            return f.read()
    except (FileNotFoundError, NotADirectoryError):
        # Both mean "no such entry", matching what os.path.exists() reported.
        return None


def write_paragraph(hash_key, hash_key_paragraph, paragraph):
    """Store a paragraph in the cache, overwriting any previous content.

    Args:
        hash_key: Name of the cache directory under ``cache_dir``.
        hash_key_paragraph: Name of the paragraph file inside that directory.
        paragraph: Value to store; it is written as ``str(paragraph)`` with no
            trailing newline.
    """
    filename = os.path.join(cache_dir, hash_key, hash_key_paragraph)
    # The context manager flushes and closes the file before returning, so a
    # subsequent load_paragraph() is guaranteed to see the full content.
    with open(filename, "w", encoding="utf-8") as f:
        print(paragraph, file=f, end="")
import json
from peewee import Model, SqliteDatabase, AutoField, CharField, TextField, SQL
from typing import Optional


# Deferred initialization: no database file is given here. init_db() supplies
# the actual path later through db.init().
db = SqliteDatabase(None)


class _TranslationCache(Model):
    """Peewee model holding one cached translation per row.

    A row is identified by the combination of translation engine, the
    engine's serialized parameters and the original text.
    """

    id = AutoField()
    # Engine name; TranslationCache.__init__ asserts it is shorter than 20 characters.
    translate_engine = CharField(max_length=20)
    # JSON string produced by TranslationCache.replace_params().
    translate_engine_params = TextField()
    original_text = TextField()
    translation = TextField()

    class Meta:
        database = db
        # Table-level UNIQUE constraint over the lookup key. ON CONFLICT
        # REPLACE makes inserting an existing key overwrite the old row.
        # The SQL text is kept verbatim.
        constraints = [
            SQL(
                """
UNIQUE (
translate_engine,
translate_engine_params,
original_text
)
ON CONFLICT REPLACE
"""
            )
        ]


class TranslationCache:
    """Translation cache scoped to one engine and one set of engine parameters.

    Entries are looked up by ``(translate_engine, translate_engine_params,
    original_text)``, where ``translate_engine_params`` is the JSON
    serialization of the parameters with dictionary keys sorted, so that two
    parameter dicts with the same content always map to the same cache key.
    """

    @staticmethod
    def _sort_dict_recursively(obj):
        """Return ``obj`` rebuilt with the keys of every nested dict sorted.

        Dicts are rebuilt in sorted-key order, lists are processed element by
        element (their order is kept), and any other value is returned as is.
        """
        if isinstance(obj, dict):
            return {
                k: TranslationCache._sort_dict_recursively(obj[k])
                for k in sorted(obj)
            }
        if isinstance(obj, list):
            return [TranslationCache._sort_dict_recursively(item) for item in obj]
        return obj

    def __init__(
        self, translate_engine: str, translate_engine_params: Optional[dict] = None
    ):
        """Create a cache view for ``translate_engine``.

        Args:
            translate_engine: Engine name; must be shorter than 20 characters.
            translate_engine_params: Optional engine parameters that become
                part of the cache key. ``None`` means no parameters.

        Raises:
            AssertionError: If the engine name is 20 characters or longer.
        """
        assert (
            len(translate_engine) < 20
        ), "current cache require translate engine name less than 20 characters"
        self.translate_engine = translate_engine
        self.replace_params(translate_engine_params)

    # The program typically starts multi-threaded translation
    # only after cache parameters are fully configured,
    # so thread safety doesn't need to be considered here.
    def replace_params(self, params: Optional[dict] = None):
        """Replace all engine parameters and recompute the serialized key.

        Args:
            params: New parameters; ``None`` is treated as an empty dict.
        """
        if params is None:
            params = {}
        # Keep a private shallow copy: update_params() and add_params() mutate
        # self.params in place and must not modify the caller's dictionary.
        self.params = dict(params)
        self.translate_engine_params = json.dumps(
            self._sort_dict_recursively(self.params)
        )

    def update_params(self, params: Optional[dict] = None):
        """Merge ``params`` into the current parameters and refresh the key.

        Args:
            params: Parameters to add or overwrite; ``None`` changes nothing.
        """
        if params is None:
            params = {}
        self.params.update(params)
        self.replace_params(self.params)

    def add_params(self, k: str, v):
        """Set a single parameter ``k`` to ``v`` and refresh the key."""
        self.params[k] = v
        self.replace_params(self.params)

    # Since peewee and the underlying sqlite are thread-safe,
    # get and set operations don't need locks.
    def get(self, original_text: str) -> Optional[str]:
        """Return the cached translation of ``original_text``, or ``None`` if absent."""
        result = _TranslationCache.get_or_none(
            translate_engine=self.translate_engine,
            translate_engine_params=self.translate_engine_params,
            original_text=original_text,
        )
        return result.translation if result else None

    def set(self, original_text: str, translation: str):
        """Store ``translation`` for ``original_text`` under the current key."""
        _TranslationCache.create(
            translate_engine=self.translate_engine,
            translate_engine_params=self.translate_engine_params,
            original_text=original_text,
            translation=translation,
        )


def init_db(remove_exists=False):
    """Point the module-level ``db`` at the on-disk cache and create its table.

    The database lives at ``~/.cache/pdf2zh/cache.v1.db``; the folder is
    created when missing.

    Args:
        remove_exists: When true, delete any existing database file (and its
            WAL side files) before initializing, which starts an empty cache.
    """
    cache_folder = os.path.join(os.path.expanduser("~"), ".cache", "pdf2zh")
    os.makedirs(cache_folder, exist_ok=True)
    # The current version does not support database migration, so add the version number to the file name.
    cache_db_path = os.path.join(cache_folder, "cache.v1.db")
    if remove_exists:
        # The database runs in WAL mode, so "-wal"/"-shm" side files may sit
        # next to it. Remove them together with the main file so a stale
        # write-ahead log is never paired with the fresh database.
        for suffix in ("", "-wal", "-shm"):
            stale_path = cache_db_path + suffix
            if os.path.exists(stale_path):
                os.remove(stale_path)
    db.init(
        cache_db_path,
        pragmas={
            "journal_mode": "wal",
            "busy_timeout": 1000,
        },
    )
    db.create_tables([_TranslationCache], safe=True)


def init_test_db():
    """Create a throw-away SQLite database and bind the cache model to it.

    Returns:
        The connected test database. Pass it to ``clean_test_db()`` to drop
        the table and delete the file afterwards.
    """
    import tempfile

    # mkstemp() atomically creates the file, unlike the deprecated and
    # race-prone mktemp(). SQLite accepts an existing empty file as a new
    # database, so only the descriptor has to be released here.
    fd, cache_db_path = tempfile.mkstemp(suffix=".db")
    os.close(fd)
    test_db = SqliteDatabase(
        cache_db_path,
        pragmas={
            "journal_mode": "wal",
            "busy_timeout": 1000,
        },
    )
    test_db.bind([_TranslationCache], bind_refs=False, bind_backrefs=False)
    test_db.connect()
    test_db.create_tables([_TranslationCache], safe=True)
    return test_db


def clean_test_db(test_db):
    """Drop the cache table, close ``test_db`` and delete its files.

    Besides the database file itself, the ``-wal`` and ``-shm`` side files
    are removed when they exist.
    """
    test_db.drop_tables([_TranslationCache])
    test_db.close()
    base_path = test_db.database
    for suffix in ("", "-wal", "-shm"):
        leftover = base_path + suffix
        if os.path.exists(leftover):
            os.remove(leftover)


# Initialize the default on-disk cache database when this module is imported.
init_db()
11 changes: 1 addition & 10 deletions pdf2zh/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
import numpy as np
import unicodedata
from tenacity import retry, wait_fixed
from pdf2zh import cache
from pdf2zh.translator import (
AzureOpenAITranslator,
BaseTranslator,
Expand Down Expand Up @@ -329,21 +328,13 @@ def vflag(font: str, char: str): # 匹配公式(和角标)字体
############################################################
# B. 段落翻译
log.debug("\n==========[SSTACK]==========\n")
hash_key = cache.deterministic_hash("PDFMathTranslate")
cache.create_cache(hash_key)

@retry(wait=wait_fixed(1))
def worker(s: str): # 多线程翻译
if not s.strip() or re.match(r"^\{v\d+\}$", s): # 空白和公式不翻译
return s
try:
hash_key_paragraph = cache.deterministic_hash(
(s, str(self.translator))
)
new = cache.load_paragraph(hash_key, hash_key_paragraph) # 查询缓存
if new is None:
new = self.translator.translate(s)
cache.write_paragraph(hash_key, hash_key_paragraph, new)
new = self.translator.translate(s)
return new
except BaseException as e:
if log.isEnabledFor(logging.DEBUG):
Expand Down
Loading

0 comments on commit c9d9e67

Please sign in to comment.