From 1d803b2bcb72e79c95320c7ea086186f70c6ea97 Mon Sep 17 00:00:00 2001 From: hellofinch Date: Wed, 15 Jan 2025 10:27:35 +0800 Subject: [PATCH] merge from main --- .github/workflows/python-build.yml | 104 +- Dockerfile | 52 +- README.md | 528 ++++----- docs/ADVANCED.md | 482 ++++----- docs/README_ja-JP.md | 778 +++++++------- docs/README_zh-CN.md | 776 ++++++------- docs/README_zh-TW.md | 730 ++++++------- pdf2zh/backend.py | 192 ++-- pdf2zh/config.py | 428 ++++---- pdf2zh/converter.py | 1070 +++++++++--------- pdf2zh/doclayout.py | 362 +++---- pdf2zh/gui.py | 1424 ++++++++++++------------ pdf2zh/high_level.py | 794 +++++++------- pdf2zh/pdf2zh.py | 584 +++++----- pdf2zh/pdfinterp.py | 728 ++++++------- pdf2zh/translator.py | 1612 ++++++++++++++-------------- pyproject.toml | 128 +-- script/setup.bat | 57 +- test/test_translator.py | 296 ++--- 19 files changed, 5519 insertions(+), 5606 deletions(-) diff --git a/.github/workflows/python-build.yml b/.github/workflows/python-build.yml index 44283199..4e312074 100644 --- a/.github/workflows/python-build.yml +++ b/.github/workflows/python-build.yml @@ -1,52 +1,52 @@ -name: Test and Build Python Package - -on: - push: - pull_request: - -jobs: - build: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.11' # avoid bugs in new versions - cache: 'pip' - cache-dependency-path: pyproject.toml - - - name: Install dependencies - run: | - pip3 install -U pip - pip3 install -e .[dev] - - - name: Test - Unit Test - run: | - pytest . 
- - - name: Test - Translate a PDF file with plain text only - run: - pdf2zh ./test/file/translate.cli.plain.text.pdf -o ./test/file - - - name: Test - Translate a PDF file figure - run: - pdf2zh ./test/file/translate.cli.text.with.figure.pdf -o ./test/file - - # - name: Test - Translate a PDF file with unknown font - # run: - # pdf2zh ./test/file/translate.cli.font.unknown.pdf - - - name: Test - Start GUI and exit - run: - timeout 10 pdf2zh -i || code=$?; if [[ $code -ne 124 && $code -ne 0 ]]; then exit $code; fi - - - name: Build as a package - run: python -m build - - - name: Upload test results - uses: actions/upload-artifact@v4 - with: - name: test-results - path: ./test/file/ +name: Test and Build Python Package + +on: + push: + pull_request: + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' # avoid bugs in new versions + cache: 'pip' + cache-dependency-path: pyproject.toml + + - name: Install dependencies + run: | + pip3 install -U pip + pip3 install -e .[dev] + + - name: Test - Unit Test + run: | + pytest . 
+ + - name: Test - Translate a PDF file with plain text only + run: + pdf2zh ./test/file/translate.cli.plain.text.pdf -o ./test/file + + - name: Test - Translate a PDF file figure + run: + pdf2zh ./test/file/translate.cli.text.with.figure.pdf -o ./test/file + + # - name: Test - Translate a PDF file with unknown font + # run: + # pdf2zh ./test/file/translate.cli.font.unknown.pdf + + - name: Test - Start GUI and exit + run: + timeout 10 pdf2zh -i || code=$?; if [[ $code -ne 124 && $code -ne 0 ]]; then exit $code; fi + + - name: Build as a package + run: python -m build + + - name: Upload test results + uses: actions/upload-artifact@v4 + with: + name: test-results + path: ./test/file/ diff --git a/Dockerfile b/Dockerfile index 2fe5d723..200c154e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,26 +1,26 @@ -FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim - -WORKDIR /app - - -EXPOSE 7860 - -ENV PYTHONUNBUFFERED=1 - -# Download all required fonts -ADD "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf" /app/ -ADD "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerifCN-Regular.ttf" /app/ -ADD "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerifTW-Regular.ttf" /app/ -ADD "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerifJP-Regular.ttf" /app/ -ADD "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerifKR-Regular.ttf" /app/ - -RUN apt-get update && \ - apt-get install --no-install-recommends -y libgl1 && \ - rm -rf /var/lib/apt/lists/* && uv pip install --system --no-cache huggingface-hub && \ - python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('wybxc/DocLayout-YOLO-DocStructBench-onnx','doclayout_yolo_docstructbench_imgsz1024.onnx');" - -COPY . . - -RUN uv pip install --system --no-cache . 
- -CMD ["pdf2zh", "-i"] +FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim + +WORKDIR /app + + +EXPOSE 7860 + +ENV PYTHONUNBUFFERED=1 + +# Download all required fonts +ADD "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf" /app/ +ADD "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerifCN-Regular.ttf" /app/ +ADD "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerifTW-Regular.ttf" /app/ +ADD "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerifJP-Regular.ttf" /app/ +ADD "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerifKR-Regular.ttf" /app/ + +RUN apt-get update && \ + apt-get install --no-install-recommends -y libgl1 && \ + rm -rf /var/lib/apt/lists/* && uv pip install --system --no-cache huggingface-hub && \ + python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('wybxc/DocLayout-YOLO-DocStructBench-onnx','doclayout_yolo_docstructbench_imgsz1024.onnx');" + +COPY . . + +RUN uv pip install --system --no-cache . + +CMD ["pdf2zh", "-i"] diff --git a/README.md b/README.md index 4a5986b1..5fd04d89 100644 --- a/README.md +++ b/README.md @@ -1,264 +1,264 @@ -
- -English | [简体中文](docs/README_zh-CN.md) | [繁體中文](docs/README_zh-TW.md) | [日本語](docs/README_ja-JP.md) - -PDF2ZH - -

PDFMathTranslate

- -

- - - - - - - - - - - - - - - - - - - - -

- -Byaidu%2FPDFMathTranslate | Trendshift - -
- -PDF scientific paper translation and bilingual comparison. - -- 📊 Preserve formulas, charts, table of contents, and annotations _([preview](#preview))_. -- 🌐 Support [multiple languages](#language), and diverse [translation services](#services). -- 🤖 Provides [commandline tool](#usage), [interactive user interface](#gui), and [Docker](#docker) - -Feel free to provide feedback in [GitHub Issues](https://github.com/Byaidu/PDFMathTranslate/issues), [Telegram Group](https://t.me/+Z9_SgnxmsmA5NzBl) or [QQ Group](https://qm.qq.com/q/DixZCxQej0). - -For details on how to contribute, please consult the [Contribution Guide](https://github.com/Byaidu/PDFMathTranslate/wiki/Contribution-Guide---%E8%B4%A1%E7%8C%AE%E6%8C%87%E5%8D%97). - -

Updates

- -- [Dec. 24 2024] The translator now supports local models on [Xinference](https://github.com/xorbitsai/inference) _(by [@imClumsyPanda](https://github.com/imClumsyPanda))_ -- [Dec. 19 2024] Non-PDF/A documents are now supported using `-cp` _(by [@reycn](https://github.com/reycn))_ -- [Dec. 13 2024] Additional support for backend by _(by [@YadominJinta](https://github.com/YadominJinta))_ -- [Dec. 10 2024] The translator now supports OpenAI models on Azure _(by [@yidasanqian](https://github.com/yidasanqian))_ - -

Preview

- -
- -
- -

Online Service 🌟

- -You can try our application out using either of the following demos: - -- [Public free service](https://pdf2zh.com/) online without installation _(recommended)_. -- [Demo hosted on HuggingFace](https://huggingface.co/spaces/reycn/PDFMathTranslate-Docker) -- [Demo hosted on ModelScope](https://www.modelscope.cn/studios/AI-ModelScope/PDFMathTranslate) without installation. - -Note that the computing resources of the demo are limited, so please avoid abusing them. - -

Installation and Usage

- -### Methods - -For different use cases, we provide four distinct methods to use our program: - -
- 1. Commandline - -1. Python installed (3.8 <= version <= 3.12) -2. Install our package: - - ```bash - pip install pdf2zh - ``` - -3. Execute translation, files generated in [current working directory](https://chatgpt.com/share/6745ed36-9acc-800e-8a90-59204bd13444): - - ```bash - pdf2zh document.pdf - ``` - -
- -
- 2. Portable (w/o Python installed) - -1. Download [setup.bat](https://raw.githubusercontent.com/Byaidu/PDFMathTranslate/refs/heads/main/script/setup.bat) - -2. Double-click to run. - -
- -
- 3. Graphic user interface -1. Python installed (3.8 <= version <= 3.12) -2. Install our package: - -```bash -pip install pdf2zh -``` - -3. Start using in browser: - - ```bash - pdf2zh -i - ``` - -4. If your browswer has not been started automatically, goto - - ```bash - http://localhost:7860/ - ``` - - - -See [documentation for GUI](./docs/README_GUI.md) for more details. - -
- -
- 4. Docker - -1. Pull and run: - - ```bash - docker pull byaidu/pdf2zh - docker run -d -p 7860:7860 byaidu/pdf2zh - ``` - -2. Open in browser: - - ``` - http://localhost:7860/ - ``` - -For docker deployment on cloud service: - -
- - Deploy - - Deploy to Koyeb - - Deploy on Zeabur - - Deploy to Koyeb -
- -
- -### Unable to install? - -The present program needs an AI model(`wybxc/DocLayout-YOLO-DocStructBench-onnx`) before working and some users are not able to download due to network issues. If you have a problem with downloading this model, we provide a workaround using the following environment variable: - -```shell -set HF_ENDPOINT=https://hf-mirror.com -``` - -For PowerShell user: -```shell -$env:HF_ENDPOINT = https://hf-mirror.com -``` - -If the solution does not work to you / you encountered other issues, please refer to [frequently asked questions](https://github.com/Byaidu/PDFMathTranslate/wiki#-faq--%E5%B8%B8%E8%A7%81%E9%97%AE%E9%A2%98). - -

Advanced Options

- -Execute the translation command in the command line to generate the translated document `example-mono.pdf` and the bilingual document `example-dual.pdf` in the current working directory. Use Google as the default translation service. More support translation services can find [HERE](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#services). - -cmd - -In the following table, we list all advanced options for reference: - -| Option | Function | Example | -| -------------- | ------------------------------------------------------------------------------------------------------------- | ---------------------------------------------- | -| files | Local files | `pdf2zh ~/local.pdf` | -| links | Online files | `pdf2zh http://arxiv.org/paper.pdf` | -| `-i` | [Enter GUI](#gui) | `pdf2zh -i` | -| `-p` | [Partial document translation](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#partial) | `pdf2zh example.pdf -p 1` | -| `-li` | [Source language](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#languages) | `pdf2zh example.pdf -li en` | -| `-lo` | [Target language](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#languages) | `pdf2zh example.pdf -lo zh` | -| `-s` | [Translation service](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#services) | `pdf2zh example.pdf -s deepl` | -| `-t` | [Multi-threads](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#threads) | `pdf2zh example.pdf -t 1` | -| `-o` | Output dir | `pdf2zh example.pdf -o output` | -| `-f`, `-c` | [Exceptions](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#exceptions) | `pdf2zh example.pdf -f "(MS.*)"` | -| `-cp` | Compatibility Mode | `pdf2zh example.pdf --compatible` | -| `--share` | Public link | `pdf2zh -i --share` | -| `--authorized` | [Authorization](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#auth) | `pdf2zh -i --authorized 
users.txt [auth.html]` | -| `--prompt` | [Custom Prompt](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#prompt) | `pdf2zh --prompt [prompt.txt]` | -| `--onnx` | [Use Custom DocLayout-YOLO ONNX model] | `pdf2zh --onnx [onnx/model/path]` | -| `--serverport` | [Use Custom WebUI port] | `pdf2zh --serverport 7860` | -| `--dir` | [batch translate] | `pdf2zh --dir /path/to/translate/` | -| `--config` | [configuration file](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#cofig) | `pdf2zh --config /path/to/config/config.json` | -| `--serverport` | [custom gradio server port] | `pdf2zh --serverport 7860` | - -For detailed explanations, please refer to our document about [Advanced Usage](./docs/ADVANCED.md) for a full list of each option. - -

Secondary Development (APIs)

- -For downstream applications, please refer to our document about [API Details](./docs/APIS.md) for futher information about: - -- [Python API](./docs/APIS.md#api-python), how to use the program in other Python programs -- [HTTP API](./docs/APIS.md#api-http), how to communicate with a server with the program installed - -

TODOs

- -- [ ] Parse layout with DocLayNet based models, [PaddleX](https://github.com/PaddlePaddle/PaddleX/blob/17cc27ac3842e7880ca4aad92358d3ef8555429a/paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py#L81), [PaperMage](https://github.com/allenai/papermage/blob/9cd4bb48cbedab45d0f7a455711438f1632abebe/README.md?plain=1#L102), [SAM2](https://github.com/facebookresearch/sam2) - -- [ ] Fix page rotation, table of contents, format of lists - -- [ ] Fix pixel formula in old papers - -- [ ] Async retry except KeyboardInterrupt - -- [ ] Knuth–Plass algorithm for western languages - -- [ ] Support non-PDF/A files - -- [ ] Plugins of [Zotero](https://github.com/zotero/zotero) and [Obsidian](https://github.com/obsidianmd/obsidian-releases) - -

Acknowledgements

- -- Document merging: [PyMuPDF](https://github.com/pymupdf/PyMuPDF) - -- Document parsing: [Pdfminer.six](https://github.com/pdfminer/pdfminer.six) - -- Document extraction: [MinerU](https://github.com/opendatalab/MinerU) - -- Document Preview: [Gradio PDF](https://github.com/freddyaboulton/gradio-pdf) - -- Multi-threaded translation: [MathTranslate](https://github.com/SUSYUSTC/MathTranslate) - -- Layout parsing: [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO) - -- Document standard: [PDF Explained](https://zxyle.github.io/PDF-Explained/), [PDF Cheat Sheets](https://pdfa.org/resource/pdf-cheat-sheets/) - -- Multilingual Font: [Go Noto Universal](https://github.com/satbyy/go-noto-universal) - -

Contributors

- - - - - -![Alt](https://repobeats.axiom.co/api/embed/dfa7583da5332a11468d686fbd29b92320a6a869.svg "Repobeats analytics image") - -

Star History

- - - - - - Star History Chart - - +
+ +English | [简体中文](docs/README_zh-CN.md) | [繁體中文](docs/README_zh-TW.md) | [日本語](docs/README_ja-JP.md) + +PDF2ZH + +

PDFMathTranslate

+ +

+ + + + + + + + + + + + + + + + + + + + +

+ +Byaidu%2FPDFMathTranslate | Trendshift + +
+ +PDF scientific paper translation and bilingual comparison. + +- 📊 Preserve formulas, charts, table of contents, and annotations _([preview](#preview))_. +- 🌐 Support [multiple languages](#language), and diverse [translation services](#services). +- 🤖 Provides [commandline tool](#usage), [interactive user interface](#gui), and [Docker](#docker) + +Feel free to provide feedback in [GitHub Issues](https://github.com/Byaidu/PDFMathTranslate/issues), [Telegram Group](https://t.me/+Z9_SgnxmsmA5NzBl) or [QQ Group](https://qm.qq.com/q/DixZCxQej0). + +For details on how to contribute, please consult the [Contribution Guide](https://github.com/Byaidu/PDFMathTranslate/wiki/Contribution-Guide---%E8%B4%A1%E7%8C%AE%E6%8C%87%E5%8D%97). + +

Updates

+ +- [Dec. 24 2024] The translator now supports local models on [Xinference](https://github.com/xorbitsai/inference) _(by [@imClumsyPanda](https://github.com/imClumsyPanda))_ +- [Dec. 19 2024] Non-PDF/A documents are now supported using `-cp` _(by [@reycn](https://github.com/reycn))_ +- [Dec. 13 2024] Additional support for backend by _(by [@YadominJinta](https://github.com/YadominJinta))_ +- [Dec. 10 2024] The translator now supports OpenAI models on Azure _(by [@yidasanqian](https://github.com/yidasanqian))_ + +

Preview

+ +
+ +
+ +

Online Service 🌟

+ +You can try our application out using either of the following demos: + +- [Public free service](https://pdf2zh.com/) online without installation _(recommended)_. +- [Demo hosted on HuggingFace](https://huggingface.co/spaces/reycn/PDFMathTranslate-Docker) +- [Demo hosted on ModelScope](https://www.modelscope.cn/studios/AI-ModelScope/PDFMathTranslate) without installation. + +Note that the computing resources of the demo are limited, so please avoid abusing them. + +

Installation and Usage

+ +### Methods + +For different use cases, we provide four distinct methods to use our program: + +
+ 1. Commandline + +1. Python installed (3.8 <= version <= 3.12) +2. Install our package: + + ```bash + pip install pdf2zh + ``` + +3. Execute translation, files generated in [current working directory](https://chatgpt.com/share/6745ed36-9acc-800e-8a90-59204bd13444): + + ```bash + pdf2zh document.pdf + ``` + +
+ +
+ 2. Portable (w/o Python installed) + +1. Download [setup.bat](https://raw.githubusercontent.com/Byaidu/PDFMathTranslate/refs/heads/main/script/setup.bat) + +2. Double-click to run. + +
+ +
+ 3. Graphic user interface +1. Python installed (3.8 <= version <= 3.12) +2. Install our package: + +```bash +pip install pdf2zh +``` + +3. Start using in browser: + + ```bash + pdf2zh -i + ``` + +4. If your browser has not been started automatically, go to + + ```bash + http://localhost:7860/ + ``` + + + +See [documentation for GUI](./docs/README_GUI.md) for more details. + +
+ +
+ 4. Docker + +1. Pull and run: + + ```bash + docker pull byaidu/pdf2zh + docker run -d -p 7860:7860 byaidu/pdf2zh + ``` + +2. Open in browser: + + ``` + http://localhost:7860/ + ``` + +For docker deployment on cloud service: + +
+ + Deploy + + Deploy to Koyeb + + Deploy on Zeabur + + Deploy to Koyeb +
+ +
+ +### Unable to install? + +The present program needs an AI model (`wybxc/DocLayout-YOLO-DocStructBench-onnx`) before working and some users are not able to download due to network issues. If you have a problem with downloading this model, we provide a workaround using the following environment variable: + +```shell +set HF_ENDPOINT=https://hf-mirror.com +``` + +For PowerShell users: +```shell +$env:HF_ENDPOINT = "https://hf-mirror.com" +``` + +If the solution does not work for you / you encounter other issues, please refer to [frequently asked questions](https://github.com/Byaidu/PDFMathTranslate/wiki#-faq--%E5%B8%B8%E8%A7%81%E9%97%AE%E9%A2%98). + +

Advanced Options

+ +Execute the translation command in the command line to generate the translated document `example-mono.pdf` and the bilingual document `example-dual.pdf` in the current working directory. Use Google as the default translation service. More support translation services can find [HERE](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#services). + +cmd + +In the following table, we list all advanced options for reference: + +| Option | Function | Example | +| -------------- | ------------------------------------------------------------------------------------------------------------- | ---------------------------------------------- | +| files | Local files | `pdf2zh ~/local.pdf` | +| links | Online files | `pdf2zh http://arxiv.org/paper.pdf` | +| `-i` | [Enter GUI](#gui) | `pdf2zh -i` | +| `-p` | [Partial document translation](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#partial) | `pdf2zh example.pdf -p 1` | +| `-li` | [Source language](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#languages) | `pdf2zh example.pdf -li en` | +| `-lo` | [Target language](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#languages) | `pdf2zh example.pdf -lo zh` | +| `-s` | [Translation service](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#services) | `pdf2zh example.pdf -s deepl` | +| `-t` | [Multi-threads](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#threads) | `pdf2zh example.pdf -t 1` | +| `-o` | Output dir | `pdf2zh example.pdf -o output` | +| `-f`, `-c` | [Exceptions](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#exceptions) | `pdf2zh example.pdf -f "(MS.*)"` | +| `-cp` | Compatibility Mode | `pdf2zh example.pdf --compatible` | +| `--share` | Public link | `pdf2zh -i --share` | +| `--authorized` | [Authorization](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#auth) | `pdf2zh -i --authorized 
users.txt [auth.html]` | +| `--prompt` | [Custom Prompt](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#prompt) | `pdf2zh --prompt [prompt.txt]` | +| `--onnx` | [Use Custom DocLayout-YOLO ONNX model] | `pdf2zh --onnx [onnx/model/path]` | +| `--serverport` | [Use Custom WebUI port] | `pdf2zh --serverport 7860` | +| `--dir` | [batch translate] | `pdf2zh --dir /path/to/translate/` | +| `--config` | [configuration file](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#cofig) | `pdf2zh --config /path/to/config/config.json` | +| `--serverport` | [custom gradio server port] | `pdf2zh --serverport 7860` | + +For detailed explanations, please refer to our document about [Advanced Usage](./docs/ADVANCED.md) for a full list of each option. + +

Secondary Development (APIs)

+ +For downstream applications, please refer to our document about [API Details](./docs/APIS.md) for further information about: + +- [Python API](./docs/APIS.md#api-python), how to use the program in other Python programs +- [HTTP API](./docs/APIS.md#api-http), how to communicate with a server with the program installed + +

TODOs

+ +- [ ] Parse layout with DocLayNet based models, [PaddleX](https://github.com/PaddlePaddle/PaddleX/blob/17cc27ac3842e7880ca4aad92358d3ef8555429a/paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py#L81), [PaperMage](https://github.com/allenai/papermage/blob/9cd4bb48cbedab45d0f7a455711438f1632abebe/README.md?plain=1#L102), [SAM2](https://github.com/facebookresearch/sam2) + +- [ ] Fix page rotation, table of contents, format of lists + +- [ ] Fix pixel formula in old papers + +- [ ] Async retry except KeyboardInterrupt + +- [ ] Knuth–Plass algorithm for western languages + +- [ ] Support non-PDF/A files + +- [ ] Plugins of [Zotero](https://github.com/zotero/zotero) and [Obsidian](https://github.com/obsidianmd/obsidian-releases) + +

Acknowledgements

+ +- Document merging: [PyMuPDF](https://github.com/pymupdf/PyMuPDF) + +- Document parsing: [Pdfminer.six](https://github.com/pdfminer/pdfminer.six) + +- Document extraction: [MinerU](https://github.com/opendatalab/MinerU) + +- Document Preview: [Gradio PDF](https://github.com/freddyaboulton/gradio-pdf) + +- Multi-threaded translation: [MathTranslate](https://github.com/SUSYUSTC/MathTranslate) + +- Layout parsing: [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO) + +- Document standard: [PDF Explained](https://zxyle.github.io/PDF-Explained/), [PDF Cheat Sheets](https://pdfa.org/resource/pdf-cheat-sheets/) + +- Multilingual Font: [Go Noto Universal](https://github.com/satbyy/go-noto-universal) + +

Contributors

+ + + + + +![Alt](https://repobeats.axiom.co/api/embed/dfa7583da5332a11468d686fbd29b92320a6a869.svg "Repobeats analytics image") + +

Star History

+ + + + + + Star History Chart + + diff --git a/docs/ADVANCED.md b/docs/ADVANCED.md index 513c975a..8ff4f89e 100644 --- a/docs/ADVANCED.md +++ b/docs/ADVANCED.md @@ -1,242 +1,242 @@ -[**Documentation**](https://github.com/Byaidu/PDFMathTranslate) > **Advanced Usage** _(current)_ - ---- - -

Table of Contents

- -- [Full / partial translation](#partial) -- [Specify source and target languages](#language) -- [Translate with different services](#services) -- [Translate wih exceptions](#exceptions) -- [Multi-threads](#threads) -- [Custom prompt](#prompt) - ---- - -

Full / partial translation

- -- Entire document - - ```bash - pdf2zh example.pdf - ``` - -- Part of the document - - ```bash - pdf2zh example.pdf -p 1-3,5 - ``` - -[⬆️ Back to top](#toc) - ---- - -

Specify source and target languages

- -See [Google Languages Codes](https://developers.google.com/admin-sdk/directory/v1/languages), [DeepL Languages Codes](https://developers.deepl.com/docs/resources/supported-languages) - -```bash -pdf2zh example.pdf -li en -lo ja -``` - -[⬆️ Back to top](#toc) - ---- - -

Translate with different services

- -We've provided a detailed table on the required [environment variables](https://chatgpt.com/share/6734a83d-9d48-800e-8a46-f57ca6e8bcb4) for each translation service. Make sure to set them before using the respective service. - -| **Translator** | **Service** | **Environment Variables** | **Default Values** | **Notes** | -|----------------------|----------------|-----------------------------------------------------------------------|----------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| **Google (Default)** | `google` | None | N/A | None | -| **Bing** | `bing` | None | N/A | None | -| **DeepL** | `deepl` | `DEEPL_AUTH_KEY` | `[Your Key]` | See [DeepL](https://support.deepl.com/hc/en-us/articles/360020695820-API-Key-for-DeepL-s-API) | -| **DeepLX** | `deeplx` | `DEEPLX_ENDPOINT` | `https://api.deepl.com/translate` | See [DeepLX](https://github.com/OwO-Network/DeepLX) | -| **Ollama** | `ollama` | `OLLAMA_HOST`, `OLLAMA_MODEL` | `http://127.0.0.1:11434`, `gemma2` | See [Ollama](https://github.com/ollama/ollama) | -| **Xinference** | `xinference` | `XINFERENCE_HOST`, `XINFERENCE_MODEL` | `http://127.0.0.1:9997`, `gemma-2-it` | See [Xinference](https://github.com/xorbitsai/inference) | -| **OpenAI** | `openai` | `OPENAI_BASE_URL`, `OPENAI_API_KEY`, `OPENAI_MODEL` | `https://api.openai.com/v1`, `[Your Key]`, `gpt-4o-mini` | See [OpenAI](https://platform.openai.com/docs/overview) | -| **AzureOpenAI** | `azure-openai` | `AZURE_OPENAI_BASE_URL`, `AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_MODEL` | `[Your Endpoint]`, `[Your Key]`, `gpt-4o-mini` | See [Azure OpenAI](https://learn.microsoft.com/zh-cn/azure/ai-services/openai/chatgpt-quickstart?tabs=command-line%2Cjavascript-keyless%2Ctypescript-keyless%2Cpython&pivots=programming-language-python) | -| **Zhipu** | `zhipu` | 
`ZHIPU_API_KEY`, `ZHIPU_MODEL` | `[Your Key]`, `glm-4-flash` | See [Zhipu](https://open.bigmodel.cn/dev/api/thirdparty-frame/openai-sdk) | -| **ModelScope** | `ModelScope` | `MODELSCOPE_API_KEY`, `MODELSCOPE_MODEL` | `[Your Key]`, `Qwen/Qwen2.5-Coder-32B-Instruct` | See [ModelScope](https://www.modelscope.cn/docs/model-service/API-Inference/intro) | -| **Silicon** | `silicon` | `SILICON_API_KEY`, `SILICON_MODEL` | `[Your Key]`, `Qwen/Qwen2.5-7B-Instruct` | See [SiliconCloud](https://docs.siliconflow.cn/quickstart) | -| **Gemini** | `gemini` | `GEMINI_API_KEY`, `GEMINI_MODEL` | `[Your Key]`, `gemini-1.5-flash` | See [Gemini](https://ai.google.dev/gemini-api/docs/openai) | -| **Azure** | `azure` | `AZURE_ENDPOINT`, `AZURE_API_KEY` | `https://api.translator.azure.cn`, `[Your Key]` | See [Azure](https://docs.azure.cn/en-us/ai-services/translator/text-translation-overview) | -| **Tencent** | `tencent` | `TENCENTCLOUD_SECRET_ID`, `TENCENTCLOUD_SECRET_KEY` | `[Your ID]`, `[Your Key]` | See [Tencent](https://www.tencentcloud.com/products/tmt?from_qcintl=122110104) | -| **Dify** | `dify` | `DIFY_API_URL`, `DIFY_API_KEY` | `[Your DIFY URL]`, `[Your Key]` | See [Dify](https://github.com/langgenius/dify),Three variables, lang_out, lang_in, and text, need to be defined in Dify's workflow input. 
| -| **AnythingLLM** | `anythingllm` | `AnythingLLM_URL`, `AnythingLLM_APIKEY` | `[Your AnythingLLM URL]`, `[Your Key]` | See [anything-llm](https://github.com/Mintplex-Labs/anything-llm) | -|**Argos Translate**|`argos`| | |See [argos-translate](https://github.com/argosopentech/argos-translate)| -|**Grok**|`grok`| `GORK_API_KEY`, `GORK_MODEL` | `[Your GORK_API_KEY]`, `grok-2-1212` |See [Grok](https://docs.x.ai/docs/overview)| -|**Groq**|`groq`| `GROQ_API_KEY`, `GROQ_MODEL` | `[Your GROQ_API_KEY]`, `llama-3-3-70b-versatile` |See [Groq](https://console.groq.com/docs/models)| -|**DeepSeek**|`deepseek`| `DEEPSEEK_API_KEY`, `DEEPSEEK_MODEL` | `[Your DEEPSEEK_API_KEY]`, `deepseek-chat` |See [DeepSeek](https://www.deepseek.com/)| -|**OpenAI-Liked**|`openai-liked`| `OPENAILIKE_BASE_URL`, `OPENAILIKE_API_KEY`, `OPENAILIKE_MODEL` | `url`, `[Your Key]`, `model name` | None | - -For large language models that are compatible with the OpenAI API but not listed in the table above, you can set environment variables using the same method outlined for OpenAI in the table. - -Use `-s service` or `-s service:model` to specify service: - -```bash -pdf2zh example.pdf -s openai:gpt-4o-mini -``` - -Or specify model with environment variables: - -```bash -set OPENAI_MODEL=gpt-4o-mini -pdf2zh example.pdf -s openai -``` - -For PowerShell user: -```shell -$env:OPENAI_MODEL = gpt-4o-mini -pdf2zh example.pdf -s openai -``` - -[⬆️ Back to top](#toc) - ---- - -

Translate wih exceptions

- -Use regex to specify formula fonts and characters that need to be preserved: - -```bash -pdf2zh example.pdf -f "(CM[^RT].*|MS.*|.*Ital)" -c "(\(|\||\)|\+|=|\d|[\u0080-\ufaff])" -``` - -Preserve `Latex`, `Mono`, `Code`, `Italic`, `Symbol` and `Math` fonts by default: - -```bash -pdf2zh example.pdf -f "(CM[^R]|MS.M|XY|MT|BL|RM|EU|LA|RS|LINE|LCIRCLE|TeX-|rsfs|txsy|wasy|stmary|.*Mono|.*Code|.*Ital|.*Sym|.*Math)" -``` - -[⬆️ Back to top](#toc) - ---- - -

Multi-threads

- -Use `-t` to specify how many threads to use in translation: - -```bash -pdf2zh example.pdf -t 1 -``` - -[⬆️ Back to top](#toc) - ---- - -

Custom prompt

- -Use `--prompt` to specify which prompt to use in llm: - -```bash -pdf2zh example.pdf --prompt prompt.txt -``` - -example prompt.txt - -``` -[ - { - "role": "system", - "content": "You are a professional,authentic machine translation engine.", - }, - { - "role": "user", - "content": "Translate the following markdown source text to ${lang_out}. Keep the formula notation {{v*}} unchanged. Output translation directly without any additional text.\nSource Text: ${text}\nTranslated Text:", - }, -] -``` - -In custom prompt file, there are three variables can be used. -|**variables**|**comment**| -|-|-| -|`lang_in`|input language| -|`lang_out`|output language| -|`text`|text need to be translated| - -[⬆️ Back to top](#toc) - ---- - -

Authorization

- -Use `--authorized` to specify which user to use Web UI and custom the login page: - -```bash -pdf2zh example.pdf --authorized users.txt auth.html -``` - -example users.txt -Each line contains two elements, username, and password, separated by a comma. - -``` -admin,123456 -user1,password1 -user2,abc123 -guest,guest123 -test,test123 -``` - -example auth.html - -```html - - - - Simple HTML - - -

Hello, World!

-

Welcome to my simple HTML page.

- - -``` - -[⬆️ Back to top](#toc) - ---- - -

Custom configuration file

- -Use `--config` to specify which file to configure the PDFMathTranslate: - -```bash -pdf2zh example.pdf --config config.json -``` - -```bash -pdf2zh -i --config config.json -``` - -example config.json -```json -{ - "USE_MODELSCOPE": "0", - "PDF2ZH_LANG_FROM": "English", - "PDF2ZH_LANG_TO": "Simplified Chinese", - "NOTO_FONT_PATH": "/app/SourceHanSerifCN-Regular.ttf", - "translators": [ - { - "name": "deeplx", - "envs": { - "DEEPLX_ENDPOINT": "http://localhost:1188/translate/", - "DEEPLX_ACCESS_TOKEN": null - } - }, - { - "name": "ollama", - "envs": { - "OLLAMA_HOST": "http://127.0.0.1:11434", - "OLLAMA_MODEL": "gemma2" - } - } - ] -} -``` -By default, the config file is saved in the `~/.config/PDFMathTranslate/config.json`. The program will start by reading the contents of config.json, and after that it will read the contents of the environment variables. When an environment variable is available, the contents of the environment variable are used first and the file is updated. - -[⬆️ Back to top](#toc) - +[**Documentation**](https://github.com/Byaidu/PDFMathTranslate) > **Advanced Usage** _(current)_ + +--- + +

Table of Contents

+ +- [Full / partial translation](#partial) +- [Specify source and target languages](#language) +- [Translate with different services](#services) +- [Translate with exceptions](#exceptions) +- [Multi-threads](#threads) +- [Custom prompt](#prompt) + +--- + +

Full / partial translation

+ +- Entire document + + ```bash + pdf2zh example.pdf + ``` + +- Part of the document + + ```bash + pdf2zh example.pdf -p 1-3,5 + ``` + +[⬆️ Back to top](#toc) + +--- + +

Specify source and target languages

+ +See [Google Languages Codes](https://developers.google.com/admin-sdk/directory/v1/languages), [DeepL Languages Codes](https://developers.deepl.com/docs/resources/supported-languages) + +```bash +pdf2zh example.pdf -li en -lo ja +``` + +[⬆️ Back to top](#toc) + +--- + +

Translate with different services

+ +We've provided a detailed table on the required [environment variables](https://chatgpt.com/share/6734a83d-9d48-800e-8a46-f57ca6e8bcb4) for each translation service. Make sure to set them before using the respective service. + +| **Translator** | **Service** | **Environment Variables** | **Default Values** | **Notes** | +|----------------------|----------------|-----------------------------------------------------------------------|----------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **Google (Default)** | `google` | None | N/A | None | +| **Bing** | `bing` | None | N/A | None | +| **DeepL** | `deepl` | `DEEPL_AUTH_KEY` | `[Your Key]` | See [DeepL](https://support.deepl.com/hc/en-us/articles/360020695820-API-Key-for-DeepL-s-API) | +| **DeepLX** | `deeplx` | `DEEPLX_ENDPOINT` | `https://api.deepl.com/translate` | See [DeepLX](https://github.com/OwO-Network/DeepLX) | +| **Ollama** | `ollama` | `OLLAMA_HOST`, `OLLAMA_MODEL` | `http://127.0.0.1:11434`, `gemma2` | See [Ollama](https://github.com/ollama/ollama) | +| **Xinference** | `xinference` | `XINFERENCE_HOST`, `XINFERENCE_MODEL` | `http://127.0.0.1:9997`, `gemma-2-it` | See [Xinference](https://github.com/xorbitsai/inference) | +| **OpenAI** | `openai` | `OPENAI_BASE_URL`, `OPENAI_API_KEY`, `OPENAI_MODEL` | `https://api.openai.com/v1`, `[Your Key]`, `gpt-4o-mini` | See [OpenAI](https://platform.openai.com/docs/overview) | +| **AzureOpenAI** | `azure-openai` | `AZURE_OPENAI_BASE_URL`, `AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_MODEL` | `[Your Endpoint]`, `[Your Key]`, `gpt-4o-mini` | See [Azure OpenAI](https://learn.microsoft.com/zh-cn/azure/ai-services/openai/chatgpt-quickstart?tabs=command-line%2Cjavascript-keyless%2Ctypescript-keyless%2Cpython&pivots=programming-language-python) | +| **Zhipu** | `zhipu` | 
`ZHIPU_API_KEY`, `ZHIPU_MODEL` | `[Your Key]`, `glm-4-flash` | See [Zhipu](https://open.bigmodel.cn/dev/api/thirdparty-frame/openai-sdk) | +| **ModelScope** | `ModelScope` | `MODELSCOPE_API_KEY`, `MODELSCOPE_MODEL` | `[Your Key]`, `Qwen/Qwen2.5-Coder-32B-Instruct` | See [ModelScope](https://www.modelscope.cn/docs/model-service/API-Inference/intro) | +| **Silicon** | `silicon` | `SILICON_API_KEY`, `SILICON_MODEL` | `[Your Key]`, `Qwen/Qwen2.5-7B-Instruct` | See [SiliconCloud](https://docs.siliconflow.cn/quickstart) | +| **Gemini** | `gemini` | `GEMINI_API_KEY`, `GEMINI_MODEL` | `[Your Key]`, `gemini-1.5-flash` | See [Gemini](https://ai.google.dev/gemini-api/docs/openai) | +| **Azure** | `azure` | `AZURE_ENDPOINT`, `AZURE_API_KEY` | `https://api.translator.azure.cn`, `[Your Key]` | See [Azure](https://docs.azure.cn/en-us/ai-services/translator/text-translation-overview) | +| **Tencent** | `tencent` | `TENCENTCLOUD_SECRET_ID`, `TENCENTCLOUD_SECRET_KEY` | `[Your ID]`, `[Your Key]` | See [Tencent](https://www.tencentcloud.com/products/tmt?from_qcintl=122110104) | +| **Dify** | `dify` | `DIFY_API_URL`, `DIFY_API_KEY` | `[Your DIFY URL]`, `[Your Key]` | See [Dify](https://github.com/langgenius/dify),Three variables, lang_out, lang_in, and text, need to be defined in Dify's workflow input. 
| +| **AnythingLLM** | `anythingllm` | `AnythingLLM_URL`, `AnythingLLM_APIKEY` | `[Your AnythingLLM URL]`, `[Your Key]` | See [anything-llm](https://github.com/Mintplex-Labs/anything-llm) | +|**Argos Translate**|`argos`| | |See [argos-translate](https://github.com/argosopentech/argos-translate)| +|**Grok**|`grok`| `GORK_API_KEY`, `GORK_MODEL` | `[Your GORK_API_KEY]`, `grok-2-1212` |See [Grok](https://docs.x.ai/docs/overview)| +|**Groq**|`groq`| `GROQ_API_KEY`, `GROQ_MODEL` | `[Your GROQ_API_KEY]`, `llama-3-3-70b-versatile` |See [Groq](https://console.groq.com/docs/models)| +|**DeepSeek**|`deepseek`| `DEEPSEEK_API_KEY`, `DEEPSEEK_MODEL` | `[Your DEEPSEEK_API_KEY]`, `deepseek-chat` |See [DeepSeek](https://www.deepseek.com/)| +|**OpenAI-Liked**|`openai-liked`| `OPENAILIKE_BASE_URL`, `OPENAILIKE_API_KEY`, `OPENAILIKE_MODEL` | `url`, `[Your Key]`, `model name` | None | + +For large language models that are compatible with the OpenAI API but not listed in the table above, you can set environment variables using the same method outlined for OpenAI in the table. + +Use `-s service` or `-s service:model` to specify service: + +```bash +pdf2zh example.pdf -s openai:gpt-4o-mini +``` + +Or specify model with environment variables: + +```bash +set OPENAI_MODEL=gpt-4o-mini +pdf2zh example.pdf -s openai +``` + +For PowerShell user: +```shell +$env:OPENAI_MODEL = gpt-4o-mini +pdf2zh example.pdf -s openai +``` + +[⬆️ Back to top](#toc) + +--- + +

Translate with exceptions

+ +Use regex to specify formula fonts and characters that need to be preserved: + +```bash +pdf2zh example.pdf -f "(CM[^RT].*|MS.*|.*Ital)" -c "(\(|\||\)|\+|=|\d|[\u0080-\ufaff])" +``` + +Preserve `Latex`, `Mono`, `Code`, `Italic`, `Symbol` and `Math` fonts by default: + +```bash +pdf2zh example.pdf -f "(CM[^R]|MS.M|XY|MT|BL|RM|EU|LA|RS|LINE|LCIRCLE|TeX-|rsfs|txsy|wasy|stmary|.*Mono|.*Code|.*Ital|.*Sym|.*Math)" +``` + +[⬆️ Back to top](#toc) + +--- + +

Multi-threads

+ +Use `-t` to specify how many threads to use in translation: + +```bash +pdf2zh example.pdf -t 1 +``` + +[⬆️ Back to top](#toc) + +--- + +

Custom prompt

+ +Use `--prompt` to specify which prompt to use with the LLM: + +```bash +pdf2zh example.pdf --prompt prompt.txt +``` + +example prompt.txt + +``` +[ + { + "role": "system", + "content": "You are a professional,authentic machine translation engine.", + }, + { + "role": "user", + "content": "Translate the following markdown source text to ${lang_out}. Keep the formula notation {{v*}} unchanged. Output translation directly without any additional text.\nSource Text: ${text}\nTranslated Text:", + }, +] +``` + +In a custom prompt file, three variables can be used. +|**variables**|**comment**| +|-|-| +|`lang_in`|input language| +|`lang_out`|output language| +|`text`|text to be translated| + +[⬆️ Back to top](#toc) + +--- + +

Authorization

+ +Use `--authorized` to specify which users may use the Web UI and to customize the login page: + +```bash +pdf2zh example.pdf --authorized users.txt auth.html +``` + +example users.txt +Each line contains two elements, a username and a password, separated by a comma. + +``` +admin,123456 +user1,password1 +user2,abc123 +guest,guest123 +test,test123 +``` + +example auth.html + +```html + + + + Simple HTML + + +

Hello, World!

+

Welcome to my simple HTML page.

+ + +``` + +[⬆️ Back to top](#toc) + +--- + +

Custom configuration file

+ +Use `--config` to specify which file to configure the PDFMathTranslate: + +```bash +pdf2zh example.pdf --config config.json +``` + +```bash +pdf2zh -i --config config.json +``` + +example config.json +```json +{ + "USE_MODELSCOPE": "0", + "PDF2ZH_LANG_FROM": "English", + "PDF2ZH_LANG_TO": "Simplified Chinese", + "NOTO_FONT_PATH": "/app/SourceHanSerifCN-Regular.ttf", + "translators": [ + { + "name": "deeplx", + "envs": { + "DEEPLX_ENDPOINT": "http://localhost:1188/translate/", + "DEEPLX_ACCESS_TOKEN": null + } + }, + { + "name": "ollama", + "envs": { + "OLLAMA_HOST": "http://127.0.0.1:11434", + "OLLAMA_MODEL": "gemma2" + } + } + ] +} +``` +By default, the config file is saved in the `~/.config/PDFMathTranslate/config.json`. The program will start by reading the contents of config.json, and after that it will read the contents of the environment variables. When an environment variable is available, the contents of the environment variable are used first and the file is updated. + +[⬆️ Back to top](#toc) + --- \ No newline at end of file diff --git a/docs/README_ja-JP.md b/docs/README_ja-JP.md index 203d5fdf..9f89662d 100644 --- a/docs/README_ja-JP.md +++ b/docs/README_ja-JP.md @@ -1,389 +1,389 @@ -
- -[English](../README.md) | [简体中文](README_zh-CN.md) | [繁體中文](README_zh-TW.md) | 日本語 - -PDF2ZH - -

PDFMathTranslate

- -

- - - - - - - - - - - - - - - - - - - - -

- -Byaidu%2FPDFMathTranslate | Trendshift - -
- -科学 PDF 文書の翻訳およびバイリンガル比較ツール - -- 📊 数式、チャート、目次、注釈を保持 *([プレビュー](#preview))* -- 🌐 [複数の言語](#language) と [多様な翻訳サービス](#services) をサポート -- 🤖 [コマンドラインツール](#usage)、[インタラクティブユーザーインターフェース](#gui)、および [Docker](#docker) を提供 - -フィードバックは [GitHub Issues](https://github.com/Byaidu/PDFMathTranslate/issues)、[Telegram グループ](https://t.me/+Z9_SgnxmsmA5NzBl) または [QQ グループ](https://qm.qq.com/q/DixZCxQej0) でお気軽にどうぞ - -

最近の更新

- -- [2024年11月26日] CLIがオンラインファイルをサポートするようになりました *(by [@reycn](https://github.com/reycn))* -- [2024年11月24日] 依存関係のサイズを削減するために [ONNX](https://github.com/onnx/onnx) サポートを追加しました *(by [@Wybxc](https://github.com/Wybxc))* -- [2024年11月23日] 🌟 [公共サービス](#demo) がオンラインになりました! *(by [@Byaidu](https://github.com/Byaidu))* -- [2024年11月23日] ウェブボットを防ぐためのファイアウォールを追加しました *(by [@Byaidu](https://github.com/Byaidu))* -- [2024年11月22日] GUIがイタリア語をサポートし、改善されました *(by [@Byaidu](https://github.com/Byaidu), [@reycn](https://github.com/reycn))* -- [2024年11月22日] デプロイされたサービスを他の人と共有できるようになりました *(by [@Zxis233](https://github.com/Zxis233))* -- [2024年11月22日] Tencent翻訳をサポートしました *(by [@hellofinch](https://github.com/hellofinch))* -- [2024年11月21日] GUIがバイリンガルドキュメントのダウンロードをサポートするようになりました *(by [@reycn](https://github.com/reycn))* -- [2024年11月20日] 🌟 [デモ](#demo) がオンラインになりました! *(by [@reycn](https://github.com/reycn))* - -

プレビュー

- -
- -
- -

公共サービス 🌟

- -### 無料サービス () - -インストールなしで [公共サービス](https://pdf2zh.com/) をオンラインで試すことができます。 - -### デモ - -インストールなしで [HuggingFace上のデモ](https://huggingface.co/spaces/reycn/PDFMathTranslate-Docker), [ModelScope上のデモ](https://www.modelscope.cn/studios/AI-ModelScope/PDFMathTranslate) を試すことができます。 -デモの計算リソースは限られているため、乱用しないようにしてください。 - -

インストールと使用方法

- -このプロジェクトを使用するための4つの方法を提供しています:[コマンドライン](#cmd)、[ポータブル](#portable)、[GUI](#gui)、および [Docker](#docker)。 - -pdf2zhの実行には追加モデル(`wybxc/DocLayout-YOLO-DocStructBench-onnx`)が必要です。このモデルはModelScopeでも見つけることができます。起動時にこのモデルのダウンロードに問題がある場合は、以下の環境変数を使用してください: - -```shell -set HF_ENDPOINT=https://hf-mirror.com -``` - -For PowerShell user: -```shell -$env:HF_ENDPOINT = https://hf-mirror.com -``` - -

方法1. コマンドライン

- - 1. Pythonがインストールされていること (バージョン3.8 <= バージョン <= 3.12) - 2. パッケージをインストールします: - - ```bash - pip install pdf2zh - ``` - - 3. 翻訳を実行し、[現在の作業ディレクトリ](https://chatgpt.com/share/6745ed36-9acc-800e-8a90-59204bd13444) にファイルを生成します: - - ```bash - pdf2zh document.pdf - ``` - -

方法2. ポータブル

- -Python環境を事前にインストールする必要はありません - -[setup.bat](https://raw.githubusercontent.com/Byaidu/PDFMathTranslate/refs/heads/main/script/setup.bat) をダウンロードしてダブルクリックして実行します - -

方法3. GUI

- -1. Pythonがインストールされていること (バージョン3.8 <= バージョン <= 3.12) -2. パッケージをインストールします: - - ```bash - pip install pdf2zh - ``` - -3. ブラウザで使用を開始します: - - ```bash - pdf2zh -i - ``` - -4. ブラウザが自動的に起動しない場合は、次のURLを開きます: - - ```bash - http://localhost:7860/ - ``` - - - -詳細については、[GUIのドキュメント](./README_GUI.md) を参照してください。 - -

方法4. Docker

- -1. プルして実行します: - - ```bash - docker pull byaidu/pdf2zh - docker run -d -p 7860:7860 byaidu/pdf2zh - ``` - -2. ブラウザで開きます: - - ``` - http://localhost:7860/ - ``` - -クラウドサービスでのDockerデプロイメント用: - -
- - Deploy - - Deploy to Koyeb - - Deploy on Zeabur - - Deploy to Koyeb -
- -

高度なオプション

- -コマンドラインで翻訳コマンドを実行し、現在の作業ディレクトリに翻訳されたドキュメント `example-mono.pdf` とバイリンガルドキュメント `example-dual.pdf` を生成します。デフォルトではGoogle翻訳サービスを使用します。More support translation services can find [HERE](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#services). - - -cmd - -以下の表に、参考のためにすべての高度なオプションをリストしました: - -| オプション | 機能 | 例 | -| -------- | ------- |------- | -| files | ローカルファイル | `pdf2zh ~/local.pdf` | -| links | オンラインファイル | `pdf2zh http://arxiv.org/paper.pdf` | -| `-i` | [GUIに入る](#gui) | `pdf2zh -i` | -| `-p` | [部分的なドキュメント翻訳](#partial) | `pdf2zh example.pdf -p 1` | -| `-li` | [ソース言語](#languages) | `pdf2zh example.pdf -li en` | -| `-lo` | [ターゲット言語](#languages) | `pdf2zh example.pdf -lo zh` | -| `-s` | [翻訳サービス](#services) | `pdf2zh example.pdf -s deepl` | -| `-t` | [マルチスレッド](#threads) | `pdf2zh example.pdf -t 1` | -| `-o` | 出力ディレクトリ | `pdf2zh example.pdf -o output` | -| `-f`, `-c` | [例外](#exceptions) | `pdf2zh example.pdf -f "(MS.*)"` | -| `--share` | [gradio公開リンクを取得] | `pdf2zh -i --share` | -| `--authorized` | [[ウェブ認証とカスタム認証ページの追加](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.)] | `pdf2zh -i --authorized users.txt [auth.html]` | -| `--prompt` | [カスタムビッグモデルのプロンプトを使用する] | `pdf2zh --prompt [prompt.txt]` | -| `--onnx` | [カスタムDocLayout-YOLO ONNXモデルの使用] | `pdf2zh --onnx [onnx/model/path]` | -| `--serverport` | [カスタムWebUIポートを使用する] | `pdf2zh --serverport 7860` | -| `--dir` | [batch translate] | `pdf2zh --dir /path/to/translate/` | -| `--config` | [configuration file](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#cofig) | `pdf2zh --config /path/to/config/config.json` | -| `--serverport` | [custom gradio server port] | `pdf2zh --serverport 7860` | - -

全文または部分的なドキュメント翻訳

- -- **全文翻訳** - -```bash -pdf2zh example.pdf -``` - -- **部分翻訳** - -```bash -pdf2zh example.pdf -p 1-3,5 -``` - -

ソース言語とターゲット言語を指定

- -[Google Languages Codes](https://developers.google.com/admin-sdk/directory/v1/languages)、[DeepL Languages Codes](https://developers.deepl.com/docs/resources/supported-languages) を参照してください - -```bash -pdf2zh example.pdf -li en -lo ja -``` - -

異なるサービスで翻訳

- -以下の表は、各翻訳サービスに必要な [環境変数](https://chatgpt.com/share/6734a83d-9d48-800e-8a46-f57ca6e8bcb4) を示しています。各サービスを使用する前に、これらの変数を設定してください。 - -|**Translator**|**Service**|**Environment Variables**|**Default Values**|**Notes**| -|-|-|-|-|-| -|**Google (Default)**|`google`|None|N/A|None| -|**Bing**|`bing`|None|N/A|None| -|**DeepL**|`deepl`|`DEEPL_AUTH_KEY`|`[Your Key]`|See [DeepL](https://support.deepl.com/hc/en-us/articles/360020695820-API-Key-for-DeepL-s-API)| -|**DeepLX**|`deeplx`|`DEEPLX_ENDPOINT`|`https://api.deepl.com/translate`|See [DeepLX](https://github.com/OwO-Network/DeepLX)| -|**Ollama**|`ollama`|`OLLAMA_HOST`, `OLLAMA_MODEL`|`http://127.0.0.1:11434`, `gemma2`|See [Ollama](https://github.com/ollama/ollama)| -|**OpenAI**|`openai`|`OPENAI_BASE_URL`, `OPENAI_API_KEY`, `OPENAI_MODEL`|`https://api.openai.com/v1`, `[Your Key]`, `gpt-4o-mini`|See [OpenAI](https://platform.openai.com/docs/overview)| -|**AzureOpenAI**|`azure-openai`|`AZURE_OPENAI_BASE_URL`, `AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_MODEL`|`[Your Endpoint]`, `[Your Key]`, `gpt-4o-mini`|See [Azure OpenAI](https://learn.microsoft.com/zh-cn/azure/ai-services/openai/chatgpt-quickstart?tabs=command-line%2Cjavascript-keyless%2Ctypescript-keyless%2Cpython&pivots=programming-language-python)| -|**Zhipu**|`zhipu`|`ZHIPU_API_KEY`, `ZHIPU_MODEL`|`[Your Key]`, `glm-4-flash`|See [Zhipu](https://open.bigmodel.cn/dev/api/thirdparty-frame/openai-sdk)| -| **ModelScope** | `ModelScope` |`MODELSCOPE_API_KEY`, `MODELSCOPE_MODEL`|`[Your Key]`, `Qwen/Qwen2.5-Coder-32B-Instruct`| See [ModelScope](https://www.modelscope.cn/docs/model-service/API-Inference/intro)| -|**Silicon**|`silicon`|`SILICON_API_KEY`, `SILICON_MODEL`|`[Your Key]`, `Qwen/Qwen2.5-7B-Instruct`|See [SiliconCloud](https://docs.siliconflow.cn/quickstart)| -|**Gemini**|`gemini`|`GEMINI_API_KEY`, `GEMINI_MODEL`|`[Your Key]`, `gemini-1.5-flash`|See [Gemini](https://ai.google.dev/gemini-api/docs/openai)| -|**Azure**|`azure`|`AZURE_ENDPOINT`, 
`AZURE_API_KEY`|`https://api.translator.azure.cn`, `[Your Key]`|See [Azure](https://docs.azure.cn/en-us/ai-services/translator/text-translation-overview)| -|**Tencent**|`tencent`|`TENCENTCLOUD_SECRET_ID`, `TENCENTCLOUD_SECRET_KEY`|`[Your ID]`, `[Your Key]`|See [Tencent](https://www.tencentcloud.com/products/tmt?from_qcintl=122110104)| -|**Dify**|`dify`|`DIFY_API_URL`, `DIFY_API_KEY`|`[Your DIFY URL]`, `[Your Key]`|See [Dify](https://github.com/langgenius/dify),Three variables, lang_out, lang_in, and text, need to be defined in Dify's workflow input.| -|**AnythingLLM**|`anythingllm`|`AnythingLLM_URL`, `AnythingLLM_APIKEY`|`[Your AnythingLLM URL]`, `[Your Key]`|See [anything-llm](https://github.com/Mintplex-Labs/anything-llm)| -|**Argos Translate**|`argos`| | |See [argos-translate](https://github.com/argosopentech/argos-translate)| -|**Grok**|`grok`| `GORK_API_KEY`, `GORK_MODEL` | `[Your GORK_API_KEY]`, `grok-2-1212` |See [Grok](https://docs.x.ai/docs/overview)| -|**DeepSeek**|`deepseek`| `DEEPSEEK_API_KEY`, `DEEPSEEK_MODEL` | `[Your DEEPSEEK_API_KEY]`, `deepseek-chat` |See [DeepSeek](https://www.deepseek.com/)| -|**OpenAI-Liked**|`openai-liked`| `OPENAILIKE_BASE_URL`, `OPENAILIKE_API_KEY`, `OPENAILIKE_MODEL` | `url`, `[Your Key]`, `model name` | None | - -(need Japenese translation) -For large language models that are compatible with the OpenAI API but not listed in the table above, you can set environment variables using the same method outlined for OpenAI in the table. - -`-s service` または `-s service:model` を使用してサービスを指定します: - -```bash -pdf2zh example.pdf -s openai:gpt-4o-mini -``` - -または環境変数でモデルを指定します: - -```bash -set OPENAI_MODEL=gpt-4o-mini -pdf2zh example.pdf -s openai -``` - -For PowerShell user: -```shell -$env:OPENAI_MODEL = gpt-4o-mini -pdf2zh example.pdf -s openai -``` - -

例外を指定して翻訳

- -正規表現を使用して保持する必要がある数式フォントと文字を指定します: - -```bash -pdf2zh example.pdf -f "(CM[^RT].*|MS.*|.*Ital)" -c "(\(|\||\)|\+|=|\d|[\u0080-\ufaff])" -``` - -デフォルトで `Latex`、`Mono`、`Code`、`Italic`、`Symbol` および `Math` フォントを保持します: - -```bash -pdf2zh example.pdf -f "(CM[^R]|MS.M|XY|MT|BL|RM|EU|LA|RS|LINE|LCIRCLE|TeX-|rsfs|txsy|wasy|stmary|.*Mono|.*Code|.*Ital|.*Sym|.*Math)" -``` - -

スレッド数を指定

- -`-t` を使用して翻訳に使用するスレッド数を指定します: - -```bash -pdf2zh example.pdf -t 1 -``` - -

カスタム プロンプト

- -`--prompt`を使用して、LLMで使用するプロンプトを指定します: - -```bash -pdf2zh example.pdf -pr prompt.txt -``` - - -`prompt.txt`の例: - -```txt -[ - { - "role": "system", - "content": "You are a professional,authentic machine translation engine.", - }, - { - "role": "user", - "content": "Translate the following markdown source text to ${lang_out}. Keep the formula notation {{v*}} unchanged. Output translation directly without any additional text.\nSource Text: ${text}\nTranslated Text:", - }, -] -``` - - -カスタムプロンプトファイルでは、以下の3つの変数が使用できます。 - -|**変数**|**内容**| -|-|-| -|`lang_in`|ソース言語| -|`lang_out`|ターゲット言語| -|`text`|翻訳するテキスト| - -

API

- -### Python - -```python -from pdf2zh import translate, translate_stream - -params = {"lang_in": "en", "lang_out": "zh", "service": "google", "thread": 4} -file_mono, file_dual = translate(files=["example.pdf"], **params)[0] -with open("example.pdf", "rb") as f: - stream_mono, stream_dual = translate_stream(stream=f.read(), **params) -``` - -### HTTP - -```bash -pip install pdf2zh[backend] -pdf2zh --flask -pdf2zh --celery worker -``` - -```bash -curl http://localhost:11008/v1/translate -F "file=@example.pdf" -F "data={\"lang_in\":\"en\",\"lang_out\":\"zh\",\"service\":\"google\",\"thread\":4}" -{"id":"d9894125-2f4e-45ea-9d93-1a9068d2045a"} - -curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a -{"info":{"n":13,"total":506},"state":"PROGRESS"} - -curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a -{"state":"SUCCESS"} - -curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a/mono --output example-mono.pdf - -curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a/dual --output example-dual.pdf - -curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a -X DELETE -``` - -

謝辞

- -- ドキュメントのマージ:[PyMuPDF](https://github.com/pymupdf/PyMuPDF) - -- ドキュメントの解析:[Pdfminer.six](https://github.com/pdfminer/pdfminer.six) - -- ドキュメントの抽出:[MinerU](https://github.com/opendatalab/MinerU) - -- ドキュメントプレビュー:[Gradio PDF](https://github.com/freddyaboulton/gradio-pdf) - -- マルチスレッド翻訳:[MathTranslate](https://github.com/SUSYUSTC/MathTranslate) - -- レイアウト解析:[DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO) - -- ドキュメント標準:[PDF Explained](https://zxyle.github.io/PDF-Explained/)、[PDF Cheat Sheets](https://pdfa.org/resource/pdf-cheat-sheets/) - -- 多言語フォント:[Go Noto Universal](https://github.com/satbyy/go-noto-universal) - -

貢献者

- - - - - -![Alt](https://repobeats.axiom.co/api/embed/dfa7583da5332a11468d686fbd29b92320a6a869.svg "Repobeats analytics image") - -

スター履歴

- - - - - - Star History Chart - - +
+ +[English](../README.md) | [简体中文](README_zh-CN.md) | [繁體中文](README_zh-TW.md) | 日本語 + +PDF2ZH + +

PDFMathTranslate

+ +

+ + + + + + + + + + + + + + + + + + + + +

+ +Byaidu%2FPDFMathTranslate | Trendshift + +
+ +科学 PDF 文書の翻訳およびバイリンガル比較ツール + +- 📊 数式、チャート、目次、注釈を保持 *([プレビュー](#preview))* +- 🌐 [複数の言語](#language) と [多様な翻訳サービス](#services) をサポート +- 🤖 [コマンドラインツール](#usage)、[インタラクティブユーザーインターフェース](#gui)、および [Docker](#docker) を提供 + +フィードバックは [GitHub Issues](https://github.com/Byaidu/PDFMathTranslate/issues)、[Telegram グループ](https://t.me/+Z9_SgnxmsmA5NzBl) または [QQ グループ](https://qm.qq.com/q/DixZCxQej0) でお気軽にどうぞ + +

最近の更新

+ +- [2024年11月26日] CLIがオンラインファイルをサポートするようになりました *(by [@reycn](https://github.com/reycn))* +- [2024年11月24日] 依存関係のサイズを削減するために [ONNX](https://github.com/onnx/onnx) サポートを追加しました *(by [@Wybxc](https://github.com/Wybxc))* +- [2024年11月23日] 🌟 [公共サービス](#demo) がオンラインになりました! *(by [@Byaidu](https://github.com/Byaidu))* +- [2024年11月23日] ウェブボットを防ぐためのファイアウォールを追加しました *(by [@Byaidu](https://github.com/Byaidu))* +- [2024年11月22日] GUIがイタリア語をサポートし、改善されました *(by [@Byaidu](https://github.com/Byaidu), [@reycn](https://github.com/reycn))* +- [2024年11月22日] デプロイされたサービスを他の人と共有できるようになりました *(by [@Zxis233](https://github.com/Zxis233))* +- [2024年11月22日] Tencent翻訳をサポートしました *(by [@hellofinch](https://github.com/hellofinch))* +- [2024年11月21日] GUIがバイリンガルドキュメントのダウンロードをサポートするようになりました *(by [@reycn](https://github.com/reycn))* +- [2024年11月20日] 🌟 [デモ](#demo) がオンラインになりました! *(by [@reycn](https://github.com/reycn))* + +

プレビュー

+ +
+ +
+ +

公共サービス 🌟

+ +### 無料サービス () + +インストールなしで [公共サービス](https://pdf2zh.com/) をオンラインで試すことができます。 + +### デモ + +インストールなしで [HuggingFace上のデモ](https://huggingface.co/spaces/reycn/PDFMathTranslate-Docker), [ModelScope上のデモ](https://www.modelscope.cn/studios/AI-ModelScope/PDFMathTranslate) を試すことができます。 +デモの計算リソースは限られているため、乱用しないようにしてください。 + +

インストールと使用方法

+ +このプロジェクトを使用するための4つの方法を提供しています:[コマンドライン](#cmd)、[ポータブル](#portable)、[GUI](#gui)、および [Docker](#docker)。 + +pdf2zhの実行には追加モデル(`wybxc/DocLayout-YOLO-DocStructBench-onnx`)が必要です。このモデルはModelScopeでも見つけることができます。起動時にこのモデルのダウンロードに問題がある場合は、以下の環境変数を使用してください: + +```shell +set HF_ENDPOINT=https://hf-mirror.com +``` + +For PowerShell user: +```shell +$env:HF_ENDPOINT = https://hf-mirror.com +``` + +

方法1. コマンドライン

+ + 1. Pythonがインストールされていること (バージョン3.8 <= バージョン <= 3.12) + 2. パッケージをインストールします: + + ```bash + pip install pdf2zh + ``` + + 3. 翻訳を実行し、[現在の作業ディレクトリ](https://chatgpt.com/share/6745ed36-9acc-800e-8a90-59204bd13444) にファイルを生成します: + + ```bash + pdf2zh document.pdf + ``` + +

方法2. ポータブル

+ +Python環境を事前にインストールする必要はありません + +[setup.bat](https://raw.githubusercontent.com/Byaidu/PDFMathTranslate/refs/heads/main/script/setup.bat) をダウンロードしてダブルクリックして実行します + +

方法3. GUI

+ +1. Pythonがインストールされていること (バージョン3.8 <= バージョン <= 3.12) +2. パッケージをインストールします: + + ```bash + pip install pdf2zh + ``` + +3. ブラウザで使用を開始します: + + ```bash + pdf2zh -i + ``` + +4. ブラウザが自動的に起動しない場合は、次のURLを開きます: + + ```bash + http://localhost:7860/ + ``` + + + +詳細については、[GUIのドキュメント](./README_GUI.md) を参照してください。 + +

方法4. Docker

+ +1. プルして実行します: + + ```bash + docker pull byaidu/pdf2zh + docker run -d -p 7860:7860 byaidu/pdf2zh + ``` + +2. ブラウザで開きます: + + ``` + http://localhost:7860/ + ``` + +クラウドサービスでのDockerデプロイメント用: + +
+ + Deploy + + Deploy to Koyeb + + Deploy on Zeabur + + Deploy to Koyeb +
+ +

高度なオプション

+ +コマンドラインで翻訳コマンドを実行し、現在の作業ディレクトリに翻訳されたドキュメント `example-mono.pdf` とバイリンガルドキュメント `example-dual.pdf` を生成します。デフォルトではGoogle翻訳サービスを使用します。More support translation services can find [HERE](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#services). + + +cmd + +以下の表に、参考のためにすべての高度なオプションをリストしました: + +| オプション | 機能 | 例 | +| -------- | ------- |------- | +| files | ローカルファイル | `pdf2zh ~/local.pdf` | +| links | オンラインファイル | `pdf2zh http://arxiv.org/paper.pdf` | +| `-i` | [GUIに入る](#gui) | `pdf2zh -i` | +| `-p` | [部分的なドキュメント翻訳](#partial) | `pdf2zh example.pdf -p 1` | +| `-li` | [ソース言語](#languages) | `pdf2zh example.pdf -li en` | +| `-lo` | [ターゲット言語](#languages) | `pdf2zh example.pdf -lo zh` | +| `-s` | [翻訳サービス](#services) | `pdf2zh example.pdf -s deepl` | +| `-t` | [マルチスレッド](#threads) | `pdf2zh example.pdf -t 1` | +| `-o` | 出力ディレクトリ | `pdf2zh example.pdf -o output` | +| `-f`, `-c` | [例外](#exceptions) | `pdf2zh example.pdf -f "(MS.*)"` | +| `--share` | [gradio公開リンクを取得] | `pdf2zh -i --share` | +| `--authorized` | [[ウェブ認証とカスタム認証ページの追加](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.)] | `pdf2zh -i --authorized users.txt [auth.html]` | +| `--prompt` | [カスタムビッグモデルのプロンプトを使用する] | `pdf2zh --prompt [prompt.txt]` | +| `--onnx` | [カスタムDocLayout-YOLO ONNXモデルの使用] | `pdf2zh --onnx [onnx/model/path]` | +| `--serverport` | [カスタムWebUIポートを使用する] | `pdf2zh --serverport 7860` | +| `--dir` | [batch translate] | `pdf2zh --dir /path/to/translate/` | +| `--config` | [configuration file](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#cofig) | `pdf2zh --config /path/to/config/config.json` | +| `--serverport` | [custom gradio server port] | `pdf2zh --serverport 7860` | + +

全文または部分的なドキュメント翻訳

+ +- **全文翻訳** + +```bash +pdf2zh example.pdf +``` + +- **部分翻訳** + +```bash +pdf2zh example.pdf -p 1-3,5 +``` + +

ソース言語とターゲット言語を指定

+ +[Google Languages Codes](https://developers.google.com/admin-sdk/directory/v1/languages)、[DeepL Languages Codes](https://developers.deepl.com/docs/resources/supported-languages) を参照してください + +```bash +pdf2zh example.pdf -li en -lo ja +``` + +

異なるサービスで翻訳

+ +以下の表は、各翻訳サービスに必要な [環境変数](https://chatgpt.com/share/6734a83d-9d48-800e-8a46-f57ca6e8bcb4) を示しています。各サービスを使用する前に、これらの変数を設定してください。 + +|**Translator**|**Service**|**Environment Variables**|**Default Values**|**Notes**| +|-|-|-|-|-| +|**Google (Default)**|`google`|None|N/A|None| +|**Bing**|`bing`|None|N/A|None| +|**DeepL**|`deepl`|`DEEPL_AUTH_KEY`|`[Your Key]`|See [DeepL](https://support.deepl.com/hc/en-us/articles/360020695820-API-Key-for-DeepL-s-API)| +|**DeepLX**|`deeplx`|`DEEPLX_ENDPOINT`|`https://api.deepl.com/translate`|See [DeepLX](https://github.com/OwO-Network/DeepLX)| +|**Ollama**|`ollama`|`OLLAMA_HOST`, `OLLAMA_MODEL`|`http://127.0.0.1:11434`, `gemma2`|See [Ollama](https://github.com/ollama/ollama)| +|**OpenAI**|`openai`|`OPENAI_BASE_URL`, `OPENAI_API_KEY`, `OPENAI_MODEL`|`https://api.openai.com/v1`, `[Your Key]`, `gpt-4o-mini`|See [OpenAI](https://platform.openai.com/docs/overview)| +|**AzureOpenAI**|`azure-openai`|`AZURE_OPENAI_BASE_URL`, `AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_MODEL`|`[Your Endpoint]`, `[Your Key]`, `gpt-4o-mini`|See [Azure OpenAI](https://learn.microsoft.com/zh-cn/azure/ai-services/openai/chatgpt-quickstart?tabs=command-line%2Cjavascript-keyless%2Ctypescript-keyless%2Cpython&pivots=programming-language-python)| +|**Zhipu**|`zhipu`|`ZHIPU_API_KEY`, `ZHIPU_MODEL`|`[Your Key]`, `glm-4-flash`|See [Zhipu](https://open.bigmodel.cn/dev/api/thirdparty-frame/openai-sdk)| +| **ModelScope** | `ModelScope` |`MODELSCOPE_API_KEY`, `MODELSCOPE_MODEL`|`[Your Key]`, `Qwen/Qwen2.5-Coder-32B-Instruct`| See [ModelScope](https://www.modelscope.cn/docs/model-service/API-Inference/intro)| +|**Silicon**|`silicon`|`SILICON_API_KEY`, `SILICON_MODEL`|`[Your Key]`, `Qwen/Qwen2.5-7B-Instruct`|See [SiliconCloud](https://docs.siliconflow.cn/quickstart)| +|**Gemini**|`gemini`|`GEMINI_API_KEY`, `GEMINI_MODEL`|`[Your Key]`, `gemini-1.5-flash`|See [Gemini](https://ai.google.dev/gemini-api/docs/openai)| +|**Azure**|`azure`|`AZURE_ENDPOINT`, 
`AZURE_API_KEY`|`https://api.translator.azure.cn`, `[Your Key]`|See [Azure](https://docs.azure.cn/en-us/ai-services/translator/text-translation-overview)| +|**Tencent**|`tencent`|`TENCENTCLOUD_SECRET_ID`, `TENCENTCLOUD_SECRET_KEY`|`[Your ID]`, `[Your Key]`|See [Tencent](https://www.tencentcloud.com/products/tmt?from_qcintl=122110104)| +|**Dify**|`dify`|`DIFY_API_URL`, `DIFY_API_KEY`|`[Your DIFY URL]`, `[Your Key]`|See [Dify](https://github.com/langgenius/dify),Three variables, lang_out, lang_in, and text, need to be defined in Dify's workflow input.| +|**AnythingLLM**|`anythingllm`|`AnythingLLM_URL`, `AnythingLLM_APIKEY`|`[Your AnythingLLM URL]`, `[Your Key]`|See [anything-llm](https://github.com/Mintplex-Labs/anything-llm)| +|**Argos Translate**|`argos`| | |See [argos-translate](https://github.com/argosopentech/argos-translate)| +|**Grok**|`grok`| `GORK_API_KEY`, `GORK_MODEL` | `[Your GORK_API_KEY]`, `grok-2-1212` |See [Grok](https://docs.x.ai/docs/overview)| +|**DeepSeek**|`deepseek`| `DEEPSEEK_API_KEY`, `DEEPSEEK_MODEL` | `[Your DEEPSEEK_API_KEY]`, `deepseek-chat` |See [DeepSeek](https://www.deepseek.com/)| +|**OpenAI-Liked**|`openai-liked`| `OPENAILIKE_BASE_URL`, `OPENAILIKE_API_KEY`, `OPENAILIKE_MODEL` | `url`, `[Your Key]`, `model name` | None | + +(need Japenese translation) +For large language models that are compatible with the OpenAI API but not listed in the table above, you can set environment variables using the same method outlined for OpenAI in the table. + +`-s service` または `-s service:model` を使用してサービスを指定します: + +```bash +pdf2zh example.pdf -s openai:gpt-4o-mini +``` + +または環境変数でモデルを指定します: + +```bash +set OPENAI_MODEL=gpt-4o-mini +pdf2zh example.pdf -s openai +``` + +For PowerShell user: +```shell +$env:OPENAI_MODEL = gpt-4o-mini +pdf2zh example.pdf -s openai +``` + +

例外を指定して翻訳

+ +正規表現を使用して保持する必要がある数式フォントと文字を指定します: + +```bash +pdf2zh example.pdf -f "(CM[^RT].*|MS.*|.*Ital)" -c "(\(|\||\)|\+|=|\d|[\u0080-\ufaff])" +``` + +デフォルトで `Latex`、`Mono`、`Code`、`Italic`、`Symbol` および `Math` フォントを保持します: + +```bash +pdf2zh example.pdf -f "(CM[^R]|MS.M|XY|MT|BL|RM|EU|LA|RS|LINE|LCIRCLE|TeX-|rsfs|txsy|wasy|stmary|.*Mono|.*Code|.*Ital|.*Sym|.*Math)" +``` + +

スレッド数を指定

+ +`-t` を使用して翻訳に使用するスレッド数を指定します: + +```bash +pdf2zh example.pdf -t 1 +``` + +

カスタム プロンプト

+ +`--prompt`を使用して、LLMで使用するプロンプトを指定します: + +```bash +pdf2zh example.pdf -pr prompt.txt +``` + + +`prompt.txt`の例: + +```txt +[ + { + "role": "system", + "content": "You are a professional,authentic machine translation engine.", + }, + { + "role": "user", + "content": "Translate the following markdown source text to ${lang_out}. Keep the formula notation {{v*}} unchanged. Output translation directly without any additional text.\nSource Text: ${text}\nTranslated Text:", + }, +] +``` + + +カスタムプロンプトファイルでは、以下の3つの変数が使用できます。 + +|**変数**|**内容**| +|-|-| +|`lang_in`|ソース言語| +|`lang_out`|ターゲット言語| +|`text`|翻訳するテキスト| + +

API

+ +### Python + +```python +from pdf2zh import translate, translate_stream + +params = {"lang_in": "en", "lang_out": "zh", "service": "google", "thread": 4} +file_mono, file_dual = translate(files=["example.pdf"], **params)[0] +with open("example.pdf", "rb") as f: + stream_mono, stream_dual = translate_stream(stream=f.read(), **params) +``` + +### HTTP + +```bash +pip install pdf2zh[backend] +pdf2zh --flask +pdf2zh --celery worker +``` + +```bash +curl http://localhost:11008/v1/translate -F "file=@example.pdf" -F "data={\"lang_in\":\"en\",\"lang_out\":\"zh\",\"service\":\"google\",\"thread\":4}" +{"id":"d9894125-2f4e-45ea-9d93-1a9068d2045a"} + +curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a +{"info":{"n":13,"total":506},"state":"PROGRESS"} + +curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a +{"state":"SUCCESS"} + +curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a/mono --output example-mono.pdf + +curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a/dual --output example-dual.pdf + +curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a -X DELETE +``` + +

謝辞

+ +- ドキュメントのマージ:[PyMuPDF](https://github.com/pymupdf/PyMuPDF) + +- ドキュメントの解析:[Pdfminer.six](https://github.com/pdfminer/pdfminer.six) + +- ドキュメントの抽出:[MinerU](https://github.com/opendatalab/MinerU) + +- ドキュメントプレビュー:[Gradio PDF](https://github.com/freddyaboulton/gradio-pdf) + +- マルチスレッド翻訳:[MathTranslate](https://github.com/SUSYUSTC/MathTranslate) + +- レイアウト解析:[DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO) + +- ドキュメント標準:[PDF Explained](https://zxyle.github.io/PDF-Explained/)、[PDF Cheat Sheets](https://pdfa.org/resource/pdf-cheat-sheets/) + +- 多言語フォント:[Go Noto Universal](https://github.com/satbyy/go-noto-universal) + +

貢献者

+ + + + + +![Alt](https://repobeats.axiom.co/api/embed/dfa7583da5332a11468d686fbd29b92320a6a869.svg "Repobeats analytics image") + +

スター履歴

+ + + + + + Star History Chart + + diff --git a/docs/README_zh-CN.md b/docs/README_zh-CN.md index 3622467f..2b7d1b40 100644 --- a/docs/README_zh-CN.md +++ b/docs/README_zh-CN.md @@ -1,388 +1,388 @@ -
- -[English](../README.md) | 简体中文 | [繁體中文](README_zh-TW.md) | [日本語](README_ja-JP.md) - -PDF2ZH - -

PDFMathTranslate

- -

- - - - - - - - - - - - - - - - - - - - -

- -Byaidu%2FPDFMathTranslate | Trendshift - -
- -科学 PDF 文档翻译及双语对照工具 - -- 📊 保留公式、图表、目录和注释 *([预览效果](#preview))* -- 🌐 支持 [多种语言](#language) 和 [诸多翻译服务](#services) -- 🤖 提供 [命令行工具](#usage),[图形交互界面](#gui),以及 [容器化部署](#docker) - -欢迎在 [GitHub Issues](https://github.com/Byaidu/PDFMathTranslate/issues)、[Telegram 用户群](https://t.me/+Z9_SgnxmsmA5NzBl) 或 [QQ 用户群](https://qm.qq.com/q/DixZCxQej0) 中提供反馈 - -有关如何贡献的详细信息,请查阅 [贡献指南](https://github.com/Byaidu/PDFMathTranslate/wiki/Contribution-Guide---%E8%B4%A1%E7%8C%AE%E6%8C%87%E5%8D%97) - -

近期更新

- -- [Dec. 24 2024] 翻译功能支持接入 [Xinference](https://github.com/xorbitsai/inference) 运行的本地 LLM _(by [@imClumsyPanda](https://github.com/imClumsyPanda))_ -- [Nov. 26 2024] CLI 现在已支持(多个)在线 PDF 文件 *(by [@reycn](https://github.com/reycn))* -- [Nov. 24 2024] 为降低依赖大小,提供 [ONNX](https://github.com/onnx/onnx) 支持 *(by [@Wybxc](https://github.com/Wybxc))* -- [Nov. 23 2024] 🌟 [免费公共服务](#demo) 上线! *(by [@Byaidu](https://github.com/Byaidu))* -- [Nov. 23 2024] 防止网页爬虫的防火墙 *(by [@Byaidu](https://github.com/Byaidu))* -- [Nov. 22 2024] 图形用户界面现已支持意大利语,并获得了一些更新 *(by [@Byaidu](https://github.com/Byaidu), [@reycn](https://github.com/reycn))* -- [Nov. 22 2024] 现在你可以将自己部署的服务分享给朋友了 *(by [@Zxis233](https://github.com/Zxis233))* -- [Nov. 22 2024] 支持腾讯翻译 *(by [@hellofinch](https://github.com/hellofinch))* -- [Nov. 21 2024] 图形用户界面现在支持下载双语文档 *(by [@reycn](https://github.com/reycn))* -- [Nov. 20 2024] 🌟 提供了 [在线演示](#demo)! *(by [@reycn](https://github.com/reycn))* - -

效果预览

- -
- -
- -

在线演示 🌟

- -### 免费服务 () - -你可以立即尝试 [免费公共服务](https://pdf2zh.com/) 而无需安装 - -### 在线演示 - -你可以立即尝试 [在 HuggingFace 上的在线演示](https://huggingface.co/spaces/reycn/PDFMathTranslate-Docker)和[魔搭的在线演示](https://www.modelscope.cn/studios/AI-ModelScope/PDFMathTranslate)而无需安装 -请注意,演示的计算资源有限,因此请避免滥用 - -

安装和使用

- -我们提供了四种使用该项目的方法:[命令行工具](#cmd)、[便携式安装](#portable)、[图形交互界面](#gui) 和 [容器化部署](#docker). - -pdf2zh的运行依赖于额外模型(`wybxc/DocLayout-YOLO-DocStructBench-onnx`),该模型在魔搭上也可以找到。如果你在启动时下载该模型遇到问题,请使用如下环境变量: -```shell -set HF_ENDPOINT=https://hf-mirror.com -``` - -如使用 PowerShell,请使用如下方法设置环境变量: -```shell -$env:HF_ENDPOINT = https://hf-mirror.com -``` - -

方法一、命令行工具

- - 1. 确保安装了版本大于 3.8 且小于 3.12 的 Python - 2. 安装此程序: - - ```bash - pip install pdf2zh - ``` - - 3. 执行翻译,生成文件位于 [当前工作目录](https://chatgpt.com/share/6745ed36-9acc-800e-8a90-59204bd13444): - - ```bash - pdf2zh document.pdf - ``` - -

方法二、便携式安装

- -无需预先安装 Python 环境 - -下载 [setup.bat](https://raw.githubusercontent.com/Byaidu/PDFMathTranslate/refs/heads/main/script/setup.bat) 并双击运行 - -

方法三、图形交互界面

- -1. 确保安装了版本大于 3.8 且小于 3.12 的 Python -2. 安装此程序: - - ```bash - pip install pdf2zh - ``` - -3. 开始在浏览器中使用: - - ```bash - pdf2zh -i - ``` - -4. 如果您的浏览器没有自动启动并跳转,请用浏览器打开: - - ```bash - http://localhost:7860/ - ``` - - - -查看 [documentation for GUI](/README_GUI.md) 获取细节说明 - -

方法四、容器化部署

- -1. 拉取 Docker 镜像并运行: - - ```bash - docker pull byaidu/pdf2zh - docker run -d -p 7860:7860 byaidu/pdf2zh - ``` - -2. 通过浏览器打开: - - ``` - http://localhost:7860/ - ``` - -用于在云服务上部署容器镜像: - -
- - Deploy - - Deploy to Koyeb - - Deploy on Zeabur - - Deploy to Koyeb -
- -

高级选项

- -在命令行中执行翻译命令,在当前工作目录下生成译文文档 `example-mono.pdf` 和双语对照文档 `example-dual.pdf`,默认使用 Google 翻译服务,更多支持的服务在[这里](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#services))。 - -cmd - -我们在下表中列出了所有高级选项,以供参考: - -| Option | Function | Example | -| -------- | ------- |------- | -| files | 本地文件 | `pdf2zh ~/local.pdf` | -| links | 在线文件 | `pdf2zh http://arxiv.org/paper.pdf` | -| `-i` | [进入图形界面](#gui) | `pdf2zh -i` | -| `-p` | [仅翻译部分文档](#partial) | `pdf2zh example.pdf -p 1` | -| `-li` | [源语言](#languages) | `pdf2zh example.pdf -li en` | -| `-lo` | [目标语言](#languages) | `pdf2zh example.pdf -lo zh` | -| `-s` | [指定翻译服务](#services) | `pdf2zh example.pdf -s deepl` | -| `-t` | [多线程](#threads) | `pdf2zh example.pdf -t 1` | -| `-o` | 输出目录 | `pdf2zh example.pdf -o output` | -| `-f`, `-c` | [例外规则](#exceptions) | `pdf2zh example.pdf -f "(MS.*)"` | -| `--share` | [获取 gradio 公开链接] | `pdf2zh -i --share` | -| `--authorized` | [[添加网页认证和自定义认证页](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.)] | `pdf2zh -i --authorized users.txt [auth.html]` | -| `--prompt` | [使用自定义的大模型prompt] | `pdf2zh --prompt [prompt.txt]` | -| `--onnx` | [使用自定义的 DocLayout-YOLO ONNX 模型] | `pdf2zh --onnx [onnx/model/path]` | -| `--serverport` | [使用自定义的 WebUI 端口] | `pdf2zh --serverport 7860` | -| `--dir` | [文件夹翻译] | `pdf2zh --dir /path/to/translate/` | -| `--serverport` | [自定义端口号] | `pdf2zh --serverport 7860` | -| `--config` | [持久化定义配置文件](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#cofig) | `pdf2zh --config /path/to/config/config.json` | - - -

全文或部分文档翻译

- -- **全文翻译** - -```bash -pdf2zh example.pdf -``` - -- **部分翻译** - -```bash -pdf2zh example.pdf -p 1-3,5 -``` - -

指定源语言和目标语言

- -参考 [Google Languages Codes](https://developers.google.com/admin-sdk/directory/v1/languages), [DeepL Languages Codes](https://developers.deepl.com/docs/resources/supported-languages) - -```bash -pdf2zh example.pdf -li en -lo ja -``` - -

使用不同的翻译服务

- -下表列出了每个翻译服务所需的 [环境变量](https://chatgpt.com/share/6734a83d-9d48-800e-8a46-f57ca6e8bcb4),在使用相应服务之前,请确保已设置这些变量 - -|**Translator**|**Service**|**Environment Variables**|**Default Values**|**Notes**| -|-|-|-|-|-| -|**Google (Default)**|`google`|None|N/A|None| -|**Bing**|`bing`|None|N/A|None| -|**DeepL**|`deepl`|`DEEPL_AUTH_KEY`|`[Your Key]`|See [DeepL](https://support.deepl.com/hc/en-us/articles/360020695820-API-Key-for-DeepL-s-API)| -|**DeepLX**|`deeplx`|`DEEPLX_ENDPOINT`|`https://api.deepl.com/translate`|See [DeepLX](https://github.com/OwO-Network/DeepLX)| -|**Ollama**|`ollama`|`OLLAMA_HOST`, `OLLAMA_MODEL`|`http://127.0.0.1:11434`, `gemma2`|See [Ollama](https://github.com/ollama/ollama)| -|**OpenAI**|`openai`|`OPENAI_BASE_URL`, `OPENAI_API_KEY`, `OPENAI_MODEL`|`https://api.openai.com/v1`, `[Your Key]`, `gpt-4o-mini`|See [OpenAI](https://platform.openai.com/docs/overview)| -|**AzureOpenAI**|`azure-openai`|`AZURE_OPENAI_BASE_URL`, `AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_MODEL`|`[Your Endpoint]`, `[Your Key]`, `gpt-4o-mini`|See [Azure OpenAI](https://learn.microsoft.com/zh-cn/azure/ai-services/openai/chatgpt-quickstart?tabs=command-line%2Cjavascript-keyless%2Ctypescript-keyless%2Cpython&pivots=programming-language-python)| -|**Zhipu**|`zhipu`|`ZHIPU_API_KEY`, `ZHIPU_MODEL`|`[Your Key]`, `glm-4-flash`|See [Zhipu](https://open.bigmodel.cn/dev/api/thirdparty-frame/openai-sdk)| -| **ModelScope** | `ModelScope` |`MODELSCOPE_API_KEY`, `MODELSCOPE_MODEL`|`[Your Key]`, `Qwen/Qwen2.5-Coder-32B-Instruct`| See [ModelScope](https://www.modelscope.cn/docs/model-service/API-Inference/intro)| -|**Silicon**|`silicon`|`SILICON_API_KEY`, `SILICON_MODEL`|`[Your Key]`, `Qwen/Qwen2.5-7B-Instruct`|See [SiliconCloud](https://docs.siliconflow.cn/quickstart)| -|**Gemini**|`gemini`|`GEMINI_API_KEY`, `GEMINI_MODEL`|`[Your Key]`, `gemini-1.5-flash`|See [Gemini](https://ai.google.dev/gemini-api/docs/openai)| -|**Azure**|`azure`|`AZURE_ENDPOINT`, `AZURE_API_KEY`|`https://api.translator.azure.cn`, 
`[Your Key]`|See [Azure](https://docs.azure.cn/en-us/ai-services/translator/text-translation-overview)| -|**Tencent**|`tencent`|`TENCENTCLOUD_SECRET_ID`, `TENCENTCLOUD_SECRET_KEY`|`[Your ID]`, `[Your Key]`|See [Tencent](https://www.tencentcloud.com/products/tmt?from_qcintl=122110104)| -|**Dify**|`dify`|`DIFY_API_URL`, `DIFY_API_KEY`|`[Your DIFY URL]`, `[Your Key]`|See [Dify](https://github.com/langgenius/dify),Three variables, lang_out, lang_in, and text, need to be defined in Dify's workflow input.| -|**AnythingLLM**|`anythingllm`|`AnythingLLM_URL`, `AnythingLLM_APIKEY`|`[Your AnythingLLM URL]`, `[Your Key]`|See [anything-llm](https://github.com/Mintplex-Labs/anything-llm)| -|**Argos Translate**|`argos`| | |See [argos-translate](https://github.com/argosopentech/argos-translate)| -|**Grok**|`grok`| `GORK_API_KEY`, `GORK_MODEL` | `[Your GORK_API_KEY]`, `grok-2-1212` |See [Grok](https://docs.x.ai/docs/overview)| -|**DeepSeek**|`deepseek`| `DEEPSEEK_API_KEY`, `DEEPSEEK_MODEL` | `[Your DEEPSEEK_API_KEY]`, `deepseek-chat` |See [DeepSeek](https://www.deepseek.com/)| -|**OpenAI-Liked**|`openai-liked`| `OPENAILIKE_BASE_URL`, `OPENAILIKE_API_KEY`, `OPENAILIKE_MODEL` | `url`, `[Your Key]`, `model name` | None | - -对于未在上述表格中的,并且兼容 OpenAI api 的大语言模型,可使用表格中的 OpenAI 的方式进行环境变量的设置。 - -使用 `-s service` 或 `-s service:model` 指定翻译服务: - -```bash -pdf2zh example.pdf -s openai:gpt-4o-mini -``` - -或者使用环境变量指定模型: - -```bash -set OPENAI_MODEL=gpt-4o-mini -pdf2zh example.pdf -s openai -``` - -对于 PowerShell 用户,请使用如下方式设置环境变量指定模型: -```shell -$env:OPENAI_MODEL = gpt-4o-mini -pdf2zh example.pdf -s openai -``` - -

指定例外规则

- -使用正则表达式指定需保留的公式字体与字符: - -```bash -pdf2zh example.pdf -f "(CM[^RT].*|MS.*|.*Ital)" -c "(\(|\||\)|\+|=|\d|[\u0080-\ufaff])" -``` - -默认保留 `Latex`, `Mono`, `Code`, `Italic`, `Symbol` 以及 `Math` 字体: - -```bash -pdf2zh example.pdf -f "(CM[^R]|MS.M|XY|MT|BL|RM|EU|LA|RS|LINE|LCIRCLE|TeX-|rsfs|txsy|wasy|stmary|.*Mono|.*Code|.*Ital|.*Sym|.*Math)" -``` - -

指定线程数量

- -使用 `-t` 指定翻译时使用的线程数量: - -```bash -pdf2zh example.pdf -t 1 -``` -

自定义大模型prompt

- -使用 `--prompt` 指定使用大模型翻译时使用的 Prompt 文件。 - -```bash -pdf2zh example.pdf -pr prompt.txt -``` - - -示例 `prompt.txt` 文件 - -``` -[ - { - "role": "system", - "content": "You are a professional,authentic machine translation engine.", - }, - { - "role": "user", - "content": "Translate the following markdown source text to ${lang_out}. Keep the formula notation {{v*}} unchanged. Output translation directly without any additional text.\nSource Text: ${text}\nTranslated Text:", - }, -] -``` - - -自定义 Prompt 文件中,可以使用三个内置变量用来传递参数。 -|**变量名**|**说明**| -|-|-| -|`lang_in`|输入的语言| -|`lang_out`|输出的语言| -|`text`|需要翻译的文本| - -

API

- -### Python - -```python -from pdf2zh import translate, translate_stream - -params = {"lang_in": "en", "lang_out": "zh", "service": "google", "thread": 4} -file_mono, file_dual = translate(files=["example.pdf"], **params)[0] -with open("example.pdf", "rb") as f: - stream_mono, stream_dual = translate_stream(stream=f.read(), **params) -``` - -### HTTP - -```bash -pip install pdf2zh[backend] -pdf2zh --flask -pdf2zh --celery worker -``` - -```bash -curl http://localhost:11008/v1/translate -F "file=@example.pdf" -F "data={\"lang_in\":\"en\",\"lang_out\":\"zh\",\"service\":\"google\",\"thread\":4}" -{"id":"d9894125-2f4e-45ea-9d93-1a9068d2045a"} - -curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a -{"info":{"n":13,"total":506},"state":"PROGRESS"} - -curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a -{"state":"SUCCESS"} - -curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a/mono --output example-mono.pdf - -curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a/dual --output example-dual.pdf - -curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a -X DELETE -``` - -

致谢

- -- 文档合并:[PyMuPDF](https://github.com/pymupdf/PyMuPDF) - -- 文档解析:[Pdfminer.six](https://github.com/pdfminer/pdfminer.six) - -- 文档提取:[MinerU](https://github.com/opendatalab/MinerU) - -- 文档预览:[Gradio PDF](https://github.com/freddyaboulton/gradio-pdf) - -- 多线程翻译:[MathTranslate](https://github.com/SUSYUSTC/MathTranslate) - -- 布局解析:[DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO) - -- 文档标准:[PDF Explained](https://zxyle.github.io/PDF-Explained/), [PDF Cheat Sheets](https://pdfa.org/resource/pdf-cheat-sheets/) - -- 多语言字体:[Go Noto Universal](https://github.com/satbyy/go-noto-universal) - -

贡献者

- - - - - -![Alt](https://repobeats.axiom.co/api/embed/dfa7583da5332a11468d686fbd29b92320a6a869.svg "Repobeats analytics image") - -

星标历史

- - - - - - Star History Chart - - +
+ +[English](../README.md) | 简体中文 | [繁體中文](README_zh-TW.md) | [日本語](README_ja-JP.md) + +PDF2ZH + +

PDFMathTranslate

+ +

+ + + + + + + + + + + + + + + + + + + + +

+ +Byaidu%2FPDFMathTranslate | Trendshift + +
+ +科学 PDF 文档翻译及双语对照工具 + +- 📊 保留公式、图表、目录和注释 *([预览效果](#preview))* +- 🌐 支持 [多种语言](#language) 和 [诸多翻译服务](#services) +- 🤖 提供 [命令行工具](#usage),[图形交互界面](#gui),以及 [容器化部署](#docker) + +欢迎在 [GitHub Issues](https://github.com/Byaidu/PDFMathTranslate/issues)、[Telegram 用户群](https://t.me/+Z9_SgnxmsmA5NzBl) 或 [QQ 用户群](https://qm.qq.com/q/DixZCxQej0) 中提供反馈 + +有关如何贡献的详细信息,请查阅 [贡献指南](https://github.com/Byaidu/PDFMathTranslate/wiki/Contribution-Guide---%E8%B4%A1%E7%8C%AE%E6%8C%87%E5%8D%97) + +

近期更新

+ +- [Dec. 24 2024] 翻译功能支持接入 [Xinference](https://github.com/xorbitsai/inference) 运行的本地 LLM _(by [@imClumsyPanda](https://github.com/imClumsyPanda))_ +- [Nov. 26 2024] CLI 现在已支持(多个)在线 PDF 文件 *(by [@reycn](https://github.com/reycn))* +- [Nov. 24 2024] 为降低依赖大小,提供 [ONNX](https://github.com/onnx/onnx) 支持 *(by [@Wybxc](https://github.com/Wybxc))* +- [Nov. 23 2024] 🌟 [免费公共服务](#demo) 上线! *(by [@Byaidu](https://github.com/Byaidu))* +- [Nov. 23 2024] 防止网页爬虫的防火墙 *(by [@Byaidu](https://github.com/Byaidu))* +- [Nov. 22 2024] 图形用户界面现已支持意大利语,并获得了一些更新 *(by [@Byaidu](https://github.com/Byaidu), [@reycn](https://github.com/reycn))* +- [Nov. 22 2024] 现在你可以将自己部署的服务分享给朋友了 *(by [@Zxis233](https://github.com/Zxis233))* +- [Nov. 22 2024] 支持腾讯翻译 *(by [@hellofinch](https://github.com/hellofinch))* +- [Nov. 21 2024] 图形用户界面现在支持下载双语文档 *(by [@reycn](https://github.com/reycn))* +- [Nov. 20 2024] 🌟 提供了 [在线演示](#demo)! *(by [@reycn](https://github.com/reycn))* + +

效果预览

+ +
+ +
+ +

在线演示 🌟

+ +### 免费服务 () + +你可以立即尝试 [免费公共服务](https://pdf2zh.com/) 而无需安装 + +### 在线演示 + +你可以立即尝试 [在 HuggingFace 上的在线演示](https://huggingface.co/spaces/reycn/PDFMathTranslate-Docker)和[魔搭的在线演示](https://www.modelscope.cn/studios/AI-ModelScope/PDFMathTranslate)而无需安装 +请注意,演示的计算资源有限,因此请避免滥用 + +

安装和使用

+ +我们提供了四种使用该项目的方法:[命令行工具](#cmd)、[便携式安装](#portable)、[图形交互界面](#gui) 和 [容器化部署](#docker)。 + +pdf2zh的运行依赖于额外模型(`wybxc/DocLayout-YOLO-DocStructBench-onnx`),该模型在魔搭上也可以找到。如果你在启动时下载该模型遇到问题,请使用如下环境变量: +```shell +set HF_ENDPOINT=https://hf-mirror.com +``` + +如使用 PowerShell,请使用如下方法设置环境变量: +```shell +$env:HF_ENDPOINT = "https://hf-mirror.com" +``` + +

方法一、命令行工具

+ + 1. 确保安装了版本大于 3.8 且小于 3.12 的 Python + 2. 安装此程序: + + ```bash + pip install pdf2zh + ``` + + 3. 执行翻译,生成文件位于 [当前工作目录](https://chatgpt.com/share/6745ed36-9acc-800e-8a90-59204bd13444): + + ```bash + pdf2zh document.pdf + ``` + +

方法二、便携式安装

+ +无需预先安装 Python 环境 + +下载 [setup.bat](https://raw.githubusercontent.com/Byaidu/PDFMathTranslate/refs/heads/main/script/setup.bat) 并双击运行 + +

方法三、图形交互界面

+ +1. 确保安装了版本大于 3.8 且小于 3.12 的 Python +2. 安装此程序: + + ```bash + pip install pdf2zh + ``` + +3. 开始在浏览器中使用: + + ```bash + pdf2zh -i + ``` + +4. 如果您的浏览器没有自动启动并跳转,请用浏览器打开: + + ```bash + http://localhost:7860/ + ``` + + + +查看 [documentation for GUI](/README_GUI.md) 获取细节说明 + +

方法四、容器化部署

+ +1. 拉取 Docker 镜像并运行: + + ```bash + docker pull byaidu/pdf2zh + docker run -d -p 7860:7860 byaidu/pdf2zh + ``` + +2. 通过浏览器打开: + + ``` + http://localhost:7860/ + ``` + +用于在云服务上部署容器镜像: + + + +

高级选项

+ +在命令行中执行翻译命令,在当前工作目录下生成译文文档 `example-mono.pdf` 和双语对照文档 `example-dual.pdf`,默认使用 Google 翻译服务,更多支持的服务在[这里](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#services)。 + +cmd + +我们在下表中列出了所有高级选项,以供参考: + +| Option | Function | Example | +| -------- | ------- |------- | +| files | 本地文件 | `pdf2zh ~/local.pdf` | +| links | 在线文件 | `pdf2zh http://arxiv.org/paper.pdf` | +| `-i` | [进入图形界面](#gui) | `pdf2zh -i` | +| `-p` | [仅翻译部分文档](#partial) | `pdf2zh example.pdf -p 1` | +| `-li` | [源语言](#languages) | `pdf2zh example.pdf -li en` | +| `-lo` | [目标语言](#languages) | `pdf2zh example.pdf -lo zh` | +| `-s` | [指定翻译服务](#services) | `pdf2zh example.pdf -s deepl` | +| `-t` | [多线程](#threads) | `pdf2zh example.pdf -t 1` | +| `-o` | 输出目录 | `pdf2zh example.pdf -o output` | +| `-f`, `-c` | [例外规则](#exceptions) | `pdf2zh example.pdf -f "(MS.*)"` | +| `--share` | [获取 gradio 公开链接] | `pdf2zh -i --share` | +| `--authorized` | [[添加网页认证和自定义认证页](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md)] | `pdf2zh -i --authorized users.txt [auth.html]` | +| `--prompt` | [使用自定义的大模型prompt] | `pdf2zh --prompt [prompt.txt]` | +| `--onnx` | [使用自定义的 DocLayout-YOLO ONNX 模型] | `pdf2zh --onnx [onnx/model/path]` | +| `--serverport` | [使用自定义的 WebUI 端口] | `pdf2zh --serverport 7860` | +| `--dir` | [文件夹翻译] | `pdf2zh --dir /path/to/translate/` | +| `--serverport` | [自定义端口号] | `pdf2zh --serverport 7860` | +| `--config` | [持久化定义配置文件](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md#cofig) | `pdf2zh --config /path/to/config/config.json` | + + +

全文或部分文档翻译

+ +- **全文翻译** + +```bash +pdf2zh example.pdf +``` + +- **部分翻译** + +```bash +pdf2zh example.pdf -p 1-3,5 +``` + +

指定源语言和目标语言

+ +参考 [Google Languages Codes](https://developers.google.com/admin-sdk/directory/v1/languages), [DeepL Languages Codes](https://developers.deepl.com/docs/resources/supported-languages) + +```bash +pdf2zh example.pdf -li en -lo ja +``` + +

使用不同的翻译服务

+ +下表列出了每个翻译服务所需的 [环境变量](https://chatgpt.com/share/6734a83d-9d48-800e-8a46-f57ca6e8bcb4),在使用相应服务之前,请确保已设置这些变量 + +|**Translator**|**Service**|**Environment Variables**|**Default Values**|**Notes**| +|-|-|-|-|-| +|**Google (Default)**|`google`|None|N/A|None| +|**Bing**|`bing`|None|N/A|None| +|**DeepL**|`deepl`|`DEEPL_AUTH_KEY`|`[Your Key]`|See [DeepL](https://support.deepl.com/hc/en-us/articles/360020695820-API-Key-for-DeepL-s-API)| +|**DeepLX**|`deeplx`|`DEEPLX_ENDPOINT`|`https://api.deepl.com/translate`|See [DeepLX](https://github.com/OwO-Network/DeepLX)| +|**Ollama**|`ollama`|`OLLAMA_HOST`, `OLLAMA_MODEL`|`http://127.0.0.1:11434`, `gemma2`|See [Ollama](https://github.com/ollama/ollama)| +|**OpenAI**|`openai`|`OPENAI_BASE_URL`, `OPENAI_API_KEY`, `OPENAI_MODEL`|`https://api.openai.com/v1`, `[Your Key]`, `gpt-4o-mini`|See [OpenAI](https://platform.openai.com/docs/overview)| +|**AzureOpenAI**|`azure-openai`|`AZURE_OPENAI_BASE_URL`, `AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_MODEL`|`[Your Endpoint]`, `[Your Key]`, `gpt-4o-mini`|See [Azure OpenAI](https://learn.microsoft.com/zh-cn/azure/ai-services/openai/chatgpt-quickstart?tabs=command-line%2Cjavascript-keyless%2Ctypescript-keyless%2Cpython&pivots=programming-language-python)| +|**Zhipu**|`zhipu`|`ZHIPU_API_KEY`, `ZHIPU_MODEL`|`[Your Key]`, `glm-4-flash`|See [Zhipu](https://open.bigmodel.cn/dev/api/thirdparty-frame/openai-sdk)| +| **ModelScope** | `ModelScope` |`MODELSCOPE_API_KEY`, `MODELSCOPE_MODEL`|`[Your Key]`, `Qwen/Qwen2.5-Coder-32B-Instruct`| See [ModelScope](https://www.modelscope.cn/docs/model-service/API-Inference/intro)| +|**Silicon**|`silicon`|`SILICON_API_KEY`, `SILICON_MODEL`|`[Your Key]`, `Qwen/Qwen2.5-7B-Instruct`|See [SiliconCloud](https://docs.siliconflow.cn/quickstart)| +|**Gemini**|`gemini`|`GEMINI_API_KEY`, `GEMINI_MODEL`|`[Your Key]`, `gemini-1.5-flash`|See [Gemini](https://ai.google.dev/gemini-api/docs/openai)| +|**Azure**|`azure`|`AZURE_ENDPOINT`, `AZURE_API_KEY`|`https://api.translator.azure.cn`, 
`[Your Key]`|See [Azure](https://docs.azure.cn/en-us/ai-services/translator/text-translation-overview)| +|**Tencent**|`tencent`|`TENCENTCLOUD_SECRET_ID`, `TENCENTCLOUD_SECRET_KEY`|`[Your ID]`, `[Your Key]`|See [Tencent](https://www.tencentcloud.com/products/tmt?from_qcintl=122110104)| +|**Dify**|`dify`|`DIFY_API_URL`, `DIFY_API_KEY`|`[Your DIFY URL]`, `[Your Key]`|See [Dify](https://github.com/langgenius/dify),Three variables, lang_out, lang_in, and text, need to be defined in Dify's workflow input.| +|**AnythingLLM**|`anythingllm`|`AnythingLLM_URL`, `AnythingLLM_APIKEY`|`[Your AnythingLLM URL]`, `[Your Key]`|See [anything-llm](https://github.com/Mintplex-Labs/anything-llm)| +|**Argos Translate**|`argos`| | |See [argos-translate](https://github.com/argosopentech/argos-translate)| +|**Grok**|`grok`| `GORK_API_KEY`, `GORK_MODEL` | `[Your GORK_API_KEY]`, `grok-2-1212` |See [Grok](https://docs.x.ai/docs/overview)| +|**DeepSeek**|`deepseek`| `DEEPSEEK_API_KEY`, `DEEPSEEK_MODEL` | `[Your DEEPSEEK_API_KEY]`, `deepseek-chat` |See [DeepSeek](https://www.deepseek.com/)| +|**OpenAI-Liked**|`openai-liked`| `OPENAILIKE_BASE_URL`, `OPENAILIKE_API_KEY`, `OPENAILIKE_MODEL` | `url`, `[Your Key]`, `model name` | None | + +对于未在上述表格中的,并且兼容 OpenAI API 的大语言模型,可使用表格中的 OpenAI 的方式进行环境变量的设置。 + +使用 `-s service` 或 `-s service:model` 指定翻译服务: + +```bash +pdf2zh example.pdf -s openai:gpt-4o-mini +``` + +或者使用环境变量指定模型: + +```bash +set OPENAI_MODEL=gpt-4o-mini +pdf2zh example.pdf -s openai +``` + +对于 PowerShell 用户,请使用如下方式设置环境变量指定模型: +```shell +$env:OPENAI_MODEL = "gpt-4o-mini" +pdf2zh example.pdf -s openai +``` + +

指定例外规则

+ +使用正则表达式指定需保留的公式字体与字符: + +```bash +pdf2zh example.pdf -f "(CM[^RT].*|MS.*|.*Ital)" -c "(\(|\||\)|\+|=|\d|[\u0080-\ufaff])" +``` + +默认保留 `Latex`, `Mono`, `Code`, `Italic`, `Symbol` 以及 `Math` 字体: + +```bash +pdf2zh example.pdf -f "(CM[^R]|MS.M|XY|MT|BL|RM|EU|LA|RS|LINE|LCIRCLE|TeX-|rsfs|txsy|wasy|stmary|.*Mono|.*Code|.*Ital|.*Sym|.*Math)" +``` + +

指定线程数量

+ +使用 `-t` 指定翻译时使用的线程数量: + +```bash +pdf2zh example.pdf -t 1 +``` +

自定义大模型prompt

+ +使用 `--prompt` 指定使用大模型翻译时使用的 Prompt 文件。 + +```bash +pdf2zh example.pdf -pr prompt.txt +``` + + +示例 `prompt.txt` 文件 + +``` +[ + { + "role": "system", + "content": "You are a professional,authentic machine translation engine.", + }, + { + "role": "user", + "content": "Translate the following markdown source text to ${lang_out}. Keep the formula notation {{v*}} unchanged. Output translation directly without any additional text.\nSource Text: ${text}\nTranslated Text:", + }, +] +``` + + +自定义 Prompt 文件中,可以使用三个内置变量用来传递参数。 +|**变量名**|**说明**| +|-|-| +|`lang_in`|输入的语言| +|`lang_out`|输出的语言| +|`text`|需要翻译的文本| + +

API

+ +### Python + +```python +from pdf2zh import translate, translate_stream + +params = {"lang_in": "en", "lang_out": "zh", "service": "google", "thread": 4} +file_mono, file_dual = translate(files=["example.pdf"], **params)[0] +with open("example.pdf", "rb") as f: + stream_mono, stream_dual = translate_stream(stream=f.read(), **params) +``` + +### HTTP + +```bash +pip install pdf2zh[backend] +pdf2zh --flask +pdf2zh --celery worker +``` + +```bash +curl http://localhost:11008/v1/translate -F "file=@example.pdf" -F "data={\"lang_in\":\"en\",\"lang_out\":\"zh\",\"service\":\"google\",\"thread\":4}" +{"id":"d9894125-2f4e-45ea-9d93-1a9068d2045a"} + +curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a +{"info":{"n":13,"total":506},"state":"PROGRESS"} + +curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a +{"state":"SUCCESS"} + +curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a/mono --output example-mono.pdf + +curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a/dual --output example-dual.pdf + +curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a -X DELETE +``` + +

致谢

+ +- 文档合并:[PyMuPDF](https://github.com/pymupdf/PyMuPDF) + +- 文档解析:[Pdfminer.six](https://github.com/pdfminer/pdfminer.six) + +- 文档提取:[MinerU](https://github.com/opendatalab/MinerU) + +- 文档预览:[Gradio PDF](https://github.com/freddyaboulton/gradio-pdf) + +- 多线程翻译:[MathTranslate](https://github.com/SUSYUSTC/MathTranslate) + +- 布局解析:[DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO) + +- 文档标准:[PDF Explained](https://zxyle.github.io/PDF-Explained/), [PDF Cheat Sheets](https://pdfa.org/resource/pdf-cheat-sheets/) + +- 多语言字体:[Go Noto Universal](https://github.com/satbyy/go-noto-universal) + +

贡献者

+ + + + + +![Alt](https://repobeats.axiom.co/api/embed/dfa7583da5332a11468d686fbd29b92320a6a869.svg "Repobeats analytics image") + +

星标历史

+ + + + + + Star History Chart + + diff --git a/docs/README_zh-TW.md b/docs/README_zh-TW.md index 5480dc01..b3b233d0 100644 --- a/docs/README_zh-TW.md +++ b/docs/README_zh-TW.md @@ -1,366 +1,366 @@ -
- -[English](../README.md) | [简体中文](README_zh-CN.md) | 繁體中文 | [日本語](README_ja-JP.md) - -PDF2ZH - -

PDFMathTranslate

- -

- - - - - - - - - - - - - - - - - - - - -

- -Byaidu%2FPDFMathTranslate | Trendshift - -
- -科學 PDF 文件翻譯及雙語對照工具 - -- 📊 保留公式、圖表、目錄和註釋 *([預覽效果](#preview))* -- 🌐 支援 [多種語言](#language) 和 [諸多翻譯服務](#services) -- 🤖 提供 [命令列工具](#usage)、[圖形使用者介面](#gui),以及 [容器化部署](#docker) - -歡迎在 [GitHub Issues](https://github.com/Byaidu/PDFMathTranslate/issues)、[Telegram 使用者群](https://t.me/+Z9_SgnxmsmA5NzBl) 或 [QQ 使用者群](https://qm.qq.com/q/DixZCxQej0) 中提出回饋 - -如需瞭解如何貢獻的詳細資訊,請查閱 [貢獻指南](https://github.com/Byaidu/PDFMathTranslate/wiki/Contribution-Guide---%E8%B4%A1%E7%8C%AE%E6%8C%87%E5%8D%97) - -

近期更新

- -- [Dec. 24 2024] 翻譯功能支援接入由 [Xinference](https://github.com/xorbitsai/inference) 執行的本機 LLM _(by [@imClumsyPanda](https://github.com/imClumsyPanda))_ -- [Nov. 26 2024] CLI 現在已支援(多個)線上 PDF 檔 *(by [@reycn](https://github.com/reycn))* -- [Nov. 24 2024] 為了降低依賴大小,提供 [ONNX](https://github.com/onnx/onnx) 支援 *(by [@Wybxc](https://github.com/Wybxc))* -- [Nov. 23 2024] 🌟 [免費公共服務](#demo) 上線! *(by [@Byaidu](https://github.com/Byaidu))* -- [Nov. 23 2024] 新增防止網頁爬蟲的防火牆 *(by [@Byaidu](https://github.com/Byaidu))* -- [Nov. 22 2024] 圖形使用者介面現已支援義大利語並進行了一些更新 *(by [@Byaidu](https://github.com/Byaidu), [@reycn](https://github.com/reycn))* -- [Nov. 22 2024] 現在你可以將自己部署的服務分享給朋友 *(by [@Zxis233](https://github.com/Zxis233))* -- [Nov. 22 2024] 支援騰訊翻譯 *(by [@hellofinch](https://github.com/hellofinch))* -- [Nov. 21 2024] 圖形使用者介面現在支援下載雙語文件 *(by [@reycn](https://github.com/reycn))* -- [Nov. 20 2024] 🌟 提供了 [線上示範](#demo)! *(by [@reycn](https://github.com/reycn))* - -

效果預覽

- -
- -
- -

線上示範 🌟

- -### 免費服務 () - -你可以立即嘗試 [免費公共服務](https://pdf2zh.com/) 而無需安裝 - -### 線上示範 - -你可以直接在 [HuggingFace 上的線上示範](https://huggingface.co/spaces/reycn/PDFMathTranslate-Docker)和[魔搭的線上示範](https://www.modelscope.cn/studios/AI-ModelScope/PDFMathTranslate)進行嘗試,無需安裝。 -請注意,示範使用的運算資源有限,請勿濫用。 - -

安裝與使用

- -我們提供了四種使用此專案的方法:[命令列工具](#cmd)、[便攜式安裝](#portable)、[圖形使用者介面](#gui) 與 [容器化部署](#docker)。 - -pdf2zh 在執行時需要額外下載模型(`wybxc/DocLayout-YOLO-DocStructBench-onnx`),該模型也可在魔搭(ModelScope)上取得。如果在啟動時下載該模型時遇到問題,請使用如下環境變數: -```shell -set HF_ENDPOINT=https://hf-mirror.com -``` - -

方法一、命令列工具

- -1. 確保已安裝 Python 版本大於 3.8 且小於 3.12 -2. 安裝此程式: - - ```bash - pip install pdf2zh - ``` - -3. 執行翻譯,生成檔案位於 [目前工作目錄](https://chatgpt.com/share/6745ed36-9acc-800e-8a90-59204bd13444): - - ```bash - pdf2zh document.pdf - ``` - -

方法二、便攜式安裝

- -無需預先安裝 Python 環境 - -下載 [setup.bat](https://raw.githubusercontent.com/Byaidu/PDFMathTranslate/refs/heads/main/script/setup.bat) 並直接雙擊執行 - -

方法三、圖形使用者介面

- -1. 確保已安裝 Python 版本大於 3.8 且小於 3.12 -2. 安裝此程式: - - ```bash - pip install pdf2zh - ``` - -3. 在瀏覽器中啟動使用: - - ```bash - pdf2zh -i - ``` - -4. 如果您的瀏覽器沒有自動開啟並跳轉,請手動在瀏覽器開啟: - - ```bash - http://localhost:7860/ - ``` - - - -查看 [documentation for GUI](/README_GUI.md) 以獲取詳細說明 - -

方法四、容器化部署

- -1. 拉取 Docker 映像檔並執行: - - ```bash - docker pull byaidu/pdf2zh - docker run -d -p 7860:7860 byaidu/pdf2zh - ``` - -2. 透過瀏覽器開啟: - - ``` - http://localhost:7860/ - ``` - -用於在雲服務上部署容器映像檔: - - - -

高級選項

- -在命令列中執行翻譯指令,並在目前工作目錄下生成譯文檔案 `example-mono.pdf` 和雙語對照檔案 `example-dual.pdf`。預設使用 Google 翻譯服務。 - -cmd - -以下表格列出了所有高級選項,供參考: - -| Option | 功能 | 範例 | -| -------- | ------- |------- | -| files | 本機檔案 | `pdf2zh ~/local.pdf` | -| links | 線上檔案 | `pdf2zh http://arxiv.org/paper.pdf` | -| `-i` | [進入圖形介面](#gui) | `pdf2zh -i` | -| `-p` | [僅翻譯部分文件](#partial) | `pdf2zh example.pdf -p 1` | -| `-li` | [原文語言](#language) | `pdf2zh example.pdf -li en` | -| `-lo` | [目標語言](#language) | `pdf2zh example.pdf -lo zh` | -| `-s` | [指定翻譯服務](#services) | `pdf2zh example.pdf -s deepl` | -| `-t` | [多執行緒](#threads) | `pdf2zh example.pdf -t 1` | -| `-o` | 輸出目錄 | `pdf2zh example.pdf -o output` | -| `-f`, `-c` | [例外規則](#exceptions) | `pdf2zh example.pdf -f "(MS.*)"` | -| `--share` | [獲取 gradio 公開連結] | `pdf2zh -i --share` | -| `--authorized` | [[添加網頁認證及自訂認證頁面](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.)] | `pdf2zh -i --authorized users.txt [auth.html]` | -| `--prompt` | [使用自訂的大模型 Prompt] | `pdf2zh --prompt [prompt.txt]` | -| `--onnx` | [使用自訂的 DocLayout-YOLO ONNX 模型] | `pdf2zh --onnx [onnx/model/path]` | -| `--serverport` | [自訂 WebUI 埠號] | `pdf2zh --serverport 7860` | -| `--dir` | [資料夾翻譯] | `pdf2zh --dir /path/to/translate/` | - -

全文或部分文件翻譯

- -- **全文翻譯** - -```bash -pdf2zh example.pdf -``` - -- **部分翻譯** - -```bash -pdf2zh example.pdf -p 1-3,5 -``` - -

指定原文語言與目標語言

- -可參考 [Google 語言代碼](https://developers.google.com/admin-sdk/directory/v1/languages)、[DeepL 語言代碼](https://developers.deepl.com/docs/resources/supported-languages) - -```bash -pdf2zh example.pdf -li en -lo ja -``` - -

使用不同的翻譯服務

- -下表列出了每個翻譯服務所需的 [環境變數](https://chatgpt.com/share/6734a83d-9d48-800e-8a46-f57ca6e8bcb4)。在使用前,請先確保已設定好對應的變數。 - -|**Translator**|**Service**|**Environment Variables**|**Default Values**|**Notes**| -|-|-|-|-|-| -|**Google (Default)**|`google`|無|N/A|無| -|**Bing**|`bing`|無|N/A|無| -|**DeepL**|`deepl`|`DEEPL_AUTH_KEY`|`[Your Key]`|參閱 [DeepL](https://support.deepl.com/hc/en-us/articles/360020695820-API-Key-for-DeepL-s-API)| -|**DeepLX**|`deeplx`|`DEEPLX_ENDPOINT`|`https://api.deepl.com/translate`|參閱 [DeepLX](https://github.com/OwO-Network/DeepLX)| -|**Ollama**|`ollama`|`OLLAMA_HOST`, `OLLAMA_MODEL`|`http://127.0.0.1:11434`, `gemma2`|參閱 [Ollama](https://github.com/ollama/ollama)| -|**OpenAI**|`openai`|`OPENAI_BASE_URL`, `OPENAI_API_KEY`, `OPENAI_MODEL`|`https://api.openai.com/v1`, `[Your Key]`, `gpt-4o-mini`|參閱 [OpenAI](https://platform.openai.com/docs/overview)| -|**AzureOpenAI**|`azure-openai`|`AZURE_OPENAI_BASE_URL`, `AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_MODEL`|`[Your Endpoint]`, `[Your Key]`, `gpt-4o-mini`|參閱 [Azure OpenAI](https://learn.microsoft.com/zh-cn/azure/ai-services/openai/chatgpt-quickstart?tabs=command-line%2Cjavascript-keyless%2Ctypescript-keyless%2Cpython&pivots=programming-language-python)| -|**Zhipu**|`zhipu`|`ZHIPU_API_KEY`, `ZHIPU_MODEL`|`[Your Key]`, `glm-4-flash`|參閱 [Zhipu](https://open.bigmodel.cn/dev/api/thirdparty-frame/openai-sdk)| -| **ModelScope** | `ModelScope` |`MODELSCOPE_API_KEY`, `MODELSCOPE_MODEL`|`[Your Key]`, `Qwen/Qwen2.5-Coder-32B-Instruct`| 參閱 [ModelScope](https://www.modelscope.cn/docs/model-service/API-Inference/intro)| -|**Silicon**|`silicon`|`SILICON_API_KEY`, `SILICON_MODEL`|`[Your Key]`, `Qwen/Qwen2.5-7B-Instruct`|參閱 [SiliconCloud](https://docs.siliconflow.cn/quickstart)| -|**Gemini**|`gemini`|`GEMINI_API_KEY`, `GEMINI_MODEL`|`[Your Key]`, `gemini-1.5-flash`|參閱 [Gemini](https://ai.google.dev/gemini-api/docs/openai)| -|**Azure**|`azure`|`AZURE_ENDPOINT`, `AZURE_API_KEY`|`https://api.translator.azure.cn`, `[Your Key]`|參閱 
[Azure](https://docs.azure.cn/en-us/ai-services/translator/text-translation-overview)| -|**Tencent**|`tencent`|`TENCENTCLOUD_SECRET_ID`, `TENCENTCLOUD_SECRET_KEY`|`[Your ID]`, `[Your Key]`|參閱 [Tencent](https://www.tencentcloud.com/products/tmt?from_qcintl=122110104)| -|**Dify**|`dify`|`DIFY_API_URL`, `DIFY_API_KEY`|`[Your DIFY URL]`, `[Your Key]`|參閱 [Dify](https://github.com/langgenius/dify),需要在 Dify 的工作流程輸入中定義三個變數:lang_out、lang_in、text。| -|**AnythingLLM**|`anythingllm`|`AnythingLLM_URL`, `AnythingLLM_APIKEY`|`[Your AnythingLLM URL]`, `[Your Key]`|參閱 [anything-llm](https://github.com/Mintplex-Labs/anything-llm)| -|**Argos Translate**|`argos`| | |參閱 [argos-translate](https://github.com/argosopentech/argos-translate)| -|**Grok**|`grok`| `GORK_API_KEY`, `GORK_MODEL` | `[Your GORK_API_KEY]`, `grok-2-1212` |參閱 [Grok](https://docs.x.ai/docs/overview)| -|**DeepSeek**|`deepseek`| `DEEPSEEK_API_KEY`, `DEEPSEEK_MODEL` | `[Your DEEPSEEK_API_KEY]`, `deepseek-chat` |參閱 [DeepSeek](https://www.deepseek.com/)| -|**OpenAI-Liked**|`openai-liked`| `OPENAILIKE_BASE_URL`, `OPENAILIKE_API_KEY`, `OPENAILIKE_MODEL` | `url`, `[Your Key]`, `model name` | 無 | - -對於不在上述表格中,但兼容 OpenAI API 的大語言模型,可以使用與 OpenAI 相同的方式設定環境變數。 - -使用 `-s service` 或 `-s service:model` 指定翻譯服務: - -```bash -pdf2zh example.pdf -s openai:gpt-4o-mini -``` - -或使用環境變數指定模型: - -```bash -set OPENAI_MODEL=gpt-4o-mini -pdf2zh example.pdf -s openai -``` - -

指定例外規則

- -使用正則表達式指定需要保留的公式字體與字元: - -```bash -pdf2zh example.pdf -f "(CM[^RT].*|MS.*|.*Ital)" -c "(\(|\||\)|\+|=|\d|[\u0080-\ufaff])" -``` - -預設保留 `Latex`, `Mono`, `Code`, `Italic`, `Symbol` 以及 `Math` 字體: - -```bash -pdf2zh example.pdf -f "(CM[^R]|MS.M|XY|MT|BL|RM|EU|LA|RS|LINE|LCIRCLE|TeX-|rsfs|txsy|wasy|stmary|.*Mono|.*Code|.*Ital|.*Sym|.*Math)" -``` - -

指定執行緒數量

- -使用 `-t` 參數指定翻譯使用的執行緒數量: - -```bash -pdf2zh example.pdf -t 1 -``` - -

自訂大模型 Prompt

- -使用 `--prompt` 指定在使用大模型翻譯時所採用的 Prompt 檔案。 - -```bash -pdf2zh example.pdf -pr prompt.txt -``` - -範例 `prompt.txt` 檔案內容: - -``` -[ - { - "role": "system", - "content": "You are a professional,authentic machine translation engine.", - }, - { - "role": "user", - "content": "Translate the following markdown source text to ${lang_out}. Keep the formula notation {{v*}} unchanged. Output translation directly without any additional text.\nSource Text: ${text}\nTranslated Text:", - }, -] -``` - -在自訂 Prompt 檔案中,可以使用以下三個內建變數來傳遞參數: -|**變數名稱**|**說明**| -|-|-| -|`lang_in`|輸入語言| -|`lang_out`|輸出語言| -|`text`|需要翻譯的文本| - -

API

- -### Python - -```python -from pdf2zh import translate, translate_stream - -params = {"lang_in": "en", "lang_out": "zh", "service": "google", "thread": 4} -file_mono, file_dual = translate(files=["example.pdf"], **params)[0] -with open("example.pdf", "rb") as f: - stream_mono, stream_dual = translate_stream(stream=f.read(), **params) -``` - -### HTTP - -```bash -pip install pdf2zh[backend] -pdf2zh --flask -pdf2zh --celery worker -``` - -```bash -curl http://localhost:11008/v1/translate -F "file=@example.pdf" -F "data={\"lang_in\":\"en\",\"lang_out\":\"zh\",\"service\":\"google\",\"thread\":4}" -{"id":"d9894125-2f4e-45ea-9d93-1a9068d2045a"} - -curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a -{"info":{"n":13,"total":506},"state":"PROGRESS"} - -curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a -{"state":"SUCCESS"} - -curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a/mono --output example-mono.pdf - -curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a/dual --output example-dual.pdf - -curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a -X DELETE -``` - -

致謝

- -- 文件合併:[PyMuPDF](https://github.com/pymupdf/PyMuPDF) -- 文件解析:[Pdfminer.six](https://github.com/pdfminer/pdfminer.six) -- 文件提取:[MinerU](https://github.com/opendatalab/MinerU) -- 文件預覽:[Gradio PDF](https://github.com/freddyaboulton/gradio-pdf) -- 多執行緒翻譯:[MathTranslate](https://github.com/SUSYUSTC/MathTranslate) -- 版面解析:[DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO) -- PDF 標準:[PDF Explained](https://zxyle.github.io/PDF-Explained/)、[PDF Cheat Sheets](https://pdfa.org/resource/pdf-cheat-sheets/) -- 多語言字型:[Go Noto Universal](https://github.com/satbyy/go-noto-universal) - -

貢獻者

- - - - - -![Alt](https://repobeats.axiom.co/api/embed/dfa7583da5332a11468d686fbd29b92320a6a869.svg "Repobeats analytics image") - -

星標歷史

- - - - - - Star History Chart - + + +科學 PDF 文件翻譯及雙語對照工具 + +- 📊 保留公式、圖表、目錄和註釋 *([預覽效果](#preview))* +- 🌐 支援 [多種語言](#language) 和 [諸多翻譯服務](#services) +- 🤖 提供 [命令列工具](#usage)、[圖形使用者介面](#gui),以及 [容器化部署](#docker) + +歡迎在 [GitHub Issues](https://github.com/Byaidu/PDFMathTranslate/issues)、[Telegram 使用者群](https://t.me/+Z9_SgnxmsmA5NzBl) 或 [QQ 使用者群](https://qm.qq.com/q/DixZCxQej0) 中提出回饋 + +如需瞭解如何貢獻的詳細資訊,請查閱 [貢獻指南](https://github.com/Byaidu/PDFMathTranslate/wiki/Contribution-Guide---%E8%B4%A1%E7%8C%AE%E6%8C%87%E5%8D%97) + +

近期更新

+ +- [Dec. 24 2024] 翻譯功能支援接入由 [Xinference](https://github.com/xorbitsai/inference) 執行的本機 LLM _(by [@imClumsyPanda](https://github.com/imClumsyPanda))_ +- [Nov. 26 2024] CLI 現在已支援(多個)線上 PDF 檔 *(by [@reycn](https://github.com/reycn))* +- [Nov. 24 2024] 為了降低依賴大小,提供 [ONNX](https://github.com/onnx/onnx) 支援 *(by [@Wybxc](https://github.com/Wybxc))* +- [Nov. 23 2024] 🌟 [免費公共服務](#demo) 上線! *(by [@Byaidu](https://github.com/Byaidu))* +- [Nov. 23 2024] 新增防止網頁爬蟲的防火牆 *(by [@Byaidu](https://github.com/Byaidu))* +- [Nov. 22 2024] 圖形使用者介面現已支援義大利語並進行了一些更新 *(by [@Byaidu](https://github.com/Byaidu), [@reycn](https://github.com/reycn))* +- [Nov. 22 2024] 現在你可以將自己部署的服務分享給朋友 *(by [@Zxis233](https://github.com/Zxis233))* +- [Nov. 22 2024] 支援騰訊翻譯 *(by [@hellofinch](https://github.com/hellofinch))* +- [Nov. 21 2024] 圖形使用者介面現在支援下載雙語文件 *(by [@reycn](https://github.com/reycn))* +- [Nov. 20 2024] 🌟 提供了 [線上示範](#demo)! *(by [@reycn](https://github.com/reycn))* + +

效果預覽

+ +
+ +
+ +

線上示範 🌟

+ +### 免費服務 (<https://pdf2zh.com/>) + +你可以立即嘗試 [免費公共服務](https://pdf2zh.com/) 而無需安裝 + +### 線上示範 + +你可以直接在 [HuggingFace 上的線上示範](https://huggingface.co/spaces/reycn/PDFMathTranslate-Docker)和[魔搭的線上示範](https://www.modelscope.cn/studios/AI-ModelScope/PDFMathTranslate)進行嘗試,無需安裝。 +請注意,示範使用的運算資源有限,請勿濫用。 + +

安裝與使用

+ +我們提供了四種使用此專案的方法:[命令列工具](#cmd)、[便攜式安裝](#portable)、[圖形使用者介面](#gui) 與 [容器化部署](#docker)。 + +pdf2zh 在執行時需要額外下載模型(`wybxc/DocLayout-YOLO-DocStructBench-onnx`),該模型也可在魔搭(ModelScope)上取得。如果在啟動時下載該模型時遇到問題,請使用如下環境變數: +```shell +set HF_ENDPOINT=https://hf-mirror.com +``` + +

方法一、命令列工具

+ +1. 確保已安裝 Python 版本大於 3.8 且小於 3.12 +2. 安裝此程式: + + ```bash + pip install pdf2zh + ``` + +3. 執行翻譯,生成檔案位於 [目前工作目錄](https://chatgpt.com/share/6745ed36-9acc-800e-8a90-59204bd13444): + + ```bash + pdf2zh document.pdf + ``` + +

方法二、便攜式安裝

+ +無需預先安裝 Python 環境 + +下載 [setup.bat](https://raw.githubusercontent.com/Byaidu/PDFMathTranslate/refs/heads/main/script/setup.bat) 並直接雙擊執行 + +

方法三、圖形使用者介面

+ +1. 確保已安裝 Python 版本大於 3.8 且小於 3.12 +2. 安裝此程式: + + ```bash + pip install pdf2zh + ``` + +3. 在瀏覽器中啟動使用: + + ```bash + pdf2zh -i + ``` + +4. 如果您的瀏覽器沒有自動開啟並跳轉,請手動在瀏覽器開啟: + + ```bash + http://localhost:7860/ + ``` + + + +查看 [documentation for GUI](/README_GUI.md) 以獲取詳細說明 + +

方法四、容器化部署

+ +1. 拉取 Docker 映像檔並執行: + + ```bash + docker pull byaidu/pdf2zh + docker run -d -p 7860:7860 byaidu/pdf2zh + ``` + +2. 透過瀏覽器開啟: + + ``` + http://localhost:7860/ + ``` + +用於在雲服務上部署容器映像檔: + + + +

高級選項

+ +在命令列中執行翻譯指令,並在目前工作目錄下生成譯文檔案 `example-mono.pdf` 和雙語對照檔案 `example-dual.pdf`。預設使用 Google 翻譯服務。 + +cmd + +以下表格列出了所有高級選項,供參考: + +| Option | 功能 | 範例 | +| -------- | ------- |------- | +| files | 本機檔案 | `pdf2zh ~/local.pdf` | +| links | 線上檔案 | `pdf2zh http://arxiv.org/paper.pdf` | +| `-i` | [進入圖形介面](#gui) | `pdf2zh -i` | +| `-p` | [僅翻譯部分文件](#partial) | `pdf2zh example.pdf -p 1` | +| `-li` | [原文語言](#language) | `pdf2zh example.pdf -li en` | +| `-lo` | [目標語言](#language) | `pdf2zh example.pdf -lo zh` | +| `-s` | [指定翻譯服務](#services) | `pdf2zh example.pdf -s deepl` | +| `-t` | [多執行緒](#threads) | `pdf2zh example.pdf -t 1` | +| `-o` | 輸出目錄 | `pdf2zh example.pdf -o output` | +| `-f`, `-c` | [例外規則](#exceptions) | `pdf2zh example.pdf -f "(MS.*)"` | +| `--share` | [獲取 gradio 公開連結] | `pdf2zh -i --share` | +| `--authorized` | [[添加網頁認證及自訂認證頁面](https://github.com/Byaidu/PDFMathTranslate/blob/main/docs/ADVANCED.md)] | `pdf2zh -i --authorized users.txt [auth.html]` | +| `--prompt` | [使用自訂的大模型 Prompt] | `pdf2zh --prompt [prompt.txt]` | +| `--onnx` | [使用自訂的 DocLayout-YOLO ONNX 模型] | `pdf2zh --onnx [onnx/model/path]` | +| `--serverport` | [自訂 WebUI 埠號] | `pdf2zh --serverport 7860` | +| `--dir` | [資料夾翻譯] | `pdf2zh --dir /path/to/translate/` | + +

全文或部分文件翻譯

+ +- **全文翻譯** + +```bash +pdf2zh example.pdf +``` + +- **部分翻譯** + +```bash +pdf2zh example.pdf -p 1-3,5 +``` + +

指定原文語言與目標語言

+ +可參考 [Google 語言代碼](https://developers.google.com/admin-sdk/directory/v1/languages)、[DeepL 語言代碼](https://developers.deepl.com/docs/resources/supported-languages) + +```bash +pdf2zh example.pdf -li en -lo ja +``` + +

使用不同的翻譯服務

+ +下表列出了每個翻譯服務所需的 [環境變數](https://chatgpt.com/share/6734a83d-9d48-800e-8a46-f57ca6e8bcb4)。在使用前,請先確保已設定好對應的變數。 + +|**Translator**|**Service**|**Environment Variables**|**Default Values**|**Notes**| +|-|-|-|-|-| +|**Google (Default)**|`google`|無|N/A|無| +|**Bing**|`bing`|無|N/A|無| +|**DeepL**|`deepl`|`DEEPL_AUTH_KEY`|`[Your Key]`|參閱 [DeepL](https://support.deepl.com/hc/en-us/articles/360020695820-API-Key-for-DeepL-s-API)| +|**DeepLX**|`deeplx`|`DEEPLX_ENDPOINT`|`https://api.deepl.com/translate`|參閱 [DeepLX](https://github.com/OwO-Network/DeepLX)| +|**Ollama**|`ollama`|`OLLAMA_HOST`, `OLLAMA_MODEL`|`http://127.0.0.1:11434`, `gemma2`|參閱 [Ollama](https://github.com/ollama/ollama)| +|**OpenAI**|`openai`|`OPENAI_BASE_URL`, `OPENAI_API_KEY`, `OPENAI_MODEL`|`https://api.openai.com/v1`, `[Your Key]`, `gpt-4o-mini`|參閱 [OpenAI](https://platform.openai.com/docs/overview)| +|**AzureOpenAI**|`azure-openai`|`AZURE_OPENAI_BASE_URL`, `AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_MODEL`|`[Your Endpoint]`, `[Your Key]`, `gpt-4o-mini`|參閱 [Azure OpenAI](https://learn.microsoft.com/zh-cn/azure/ai-services/openai/chatgpt-quickstart?tabs=command-line%2Cjavascript-keyless%2Ctypescript-keyless%2Cpython&pivots=programming-language-python)| +|**Zhipu**|`zhipu`|`ZHIPU_API_KEY`, `ZHIPU_MODEL`|`[Your Key]`, `glm-4-flash`|參閱 [Zhipu](https://open.bigmodel.cn/dev/api/thirdparty-frame/openai-sdk)| +| **ModelScope** | `ModelScope` |`MODELSCOPE_API_KEY`, `MODELSCOPE_MODEL`|`[Your Key]`, `Qwen/Qwen2.5-Coder-32B-Instruct`| 參閱 [ModelScope](https://www.modelscope.cn/docs/model-service/API-Inference/intro)| +|**Silicon**|`silicon`|`SILICON_API_KEY`, `SILICON_MODEL`|`[Your Key]`, `Qwen/Qwen2.5-7B-Instruct`|參閱 [SiliconCloud](https://docs.siliconflow.cn/quickstart)| +|**Gemini**|`gemini`|`GEMINI_API_KEY`, `GEMINI_MODEL`|`[Your Key]`, `gemini-1.5-flash`|參閱 [Gemini](https://ai.google.dev/gemini-api/docs/openai)| +|**Azure**|`azure`|`AZURE_ENDPOINT`, `AZURE_API_KEY`|`https://api.translator.azure.cn`, `[Your Key]`|參閱 
[Azure](https://docs.azure.cn/en-us/ai-services/translator/text-translation-overview)| +|**Tencent**|`tencent`|`TENCENTCLOUD_SECRET_ID`, `TENCENTCLOUD_SECRET_KEY`|`[Your ID]`, `[Your Key]`|參閱 [Tencent](https://www.tencentcloud.com/products/tmt?from_qcintl=122110104)| +|**Dify**|`dify`|`DIFY_API_URL`, `DIFY_API_KEY`|`[Your DIFY URL]`, `[Your Key]`|參閱 [Dify](https://github.com/langgenius/dify),需要在 Dify 的工作流程輸入中定義三個變數:lang_out、lang_in、text。| +|**AnythingLLM**|`anythingllm`|`AnythingLLM_URL`, `AnythingLLM_APIKEY`|`[Your AnythingLLM URL]`, `[Your Key]`|參閱 [anything-llm](https://github.com/Mintplex-Labs/anything-llm)| +|**Argos Translate**|`argos`| | |參閱 [argos-translate](https://github.com/argosopentech/argos-translate)| +|**Grok**|`grok`| `GORK_API_KEY`, `GORK_MODEL` | `[Your GORK_API_KEY]`, `grok-2-1212` |參閱 [Grok](https://docs.x.ai/docs/overview)| +|**DeepSeek**|`deepseek`| `DEEPSEEK_API_KEY`, `DEEPSEEK_MODEL` | `[Your DEEPSEEK_API_KEY]`, `deepseek-chat` |參閱 [DeepSeek](https://www.deepseek.com/)| +|**OpenAI-Liked**|`openai-liked`| `OPENAILIKE_BASE_URL`, `OPENAILIKE_API_KEY`, `OPENAILIKE_MODEL` | `url`, `[Your Key]`, `model name` | 無 | + +對於不在上述表格中,但兼容 OpenAI API 的大語言模型,可以使用與 OpenAI 相同的方式設定環境變數。 + +使用 `-s service` 或 `-s service:model` 指定翻譯服務: + +```bash +pdf2zh example.pdf -s openai:gpt-4o-mini +``` + +或使用環境變數指定模型: + +```bash +set OPENAI_MODEL=gpt-4o-mini +pdf2zh example.pdf -s openai +``` + +

指定例外規則

+ +使用正則表達式指定需要保留的公式字體與字元: + +```bash +pdf2zh example.pdf -f "(CM[^RT].*|MS.*|.*Ital)" -c "(\(|\||\)|\+|=|\d|[\u0080-\ufaff])" +``` + +預設保留 `Latex`, `Mono`, `Code`, `Italic`, `Symbol` 以及 `Math` 字體: + +```bash +pdf2zh example.pdf -f "(CM[^R]|MS.M|XY|MT|BL|RM|EU|LA|RS|LINE|LCIRCLE|TeX-|rsfs|txsy|wasy|stmary|.*Mono|.*Code|.*Ital|.*Sym|.*Math)" +``` + +

指定執行緒數量

+ +使用 `-t` 參數指定翻譯使用的執行緒數量: + +```bash +pdf2zh example.pdf -t 1 +``` + +

自訂大模型 Prompt

+ +使用 `--prompt` 指定在使用大模型翻譯時所採用的 Prompt 檔案。 + +```bash +pdf2zh example.pdf --prompt prompt.txt +``` + +範例 `prompt.txt` 檔案內容: + +``` +[ + { + "role": "system", + "content": "You are a professional,authentic machine translation engine.", + }, + { + "role": "user", + "content": "Translate the following markdown source text to ${lang_out}. Keep the formula notation {{v*}} unchanged. Output translation directly without any additional text.\nSource Text: ${text}\nTranslated Text:", + }, +] +``` + +在自訂 Prompt 檔案中,可以使用以下三個內建變數來傳遞參數: +|**變數名稱**|**說明**| +|-|-| +|`lang_in`|輸入語言| +|`lang_out`|輸出語言| +|`text`|需要翻譯的文本| + +

API

+ +### Python + +```python +from pdf2zh import translate, translate_stream + +params = {"lang_in": "en", "lang_out": "zh", "service": "google", "thread": 4} +file_mono, file_dual = translate(files=["example.pdf"], **params)[0] +with open("example.pdf", "rb") as f: + stream_mono, stream_dual = translate_stream(stream=f.read(), **params) +``` + +### HTTP + +```bash +pip install pdf2zh[backend] +pdf2zh --flask +pdf2zh --celery worker +``` + +```bash +curl http://localhost:11008/v1/translate -F "file=@example.pdf" -F "data={\"lang_in\":\"en\",\"lang_out\":\"zh\",\"service\":\"google\",\"thread\":4}" +{"id":"d9894125-2f4e-45ea-9d93-1a9068d2045a"} + +curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a +{"info":{"n":13,"total":506},"state":"PROGRESS"} + +curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a +{"state":"SUCCESS"} + +curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a/mono --output example-mono.pdf + +curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a/dual --output example-dual.pdf + +curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a -X DELETE +``` + +

致謝

+ +- 文件合併:[PyMuPDF](https://github.com/pymupdf/PyMuPDF) +- 文件解析:[Pdfminer.six](https://github.com/pdfminer/pdfminer.six) +- 文件提取:[MinerU](https://github.com/opendatalab/MinerU) +- 文件預覽:[Gradio PDF](https://github.com/freddyaboulton/gradio-pdf) +- 多執行緒翻譯:[MathTranslate](https://github.com/SUSYUSTC/MathTranslate) +- 版面解析:[DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO) +- PDF 標準:[PDF Explained](https://zxyle.github.io/PDF-Explained/)、[PDF Cheat Sheets](https://pdfa.org/resource/pdf-cheat-sheets/) +- 多語言字型:[Go Noto Universal](https://github.com/satbyy/go-noto-universal) + +

貢獻者

+ + + + + +![Alt](https://repobeats.axiom.co/api/embed/dfa7583da5332a11468d686fbd29b92320a6a869.svg "Repobeats analytics image") + +

星標歷史

+ + + + + + Star History Chart + \ No newline at end of file diff --git a/pdf2zh/backend.py b/pdf2zh/backend.py index 77cab1d9..f86f6672 100644 --- a/pdf2zh/backend.py +++ b/pdf2zh/backend.py @@ -1,96 +1,96 @@ -from flask import Flask, request, send_file -from celery import Celery, Task -from celery.result import AsyncResult -from pdf2zh import translate_stream -import tqdm -import json -import io -from pdf2zh.doclayout import ModelInstance -from pdf2zh.config import ConfigManager - -flask_app = Flask("pdf2zh") -flask_app.config.from_mapping( - CELERY=dict( - broker_url=ConfigManager.get("CELERY_BROKER", "redis://127.0.0.1:6379/0"), - result_backend=ConfigManager.get("CELERY_RESULT", "redis://127.0.0.1:6379/0"), - ) -) - - -def celery_init_app(app: Flask) -> Celery: - class FlaskTask(Task): - def __call__(self, *args, **kwargs): - with app.app_context(): - return self.run(*args, **kwargs) - - celery_app = Celery(app.name) - celery_app.config_from_object(app.config["CELERY"]) - celery_app.Task = FlaskTask - celery_app.set_default() - celery_app.autodiscover_tasks() - app.extensions["celery"] = celery_app - return celery_app - - -celery_app = celery_init_app(flask_app) - - -@celery_app.task(bind=True) -def translate_task( - self: Task, - stream: bytes, - args: dict, -): - def progress_bar(t: tqdm.tqdm): - self.update_state(state="PROGRESS", meta={"n": t.n, "total": t.total}) # noqa - print(f"Translating {t.n} / {t.total} pages") - - doc_mono, doc_dual = translate_stream( - stream, - callback=progress_bar, - model=ModelInstance.value, - **args, - ) - return doc_mono, doc_dual - - -@flask_app.route("/v1/translate", methods=["POST"]) -def create_translate_tasks(): - file = request.files["file"] - stream = file.stream.read() - print(request.form.get("data")) - args = json.loads(request.form.get("data")) - task = translate_task.delay(stream, args) - return {"id": task.id} - - -@flask_app.route("/v1/translate/", methods=["GET"]) -def get_translate_task(id: str): - result: 
AsyncResult = celery_app.AsyncResult(id) - if str(result.state) == "PROGRESS": - return {"state": str(result.state), "info": result.info} - else: - return {"state": str(result.state)} - - -@flask_app.route("/v1/translate/", methods=["DELETE"]) -def delete_translate_task(id: str): - result: AsyncResult = celery_app.AsyncResult(id) - result.revoke(terminate=True) - return {"state": str(result.state)} - - -@flask_app.route("/v1/translate//") -def get_translate_result(id: str, format: str): - result = celery_app.AsyncResult(id) - if not result.ready(): - return {"error": "task not finished"}, 400 - if not result.successful(): - return {"error": "task failed"}, 400 - doc_mono, doc_dual = result.get() - to_send = doc_mono if format == "mono" else doc_dual - return send_file(io.BytesIO(to_send), "application/pdf") - - -if __name__ == "__main__": - flask_app.run() +from flask import Flask, request, send_file +from celery import Celery, Task +from celery.result import AsyncResult +from pdf2zh import translate_stream +import tqdm +import json +import io +from pdf2zh.doclayout import ModelInstance +from pdf2zh.config import ConfigManager + +flask_app = Flask("pdf2zh") +flask_app.config.from_mapping( + CELERY=dict( + broker_url=ConfigManager.get("CELERY_BROKER", "redis://127.0.0.1:6379/0"), + result_backend=ConfigManager.get("CELERY_RESULT", "redis://127.0.0.1:6379/0"), + ) +) + + +def celery_init_app(app: Flask) -> Celery: + class FlaskTask(Task): + def __call__(self, *args, **kwargs): + with app.app_context(): + return self.run(*args, **kwargs) + + celery_app = Celery(app.name) + celery_app.config_from_object(app.config["CELERY"]) + celery_app.Task = FlaskTask + celery_app.set_default() + celery_app.autodiscover_tasks() + app.extensions["celery"] = celery_app + return celery_app + + +celery_app = celery_init_app(flask_app) + + +@celery_app.task(bind=True) +def translate_task( + self: Task, + stream: bytes, + args: dict, +): + def progress_bar(t: tqdm.tqdm): + 
self.update_state(state="PROGRESS", meta={"n": t.n, "total": t.total}) # noqa + print(f"Translating {t.n} / {t.total} pages") + + doc_mono, doc_dual = translate_stream( + stream, + callback=progress_bar, + model=ModelInstance.value, + **args, + ) + return doc_mono, doc_dual + + +@flask_app.route("/v1/translate", methods=["POST"]) +def create_translate_tasks(): + file = request.files["file"] + stream = file.stream.read() + print(request.form.get("data")) + args = json.loads(request.form.get("data")) + task = translate_task.delay(stream, args) + return {"id": task.id} + + +@flask_app.route("/v1/translate/", methods=["GET"]) +def get_translate_task(id: str): + result: AsyncResult = celery_app.AsyncResult(id) + if str(result.state) == "PROGRESS": + return {"state": str(result.state), "info": result.info} + else: + return {"state": str(result.state)} + + +@flask_app.route("/v1/translate/", methods=["DELETE"]) +def delete_translate_task(id: str): + result: AsyncResult = celery_app.AsyncResult(id) + result.revoke(terminate=True) + return {"state": str(result.state)} + + +@flask_app.route("/v1/translate//") +def get_translate_result(id: str, format: str): + result = celery_app.AsyncResult(id) + if not result.ready(): + return {"error": "task not finished"}, 400 + if not result.successful(): + return {"error": "task failed"}, 400 + doc_mono, doc_dual = result.get() + to_send = doc_mono if format == "mono" else doc_dual + return send_file(io.BytesIO(to_send), "application/pdf") + + +if __name__ == "__main__": + flask_app.run() diff --git a/pdf2zh/config.py b/pdf2zh/config.py index 0cee33aa..050a39e5 100644 --- a/pdf2zh/config.py +++ b/pdf2zh/config.py @@ -1,214 +1,214 @@ -import json -from pathlib import Path -from threading import RLock # 改成 RLock -import os -import copy - - -class ConfigManager: - _instance = None - _lock = RLock() # 用 RLock 替换 Lock,允许在同一个线程中重复获取锁 - - @classmethod - def get_instance(cls): - """获取单例实例""" - # 先判断是否存在实例,如果不存在再加锁进行初始化 - if cls._instance is 
None: - with cls._lock: - if cls._instance is None: - cls._instance = cls() - return cls._instance - - def __init__(self): - # 防止重复初始化 - if hasattr(self, "_initialized") and self._initialized: - return - self._initialized = True - - self._config_path = Path.home() / ".config" / "PDFMathTranslate" / "config.json" - self._config_data = {} - - # 这里不要再加锁,因为外层可能已经加了锁 (get_instance), RLock也无妨 - self._ensure_config_exists() - - def _ensure_config_exists(self, isInit=True): - """确保配置文件存在,如果不存在则创建默认配置""" - # 这里也不需要显式再次加锁,原因同上,方法体中再调用 _load_config(), - # 而 _load_config() 内部会加锁。因为 RLock 是可重入的,不会阻塞。 - if not self._config_path.exists(): - if isInit: - self._config_path.parent.mkdir(parents=True, exist_ok=True) - self._config_data = {} # 默认配置内容 - self._save_config() - else: - raise ValueError(f"config file {self._config_path} not found!") - else: - self._load_config() - - def _load_config(self): - """从 config.json 中加载配置""" - with self._lock: # 加锁确保线程安全 - with self._config_path.open("r", encoding="utf-8") as f: - self._config_data = json.load(f) - - def _save_config(self): - """保存配置到 config.json""" - with self._lock: # 加锁确保线程安全 - # 移除循环引用并写入 - cleaned_data = self._remove_circular_references(self._config_data) - with self._config_path.open("w", encoding="utf-8") as f: - json.dump(cleaned_data, f, indent=4, ensure_ascii=False) - - def _remove_circular_references(self, obj, seen=None): - """递归移除循环引用""" - if seen is None: - seen = set() - obj_id = id(obj) - if obj_id in seen: - return None # 遇到已处理过的对象,视为循环引用 - seen.add(obj_id) - - if isinstance(obj, dict): - return { - k: self._remove_circular_references(v, seen) for k, v in obj.items() - } - elif isinstance(obj, list): - return [self._remove_circular_references(i, seen) for i in obj] - return obj - - @classmethod - def custome_config(cls, file_path): - """使用自定义路径加载配置文件""" - custom_path = Path(file_path) - if not custom_path.exists(): - raise ValueError(f"Config file {custom_path} not found!") - # 加锁 - with cls._lock: - instance = 
cls() - instance._config_path = custom_path - # 此处传 isInit=False,若不存在则报错;若存在则正常 _load_config() - instance._ensure_config_exists(isInit=False) - cls._instance = instance - - @classmethod - def get(cls, key, default=None): - """获取配置值""" - instance = cls.get_instance() - # 读取时,加锁或不加锁都行。但为了统一,我们在修改配置前后都要加锁。 - # get 只要最终需要保存,则会加锁 -> _save_config() - if key in instance._config_data: - return instance._config_data[key] - - # 若环境变量中存在该 key,则使用环境变量并写回 config - if key in os.environ: - value = os.environ[key] - instance._config_data[key] = value - instance._save_config() - return value - - # 若 default 不为 None,则设置并保存 - if default is not None: - instance._config_data[key] = default - instance._save_config() - return default - - # 找不到则抛出异常 - # raise KeyError(f"{key} is not found in config file or environment variables.") - return default - - @classmethod - def set(cls, key, value): - """设置配置值并保存""" - instance = cls.get_instance() - with instance._lock: - instance._config_data[key] = value - instance._save_config() - - @classmethod - def get_translator_by_name(cls, name): - """根据 name 获取对应的 translator 配置""" - instance = cls.get_instance() - translators = instance._config_data.get("translators", []) - for translator in translators: - if translator.get("name") == name: - return translator["envs"] - return None - - @classmethod - def set_translator_by_name(cls, name, new_translator_envs): - """根据 name 设置或更新 translator 配置""" - instance = cls.get_instance() - with instance._lock: - translators = instance._config_data.get("translators", []) - for translator in translators: - if translator.get("name") == name: - translator["envs"] = copy.deepcopy(new_translator_envs) - instance._save_config() - return - translators.append( - {"name": name, "envs": copy.deepcopy(new_translator_envs)} - ) - instance._config_data["translators"] = translators - instance._save_config() - - @classmethod - def get_env_by_translatername(cls, translater_name, name, default=None): - """根据 name 获取对应的 translator 
配置""" - instance = cls.get_instance() - translators = instance._config_data.get("translators", []) - for translator in translators: - if translator.get("name") == translater_name.name: - if translator["envs"][name]: - return translator["envs"][name] - else: - with instance._lock: - translator["envs"][name] = default - instance._save_config() - return default - - with instance._lock: - translators = instance._config_data.get("translators", []) - for translator in translators: - if translator.get("name") == translater_name.name: - translator["envs"][name] = default - instance._save_config() - return default - translators.append( - { - "name": translater_name.name, - "envs": copy.deepcopy(translater_name.envs), - } - ) - instance._config_data["translators"] = translators - instance._save_config() - return default - - @classmethod - def delete(cls, key): - """删除配置值并保存""" - instance = cls.get_instance() - with instance._lock: - if key in instance._config_data: - del instance._config_data[key] - instance._save_config() - - @classmethod - def clear(cls): - """删除配置值并保存""" - instance = cls.get_instance() - with instance._lock: - instance._config_data = {} - instance._save_config() - - @classmethod - def all(cls): - """返回所有配置项""" - instance = cls.get_instance() - # 这里只做读取操作,一般可不加锁。不过为了保险也可以加锁。 - return instance._config_data - - @classmethod - def remove(cls): - instance = cls.get_instance() - with instance._lock: - os.remove(instance._config_path) +import json +from pathlib import Path +from threading import RLock # 改成 RLock +import os +import copy + + +class ConfigManager: + _instance = None + _lock = RLock() # 用 RLock 替换 Lock,允许在同一个线程中重复获取锁 + + @classmethod + def get_instance(cls): + """获取单例实例""" + # 先判断是否存在实例,如果不存在再加锁进行初始化 + if cls._instance is None: + with cls._lock: + if cls._instance is None: + cls._instance = cls() + return cls._instance + + def __init__(self): + # 防止重复初始化 + if hasattr(self, "_initialized") and self._initialized: + return + self._initialized = True + 
+ self._config_path = Path.home() / ".config" / "PDFMathTranslate" / "config.json" + self._config_data = {} + + # 这里不要再加锁,因为外层可能已经加了锁 (get_instance), RLock也无妨 + self._ensure_config_exists() + + def _ensure_config_exists(self, isInit=True): + """确保配置文件存在,如果不存在则创建默认配置""" + # 这里也不需要显式再次加锁,原因同上,方法体中再调用 _load_config(), + # 而 _load_config() 内部会加锁。因为 RLock 是可重入的,不会阻塞。 + if not self._config_path.exists(): + if isInit: + self._config_path.parent.mkdir(parents=True, exist_ok=True) + self._config_data = {} # 默认配置内容 + self._save_config() + else: + raise ValueError(f"config file {self._config_path} not found!") + else: + self._load_config() + + def _load_config(self): + """从 config.json 中加载配置""" + with self._lock: # 加锁确保线程安全 + with self._config_path.open("r", encoding="utf-8") as f: + self._config_data = json.load(f) + + def _save_config(self): + """保存配置到 config.json""" + with self._lock: # 加锁确保线程安全 + # 移除循环引用并写入 + cleaned_data = self._remove_circular_references(self._config_data) + with self._config_path.open("w", encoding="utf-8") as f: + json.dump(cleaned_data, f, indent=4, ensure_ascii=False) + + def _remove_circular_references(self, obj, seen=None): + """递归移除循环引用""" + if seen is None: + seen = set() + obj_id = id(obj) + if obj_id in seen: + return None # 遇到已处理过的对象,视为循环引用 + seen.add(obj_id) + + if isinstance(obj, dict): + return { + k: self._remove_circular_references(v, seen) for k, v in obj.items() + } + elif isinstance(obj, list): + return [self._remove_circular_references(i, seen) for i in obj] + return obj + + @classmethod + def custome_config(cls, file_path): + """使用自定义路径加载配置文件""" + custom_path = Path(file_path) + if not custom_path.exists(): + raise ValueError(f"Config file {custom_path} not found!") + # 加锁 + with cls._lock: + instance = cls() + instance._config_path = custom_path + # 此处传 isInit=False,若不存在则报错;若存在则正常 _load_config() + instance._ensure_config_exists(isInit=False) + cls._instance = instance + + @classmethod + def get(cls, key, default=None): + 
"""获取配置值""" + instance = cls.get_instance() + # 读取时,加锁或不加锁都行。但为了统一,我们在修改配置前后都要加锁。 + # get 只要最终需要保存,则会加锁 -> _save_config() + if key in instance._config_data: + return instance._config_data[key] + + # 若环境变量中存在该 key,则使用环境变量并写回 config + if key in os.environ: + value = os.environ[key] + instance._config_data[key] = value + instance._save_config() + return value + + # 若 default 不为 None,则设置并保存 + if default is not None: + instance._config_data[key] = default + instance._save_config() + return default + + # 找不到则抛出异常 + # raise KeyError(f"{key} is not found in config file or environment variables.") + return default + + @classmethod + def set(cls, key, value): + """设置配置值并保存""" + instance = cls.get_instance() + with instance._lock: + instance._config_data[key] = value + instance._save_config() + + @classmethod + def get_translator_by_name(cls, name): + """根据 name 获取对应的 translator 配置""" + instance = cls.get_instance() + translators = instance._config_data.get("translators", []) + for translator in translators: + if translator.get("name") == name: + return translator["envs"] + return None + + @classmethod + def set_translator_by_name(cls, name, new_translator_envs): + """根据 name 设置或更新 translator 配置""" + instance = cls.get_instance() + with instance._lock: + translators = instance._config_data.get("translators", []) + for translator in translators: + if translator.get("name") == name: + translator["envs"] = copy.deepcopy(new_translator_envs) + instance._save_config() + return + translators.append( + {"name": name, "envs": copy.deepcopy(new_translator_envs)} + ) + instance._config_data["translators"] = translators + instance._save_config() + + @classmethod + def get_env_by_translatername(cls, translater_name, name, default=None): + """根据 name 获取对应的 translator 配置""" + instance = cls.get_instance() + translators = instance._config_data.get("translators", []) + for translator in translators: + if translator.get("name") == translater_name.name: + if translator["envs"][name]: + return 
translator["envs"][name] + else: + with instance._lock: + translator["envs"][name] = default + instance._save_config() + return default + + with instance._lock: + translators = instance._config_data.get("translators", []) + for translator in translators: + if translator.get("name") == translater_name.name: + translator["envs"][name] = default + instance._save_config() + return default + translators.append( + { + "name": translater_name.name, + "envs": copy.deepcopy(translater_name.envs), + } + ) + instance._config_data["translators"] = translators + instance._save_config() + return default + + @classmethod + def delete(cls, key): + """删除配置值并保存""" + instance = cls.get_instance() + with instance._lock: + if key in instance._config_data: + del instance._config_data[key] + instance._save_config() + + @classmethod + def clear(cls): + """删除配置值并保存""" + instance = cls.get_instance() + with instance._lock: + instance._config_data = {} + instance._save_config() + + @classmethod + def all(cls): + """返回所有配置项""" + instance = cls.get_instance() + # 这里只做读取操作,一般可不加锁。不过为了保险也可以加锁。 + return instance._config_data + + @classmethod + def remove(cls): + instance = cls.get_instance() + with instance._lock: + os.remove(instance._config_path) diff --git a/pdf2zh/converter.py b/pdf2zh/converter.py index b72e2df3..7e031e3d 100644 --- a/pdf2zh/converter.py +++ b/pdf2zh/converter.py @@ -1,535 +1,535 @@ -from typing import Dict, List -from enum import Enum - -from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager -from pdfminer.pdffont import PDFCIDFont -from pdfminer.converter import PDFConverter -from pdfminer.pdffont import PDFUnicodeNotDefined -from pdfminer.utils import apply_matrix_pt, mult_matrix -from pdfminer.layout import ( - LTChar, - LTFigure, - LTLine, - LTPage, -) -import logging -import re -import concurrent.futures -import numpy as np -import unicodedata -from tenacity import retry, wait_fixed -from pdf2zh.translator import ( - AzureOpenAITranslator, - BaseTranslator, 
- GoogleTranslator, - BingTranslator, - DeepLTranslator, - DeepLXTranslator, - OllamaTranslator, - OpenAITranslator, - ZhipuTranslator, - ModelScopeTranslator, - SiliconTranslator, - GeminiTranslator, - AzureTranslator, - TencentTranslator, - DifyTranslator, - AnythingLLMTranslator, - XinferenceTranslator, - ArgosTranslator, - GorkTranslator, - GroqTranslator, - DeepseekTranslator, - OpenAIlikedTranslator, -) -from pymupdf import Font - -log = logging.getLogger(__name__) - - -class PDFConverterEx(PDFConverter): - def __init__( - self, - rsrcmgr: PDFResourceManager, - ) -> None: - PDFConverter.__init__(self, rsrcmgr, None, "utf-8", 1, None) - - def begin_page(self, page, ctm) -> None: - # 重载替换 cropbox - (x0, y0, x1, y1) = page.cropbox - (x0, y0) = apply_matrix_pt(ctm, (x0, y0)) - (x1, y1) = apply_matrix_pt(ctm, (x1, y1)) - mediabox = (0, 0, abs(x0 - x1), abs(y0 - y1)) - self.cur_item = LTPage(page.pageno, mediabox) - - def end_page(self, page): - # 重载返回指令流 - return self.receive_layout(self.cur_item) - - def begin_figure(self, name, bbox, matrix) -> None: - # 重载设置 pageid - self._stack.append(self.cur_item) - self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm)) - self.cur_item.pageid = self._stack[-1].pageid - - def end_figure(self, _: str) -> None: - # 重载返回指令流 - fig = self.cur_item - assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item)) - self.cur_item = self._stack.pop() - self.cur_item.add(fig) - return self.receive_layout(fig) - - def render_char( - self, - matrix, - font, - fontsize: float, - scaling: float, - rise: float, - cid: int, - ncs, - graphicstate: PDFGraphicState, - ) -> float: - # 重载设置 cid 和 font - try: - text = font.to_unichr(cid) - assert isinstance(text, str), str(type(text)) - except PDFUnicodeNotDefined: - text = self.handle_undefined_char(font, cid) - textwidth = font.char_width(cid) - textdisp = font.char_disp(cid) - item = LTChar( - matrix, - font, - fontsize, - scaling, - rise, - text, - textwidth, - textdisp, - 
ncs, - graphicstate, - ) - self.cur_item.add(item) - item.cid = cid # hack 插入原字符编码 - item.font = font # hack 插入原字符字体 - return item.adv - - -class Paragraph: - def __init__(self, y, x, x0, x1, y0, y1, size, brk): - self.y: float = y # 初始纵坐标 - self.x: float = x # 初始横坐标 - self.x0: float = x0 # 左边界 - self.x1: float = x1 # 右边界 - self.y0: float = y0 # 上边界 - self.y1: float = y1 # 下边界 - self.size: float = size # 字体大小 - self.brk: bool = brk # 换行标记 - - -# fmt: off -class TranslateConverter(PDFConverterEx): - def __init__( - self, - rsrcmgr, - vfont: str = None, - vchar: str = None, - thread: int = 0, - layout={}, - lang_in: str = "", - lang_out: str = "", - service: str = "", - noto_name: str = "", - noto: Font = None, - envs: Dict = None, - prompt: List = None, - ) -> None: - super().__init__(rsrcmgr) - self.vfont = vfont - self.vchar = vchar - self.thread = thread - self.layout = layout - self.noto_name = noto_name - self.noto = noto - self.translator: BaseTranslator = None - param = service.split(":", 1) - service_name = param[0] - service_model = param[1] if len(param) > 1 else None - if not envs: - envs = {} - if not prompt: - prompt = [] - for translator in [GoogleTranslator, BingTranslator, DeepLTranslator, DeepLXTranslator, OllamaTranslator, XinferenceTranslator, AzureOpenAITranslator, - OpenAITranslator, ZhipuTranslator, ModelScopeTranslator, SiliconTranslator, GeminiTranslator, AzureTranslator, TencentTranslator, DifyTranslator, AnythingLLMTranslator, ArgosTranslator, GorkTranslator, GroqTranslator, DeepseekTranslator, OpenAIlikedTranslator,]: - if service_name == translator.name: - self.translator = translator(lang_in, lang_out, service_model, envs=envs, prompt=prompt) - if not self.translator: - raise ValueError("Unsupported translation service") - - def receive_layout(self, ltpage: LTPage): - # 段落 - sstk: list[str] = [] # 段落文字栈 - pstk: list[Paragraph] = [] # 段落属性栈 - vbkt: int = 0 # 段落公式括号计数 - # 公式组 - vstk: list[LTChar] = [] # 公式符号组 - vlstk: list[LTLine] = [] # 
公式线条组 - vfix: float = 0 # 公式纵向偏移 - # 公式组栈 - var: list[list[LTChar]] = [] # 公式符号组栈 - varl: list[list[LTLine]] = [] # 公式线条组栈 - varf: list[float] = [] # 公式纵向偏移栈 - vlen: list[float] = [] # 公式宽度栈 - # 全局 - lstk: list[LTLine] = [] # 全局线条栈 - xt: LTChar = None # 上一个字符 - xt_cls: int = -1 # 上一个字符所属段落,保证无论第一个字符属于哪个类别都可以触发新段落 - vmax: float = ltpage.width / 4 # 行内公式最大宽度 - ops: str = "" # 渲染结果 - - def vflag(font: str, char: str): # 匹配公式(和角标)字体 - if isinstance(font, bytes): # 不一定能 decode,直接转 str - try: - font = font.decode('utf-8') # 尝试使用 UTF-8 解码 - except UnicodeDecodeError: - font = "" - font = font.split("+")[-1] # 字体名截断 - if re.match(r"\(cid:", char): - return True - # 基于字体名规则的判定 - if self.vfont: - if re.match(self.vfont, font): - return True - else: - if re.match( # latex 字体 - r"(CM[^R]|MS.M|XY|MT|BL|RM|EU|LA|RS|LINE|LCIRCLE|TeX-|rsfs|txsy|wasy|stmary|.*Mono|.*Code|.*Ital|.*Sym|.*Math)", - font, - ): - return True - # 基于字符集规则的判定 - if self.vchar: - if re.match(self.vchar, char): - return True - else: - if ( - char - and char != " " # 非空格 - and ( - unicodedata.category(char[0]) - in ["Lm", "Mn", "Sk", "Sm", "Zl", "Zp", "Zs"] # 文字修饰符、数学符号、分隔符号 - or ord(char[0]) in range(0x370, 0x400) # 希腊字母 - ) - ): - return True - return False - - ############################################################ - # A. 原文档解析 - for child in ltpage: - if isinstance(child, LTChar): - cur_v = False - layout = self.layout[ltpage.pageid] - # ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape - h, w = layout.shape - # 读取当前字符在 layout 中的类别 - cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1) - cls = layout[cy, cx] - # 锚定文档中 bullet 的位置 - if child.get_text() == "•": - cls = 0 - # 判定当前字符是否属于公式 - if ( # 判定当前字符是否属于公式 - cls == 0 # 1. 类别为保留区域 - or (cls == xt_cls and len(sstk[-1].strip()) > 1 and child.size < pstk[-1].size * 0.79) # 2. 角标字体,有 0.76 的角标和 0.799 的大写,这里用 0.79 取中,同时考虑首字母放大的情况 - or vflag(child.fontname, child.get_text()) # 3. 
公式字体 - or (child.matrix[0] == 0 and child.matrix[3] == 0) # 4. 垂直字体 - ): - cur_v = True - # 判定括号组是否属于公式 - if not cur_v: - if vstk and child.get_text() == "(": - cur_v = True - vbkt += 1 - if vbkt and child.get_text() == ")": - cur_v = True - vbkt -= 1 - if ( # 判定当前公式是否结束 - not cur_v # 1. 当前字符不属于公式 - or cls != xt_cls # 2. 当前字符与前一个字符不属于同一段落 - # or (abs(child.x0 - xt.x0) > vmax and cls != 0) # 3. 段落内换行,可能是一长串斜体的段落,也可能是段内分式换行,这里设个阈值进行区分 - # 禁止纯公式(代码)段落换行,直到文字开始再重开文字段落,保证只存在两种情况 - # A. 纯公式(代码)段落(锚定绝对位置)sstk[-1]=="" -> sstk[-1]=="{v*}" - # B. 文字开头段落(排版相对位置)sstk[-1]!="" - or (sstk[-1] != "" and abs(child.x0 - xt.x0) > vmax) # 因为 cls==xt_cls==0 一定有 sstk[-1]=="",所以这里不需要再判定 cls!=0 - ): - if vstk: - if ( # 根据公式右侧的文字修正公式的纵向偏移 - not cur_v # 1. 当前字符不属于公式 - and cls == xt_cls # 2. 当前字符与前一个字符属于同一段落 - and child.x0 > max([vch.x0 for vch in vstk]) # 3. 当前字符在公式右侧 - ): - vfix = vstk[0].y0 - child.y0 - if sstk[-1] == "": - xt_cls = -1 # 禁止纯公式段落(sstk[-1]=="{v*}")的后续连接,但是要考虑新字符和后续字符的连接,所以这里修改的是上个字符的类别 - sstk[-1] += f"{{v{len(var)}}}" - var.append(vstk) - varl.append(vlstk) - varf.append(vfix) - vstk = [] - vlstk = [] - vfix = 0 - # 当前字符不属于公式或当前字符是公式的第一个字符 - if not vstk: - if cls == xt_cls: # 当前字符与前一个字符属于同一段落 - if child.x0 > xt.x1 + 1: # 添加行内空格 - sstk[-1] += " " - elif child.x1 < xt.x0: # 添加换行空格并标记原文段落存在换行 - sstk[-1] += " " - pstk[-1].brk = True - else: # 根据当前字符构建一个新的段落 - sstk.append("") - pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.y0, child.y1, child.size, False)) - if not cur_v: # 文字入栈 - if ( # 根据当前字符修正段落属性 - child.size > pstk[-1].size # 1. 当前字符比段落字体大 - or len(sstk[-1].strip()) == 1 # 2. 当前字符为段落第二个文字(考虑首字母放大的情况) - ) and child.get_text() != " ": # 3. 当前字符不是空格 - pstk[-1].y -= child.size - pstk[-1].size # 修正段落初始纵坐标,假设两个不同大小字符的上边界对齐 - pstk[-1].size = child.size - sstk[-1] += child.get_text() - else: # 公式入栈 - if ( # 根据公式左侧的文字修正公式的纵向偏移 - not vstk # 1. 当前字符是公式的第一个字符 - and cls == xt_cls # 2. 当前字符与前一个字符属于同一段落 - and child.x0 > xt.x0 # 3. 
前一个字符在公式左侧 - ): - vfix = child.y0 - xt.y0 - vstk.append(child) - # 更新段落边界,因为段落内换行之后可能是公式开头,所以要在外边处理 - pstk[-1].x0 = min(pstk[-1].x0, child.x0) - pstk[-1].x1 = max(pstk[-1].x1, child.x1) - pstk[-1].y0 = min(pstk[-1].y0, child.y0) - pstk[-1].y1 = max(pstk[-1].y1, child.y1) - # 更新上一个字符 - xt = child - xt_cls = cls - elif isinstance(child, LTFigure): # 图表 - pass - elif isinstance(child, LTLine): # 线条 - layout = self.layout[ltpage.pageid] - # ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape - h, w = layout.shape - # 读取当前线条在 layout 中的类别 - cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1) - cls = layout[cy, cx] - if vstk and cls == xt_cls: # 公式线条 - vlstk.append(child) - else: # 全局线条 - lstk.append(child) - else: - pass - # 处理结尾 - if vstk: # 公式出栈 - sstk[-1] += f"{{v{len(var)}}}" - var.append(vstk) - varl.append(vlstk) - varf.append(vfix) - log.debug("\n==========[VSTACK]==========\n") - for id, v in enumerate(var): # 计算公式宽度 - l = max([vch.x1 for vch in v]) - v[0].x0 - log.debug(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > v{id} = {"".join([ch.get_text() for ch in v])}') - vlen.append(l) - - ############################################################ - # B. 段落翻译 - log.debug("\n==========[SSTACK]==========\n") - - @retry(wait=wait_fixed(1)) - def worker(s: str): # 多线程翻译 - if not s.strip() or re.match(r"^\{v\d+\}$", s): # 空白和公式不翻译 - return s - try: - new = self.translator.translate(s) - return new - except BaseException as e: - if log.isEnabledFor(logging.DEBUG): - log.exception(e) - else: - log.exception(e, exc_info=False) - raise e - with concurrent.futures.ThreadPoolExecutor( - max_workers=self.thread - ) as executor: - news = list(executor.map(worker, sstk)) - - ############################################################ - # C. 
新文档排版 - def raw_string(fcur: str, cstk: str): # 编码字符串 - if fcur == self.noto_name: - return "".join(["%04x" % self.noto.has_glyph(ord(c)) for c in cstk]) - elif isinstance(self.fontmap[fcur], PDFCIDFont): # 判断编码长度 - return "".join(["%04x" % ord(c) for c in cstk]) - else: - return "".join(["%02x" % ord(c) for c in cstk]) - - # 根据目标语言获取默认行距 - LANG_LINEHEIGHT_MAP = { - "zh-cn": 1.4, "zh-tw": 1.4, "zh-hans": 1.4, "zh-hant": 1.4, "zh": 1.4, - "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8 - } - default_line_height = LANG_LINEHEIGHT_MAP.get(self.translator.lang_out.lower(), 1.1) # 小语种默认1.1 - _x, _y = 0, 0 - ops_list = [] - - def gen_op_txt(font, size, x, y, rtxt): - return f"/{font} {size:f} Tf 1 0 0 1 {x:f} {y:f} Tm [<{rtxt}>] TJ " - - def gen_op_line(x, y, xlen, ylen, linewidth): - return f"ET q 1 0 0 1 {x:f} {y:f} cm [] 0 d 0 J {linewidth:f} w 0 0 m {xlen:f} {ylen:f} l S Q BT " - - for id, new in enumerate(news): - x: float = pstk[id].x # 段落初始横坐标 - y: float = pstk[id].y # 段落初始纵坐标 - x0: float = pstk[id].x0 # 段落左边界 - x1: float = pstk[id].x1 # 段落右边界 - height: float = pstk[id].y1 - pstk[id].y0 # 段落高度 - size: float = pstk[id].size # 段落字体大小 - brk: bool = pstk[id].brk # 段落换行标记 - cstk: str = "" # 当前文字栈 - fcur: str = None # 当前字体 ID - lidx = 0 # 记录换行次数 - tx = x - fcur_ = fcur - ptr = 0 - log.debug(f"< {y} {x} {x0} {x1} {size} {brk} > {sstk[id]} | {new}") - - ops_vals: list[dict] = [] - - while ptr < len(new): - vy_regex = re.match( - r"\{\s*v([\d\s]+)\}", new[ptr:], re.IGNORECASE - ) # 匹配 {vn} 公式标记 - mod = 0 # 文字修饰符 - if vy_regex: # 加载公式 - ptr += len(vy_regex.group(0)) - try: - vid = int(vy_regex.group(1).replace(" ", "")) - adv = vlen[vid] - except Exception: - continue # 翻译器可能会自动补个越界的公式标记 - if var[vid][-1].get_text() and unicodedata.category(var[vid][-1].get_text()[0]) in ["Lm", "Mn", "Sk"]: # 文字修饰符 - mod = var[vid][-1].width - else: # 加载文字 - ch = new[ptr] - fcur_ = None - try: - if fcur_ is None and self.fontmap["tiro"].to_unichr(ord(ch)) == ch: 
- fcur_ = "tiro" # 默认拉丁字体 - except Exception: - pass - if fcur_ is None: - fcur_ = self.noto_name # 默认非拉丁字体 - if fcur_ == self.noto_name: # FIXME: change to CONST - adv = self.noto.char_lengths(ch, size)[0] - else: - adv = self.fontmap[fcur_].char_width(ord(ch)) * size - ptr += 1 - if ( # 输出文字缓冲区 - fcur_ != fcur # 1. 字体更新 - or vy_regex # 2. 插入公式 - or x + adv > x1 + 0.1 * size # 3. 到达右边界(可能一整行都被符号化,这里需要考虑浮点误差) - ): - if cstk: - ops_vals.append({ - "type": OpType.TEXT, - "font": fcur, - "size": size, - "x": tx, - "dy": 0, - "rtxt": raw_string(fcur, cstk), - "lidx": lidx - }) - cstk = "" - if brk and x + adv > x1 + 0.1 * size: # 到达右边界且原文段落存在换行 - x = x0 - lidx += 1 - if vy_regex: # 插入公式 - fix = 0 - if fcur is not None: # 段落内公式修正纵向偏移 - fix = varf[vid] - for vch in var[vid]: # 排版公式字符 - vc = chr(vch.cid) - ops_vals.append({ - "type": OpType.TEXT, - "font": self.fontid[vch.font], - "size": vch.size, - "x": x + vch.x0 - var[vid][0].x0, - "dy": fix + vch.y0 - var[vid][0].y0, - "rtxt": raw_string(self.fontid[vch.font], vc), - "lidx": lidx - }) - if log.isEnabledFor(logging.DEBUG): - lstk.append(LTLine(0.1, (_x, _y), (x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0))) - _x, _y = x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0 - for l in varl[vid]: # 排版公式线条 - if l.linewidth < 5: # hack 有的文档会用粗线条当图片背景 - ops_vals.append({ - "type": OpType.LINE, - "x": l.pts[0][0] + x - var[vid][0].x0, - "dy": l.pts[0][1] + fix - var[vid][0].y0, - "linewidth": l.linewidth, - "xlen": l.pts[1][0] - l.pts[0][0], - "ylen": l.pts[1][1] - l.pts[0][1], - "lidx": lidx - }) - else: # 插入文字缓冲区 - if not cstk: # 单行开头 - tx = x - if x == x0 and ch == " ": # 消除段落换行空格 - adv = 0 - else: - cstk += ch - else: - cstk += ch - adv -= mod # 文字修饰符 - fcur = fcur_ - x += adv - if log.isEnabledFor(logging.DEBUG): - lstk.append(LTLine(0.1, (_x, _y), (x, y))) - _x, _y = x, y - # 处理结尾 - if cstk: - ops_vals.append({ - "type": OpType.TEXT, - "font": fcur, - "size": size, - "x": tx, - "dy": 0, - 
"rtxt": raw_string(fcur, cstk), - "lidx": lidx - }) - - line_height = default_line_height - - while (lidx + 1) * size * line_height > height and line_height >= 1: - line_height -= 0.05 - - for vals in ops_vals: - if vals["type"] == OpType.TEXT: - ops_list.append(gen_op_txt(vals["font"], vals["size"], vals["x"], vals["dy"] + y - vals["lidx"] * size * line_height, vals["rtxt"])) - elif vals["type"] == OpType.LINE: - ops_list.append(gen_op_line(vals["x"], vals["dy"] + y - vals["lidx"] * size * line_height, vals["xlen"], vals["ylen"], vals["linewidth"])) - - for l in lstk: # 排版全局线条 - if l.linewidth < 5: # hack 有的文档会用粗线条当图片背景 - ops_list.append(gen_op_line(l.pts[0][0], l.pts[0][1], l.pts[1][0] - l.pts[0][0], l.pts[1][1] - l.pts[0][1], l.linewidth)) - - ops = f"BT {''.join(ops_list)}ET " - return ops - - -class OpType(Enum): - TEXT = "text" - LINE = "line" +from typing import Dict, List +from enum import Enum + +from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager +from pdfminer.pdffont import PDFCIDFont +from pdfminer.converter import PDFConverter +from pdfminer.pdffont import PDFUnicodeNotDefined +from pdfminer.utils import apply_matrix_pt, mult_matrix +from pdfminer.layout import ( + LTChar, + LTFigure, + LTLine, + LTPage, +) +import logging +import re +import concurrent.futures +import numpy as np +import unicodedata +from tenacity import retry, wait_fixed +from pdf2zh.translator import ( + AzureOpenAITranslator, + BaseTranslator, + GoogleTranslator, + BingTranslator, + DeepLTranslator, + DeepLXTranslator, + OllamaTranslator, + OpenAITranslator, + ZhipuTranslator, + ModelScopeTranslator, + SiliconTranslator, + GeminiTranslator, + AzureTranslator, + TencentTranslator, + DifyTranslator, + AnythingLLMTranslator, + XinferenceTranslator, + ArgosTranslator, + GorkTranslator, + GroqTranslator, + DeepseekTranslator, + OpenAIlikedTranslator, +) +from pymupdf import Font + +log = logging.getLogger(__name__) + + +class PDFConverterEx(PDFConverter): + def __init__( 
+ self, + rsrcmgr: PDFResourceManager, + ) -> None: + PDFConverter.__init__(self, rsrcmgr, None, "utf-8", 1, None) + + def begin_page(self, page, ctm) -> None: + # 重载替换 cropbox + (x0, y0, x1, y1) = page.cropbox + (x0, y0) = apply_matrix_pt(ctm, (x0, y0)) + (x1, y1) = apply_matrix_pt(ctm, (x1, y1)) + mediabox = (0, 0, abs(x0 - x1), abs(y0 - y1)) + self.cur_item = LTPage(page.pageno, mediabox) + + def end_page(self, page): + # 重载返回指令流 + return self.receive_layout(self.cur_item) + + def begin_figure(self, name, bbox, matrix) -> None: + # 重载设置 pageid + self._stack.append(self.cur_item) + self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm)) + self.cur_item.pageid = self._stack[-1].pageid + + def end_figure(self, _: str) -> None: + # 重载返回指令流 + fig = self.cur_item + assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item)) + self.cur_item = self._stack.pop() + self.cur_item.add(fig) + return self.receive_layout(fig) + + def render_char( + self, + matrix, + font, + fontsize: float, + scaling: float, + rise: float, + cid: int, + ncs, + graphicstate: PDFGraphicState, + ) -> float: + # 重载设置 cid 和 font + try: + text = font.to_unichr(cid) + assert isinstance(text, str), str(type(text)) + except PDFUnicodeNotDefined: + text = self.handle_undefined_char(font, cid) + textwidth = font.char_width(cid) + textdisp = font.char_disp(cid) + item = LTChar( + matrix, + font, + fontsize, + scaling, + rise, + text, + textwidth, + textdisp, + ncs, + graphicstate, + ) + self.cur_item.add(item) + item.cid = cid # hack 插入原字符编码 + item.font = font # hack 插入原字符字体 + return item.adv + + +class Paragraph: + def __init__(self, y, x, x0, x1, y0, y1, size, brk): + self.y: float = y # 初始纵坐标 + self.x: float = x # 初始横坐标 + self.x0: float = x0 # 左边界 + self.x1: float = x1 # 右边界 + self.y0: float = y0 # 上边界 + self.y1: float = y1 # 下边界 + self.size: float = size # 字体大小 + self.brk: bool = brk # 换行标记 + + +# fmt: off +class TranslateConverter(PDFConverterEx): + def __init__( + self, + 
rsrcmgr, + vfont: str = None, + vchar: str = None, + thread: int = 0, + layout={}, + lang_in: str = "", + lang_out: str = "", + service: str = "", + noto_name: str = "", + noto: Font = None, + envs: Dict = None, + prompt: List = None, + ) -> None: + super().__init__(rsrcmgr) + self.vfont = vfont + self.vchar = vchar + self.thread = thread + self.layout = layout + self.noto_name = noto_name + self.noto = noto + self.translator: BaseTranslator = None + param = service.split(":", 1) + service_name = param[0] + service_model = param[1] if len(param) > 1 else None + if not envs: + envs = {} + if not prompt: + prompt = [] + for translator in [GoogleTranslator, BingTranslator, DeepLTranslator, DeepLXTranslator, OllamaTranslator, XinferenceTranslator, AzureOpenAITranslator, + OpenAITranslator, ZhipuTranslator, ModelScopeTranslator, SiliconTranslator, GeminiTranslator, AzureTranslator, TencentTranslator, DifyTranslator, AnythingLLMTranslator, ArgosTranslator, GorkTranslator, GroqTranslator, DeepseekTranslator, OpenAIlikedTranslator,]: + if service_name == translator.name: + self.translator = translator(lang_in, lang_out, service_model, envs=envs, prompt=prompt) + if not self.translator: + raise ValueError("Unsupported translation service") + + def receive_layout(self, ltpage: LTPage): + # 段落 + sstk: list[str] = [] # 段落文字栈 + pstk: list[Paragraph] = [] # 段落属性栈 + vbkt: int = 0 # 段落公式括号计数 + # 公式组 + vstk: list[LTChar] = [] # 公式符号组 + vlstk: list[LTLine] = [] # 公式线条组 + vfix: float = 0 # 公式纵向偏移 + # 公式组栈 + var: list[list[LTChar]] = [] # 公式符号组栈 + varl: list[list[LTLine]] = [] # 公式线条组栈 + varf: list[float] = [] # 公式纵向偏移栈 + vlen: list[float] = [] # 公式宽度栈 + # 全局 + lstk: list[LTLine] = [] # 全局线条栈 + xt: LTChar = None # 上一个字符 + xt_cls: int = -1 # 上一个字符所属段落,保证无论第一个字符属于哪个类别都可以触发新段落 + vmax: float = ltpage.width / 4 # 行内公式最大宽度 + ops: str = "" # 渲染结果 + + def vflag(font: str, char: str): # 匹配公式(和角标)字体 + if isinstance(font, bytes): # 不一定能 decode,直接转 str + try: + font = font.decode('utf-8') # 尝试使用 
UTF-8 解码 + except UnicodeDecodeError: + font = "" + font = font.split("+")[-1] # 字体名截断 + if re.match(r"\(cid:", char): + return True + # 基于字体名规则的判定 + if self.vfont: + if re.match(self.vfont, font): + return True + else: + if re.match( # latex 字体 + r"(CM[^R]|MS.M|XY|MT|BL|RM|EU|LA|RS|LINE|LCIRCLE|TeX-|rsfs|txsy|wasy|stmary|.*Mono|.*Code|.*Ital|.*Sym|.*Math)", + font, + ): + return True + # 基于字符集规则的判定 + if self.vchar: + if re.match(self.vchar, char): + return True + else: + if ( + char + and char != " " # 非空格 + and ( + unicodedata.category(char[0]) + in ["Lm", "Mn", "Sk", "Sm", "Zl", "Zp", "Zs"] # 文字修饰符、数学符号、分隔符号 + or ord(char[0]) in range(0x370, 0x400) # 希腊字母 + ) + ): + return True + return False + + ############################################################ + # A. 原文档解析 + for child in ltpage: + if isinstance(child, LTChar): + cur_v = False + layout = self.layout[ltpage.pageid] + # ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape + h, w = layout.shape + # 读取当前字符在 layout 中的类别 + cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1) + cls = layout[cy, cx] + # 锚定文档中 bullet 的位置 + if child.get_text() == "•": + cls = 0 + # 判定当前字符是否属于公式 + if ( # 判定当前字符是否属于公式 + cls == 0 # 1. 类别为保留区域 + or (cls == xt_cls and len(sstk[-1].strip()) > 1 and child.size < pstk[-1].size * 0.79) # 2. 角标字体,有 0.76 的角标和 0.799 的大写,这里用 0.79 取中,同时考虑首字母放大的情况 + or vflag(child.fontname, child.get_text()) # 3. 公式字体 + or (child.matrix[0] == 0 and child.matrix[3] == 0) # 4. 垂直字体 + ): + cur_v = True + # 判定括号组是否属于公式 + if not cur_v: + if vstk and child.get_text() == "(": + cur_v = True + vbkt += 1 + if vbkt and child.get_text() == ")": + cur_v = True + vbkt -= 1 + if ( # 判定当前公式是否结束 + not cur_v # 1. 当前字符不属于公式 + or cls != xt_cls # 2. 当前字符与前一个字符不属于同一段落 + # or (abs(child.x0 - xt.x0) > vmax and cls != 0) # 3. 段落内换行,可能是一长串斜体的段落,也可能是段内分式换行,这里设个阈值进行区分 + # 禁止纯公式(代码)段落换行,直到文字开始再重开文字段落,保证只存在两种情况 + # A. 纯公式(代码)段落(锚定绝对位置)sstk[-1]=="" -> sstk[-1]=="{v*}" + # B. 
文字开头段落(排版相对位置)sstk[-1]!="" + or (sstk[-1] != "" and abs(child.x0 - xt.x0) > vmax) # 因为 cls==xt_cls==0 一定有 sstk[-1]=="",所以这里不需要再判定 cls!=0 + ): + if vstk: + if ( # 根据公式右侧的文字修正公式的纵向偏移 + not cur_v # 1. 当前字符不属于公式 + and cls == xt_cls # 2. 当前字符与前一个字符属于同一段落 + and child.x0 > max([vch.x0 for vch in vstk]) # 3. 当前字符在公式右侧 + ): + vfix = vstk[0].y0 - child.y0 + if sstk[-1] == "": + xt_cls = -1 # 禁止纯公式段落(sstk[-1]=="{v*}")的后续连接,但是要考虑新字符和后续字符的连接,所以这里修改的是上个字符的类别 + sstk[-1] += f"{{v{len(var)}}}" + var.append(vstk) + varl.append(vlstk) + varf.append(vfix) + vstk = [] + vlstk = [] + vfix = 0 + # 当前字符不属于公式或当前字符是公式的第一个字符 + if not vstk: + if cls == xt_cls: # 当前字符与前一个字符属于同一段落 + if child.x0 > xt.x1 + 1: # 添加行内空格 + sstk[-1] += " " + elif child.x1 < xt.x0: # 添加换行空格并标记原文段落存在换行 + sstk[-1] += " " + pstk[-1].brk = True + else: # 根据当前字符构建一个新的段落 + sstk.append("") + pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.y0, child.y1, child.size, False)) + if not cur_v: # 文字入栈 + if ( # 根据当前字符修正段落属性 + child.size > pstk[-1].size # 1. 当前字符比段落字体大 + or len(sstk[-1].strip()) == 1 # 2. 当前字符为段落第二个文字(考虑首字母放大的情况) + ) and child.get_text() != " ": # 3. 当前字符不是空格 + pstk[-1].y -= child.size - pstk[-1].size # 修正段落初始纵坐标,假设两个不同大小字符的上边界对齐 + pstk[-1].size = child.size + sstk[-1] += child.get_text() + else: # 公式入栈 + if ( # 根据公式左侧的文字修正公式的纵向偏移 + not vstk # 1. 当前字符是公式的第一个字符 + and cls == xt_cls # 2. 当前字符与前一个字符属于同一段落 + and child.x0 > xt.x0 # 3. 
前一个字符在公式左侧 + ): + vfix = child.y0 - xt.y0 + vstk.append(child) + # 更新段落边界,因为段落内换行之后可能是公式开头,所以要在外边处理 + pstk[-1].x0 = min(pstk[-1].x0, child.x0) + pstk[-1].x1 = max(pstk[-1].x1, child.x1) + pstk[-1].y0 = min(pstk[-1].y0, child.y0) + pstk[-1].y1 = max(pstk[-1].y1, child.y1) + # 更新上一个字符 + xt = child + xt_cls = cls + elif isinstance(child, LTFigure): # 图表 + pass + elif isinstance(child, LTLine): # 线条 + layout = self.layout[ltpage.pageid] + # ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape + h, w = layout.shape + # 读取当前线条在 layout 中的类别 + cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1) + cls = layout[cy, cx] + if vstk and cls == xt_cls: # 公式线条 + vlstk.append(child) + else: # 全局线条 + lstk.append(child) + else: + pass + # 处理结尾 + if vstk: # 公式出栈 + sstk[-1] += f"{{v{len(var)}}}" + var.append(vstk) + varl.append(vlstk) + varf.append(vfix) + log.debug("\n==========[VSTACK]==========\n") + for id, v in enumerate(var): # 计算公式宽度 + l = max([vch.x1 for vch in v]) - v[0].x0 + log.debug(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > v{id} = {"".join([ch.get_text() for ch in v])}') + vlen.append(l) + + ############################################################ + # B. 段落翻译 + log.debug("\n==========[SSTACK]==========\n") + + @retry(wait=wait_fixed(1)) + def worker(s: str): # 多线程翻译 + if not s.strip() or re.match(r"^\{v\d+\}$", s): # 空白和公式不翻译 + return s + try: + new = self.translator.translate(s) + return new + except BaseException as e: + if log.isEnabledFor(logging.DEBUG): + log.exception(e) + else: + log.exception(e, exc_info=False) + raise e + with concurrent.futures.ThreadPoolExecutor( + max_workers=self.thread + ) as executor: + news = list(executor.map(worker, sstk)) + + ############################################################ + # C. 
新文档排版 + def raw_string(fcur: str, cstk: str): # 编码字符串 + if fcur == self.noto_name: + return "".join(["%04x" % self.noto.has_glyph(ord(c)) for c in cstk]) + elif isinstance(self.fontmap[fcur], PDFCIDFont): # 判断编码长度 + return "".join(["%04x" % ord(c) for c in cstk]) + else: + return "".join(["%02x" % ord(c) for c in cstk]) + + # 根据目标语言获取默认行距 + LANG_LINEHEIGHT_MAP = { + "zh-cn": 1.4, "zh-tw": 1.4, "zh-hans": 1.4, "zh-hant": 1.4, "zh": 1.4, + "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8 + } + default_line_height = LANG_LINEHEIGHT_MAP.get(self.translator.lang_out.lower(), 1.1) # 小语种默认1.1 + _x, _y = 0, 0 + ops_list = [] + + def gen_op_txt(font, size, x, y, rtxt): + return f"/{font} {size:f} Tf 1 0 0 1 {x:f} {y:f} Tm [<{rtxt}>] TJ " + + def gen_op_line(x, y, xlen, ylen, linewidth): + return f"ET q 1 0 0 1 {x:f} {y:f} cm [] 0 d 0 J {linewidth:f} w 0 0 m {xlen:f} {ylen:f} l S Q BT " + + for id, new in enumerate(news): + x: float = pstk[id].x # 段落初始横坐标 + y: float = pstk[id].y # 段落初始纵坐标 + x0: float = pstk[id].x0 # 段落左边界 + x1: float = pstk[id].x1 # 段落右边界 + height: float = pstk[id].y1 - pstk[id].y0 # 段落高度 + size: float = pstk[id].size # 段落字体大小 + brk: bool = pstk[id].brk # 段落换行标记 + cstk: str = "" # 当前文字栈 + fcur: str = None # 当前字体 ID + lidx = 0 # 记录换行次数 + tx = x + fcur_ = fcur + ptr = 0 + log.debug(f"< {y} {x} {x0} {x1} {size} {brk} > {sstk[id]} | {new}") + + ops_vals: list[dict] = [] + + while ptr < len(new): + vy_regex = re.match( + r"\{\s*v([\d\s]+)\}", new[ptr:], re.IGNORECASE + ) # 匹配 {vn} 公式标记 + mod = 0 # 文字修饰符 + if vy_regex: # 加载公式 + ptr += len(vy_regex.group(0)) + try: + vid = int(vy_regex.group(1).replace(" ", "")) + adv = vlen[vid] + except Exception: + continue # 翻译器可能会自动补个越界的公式标记 + if var[vid][-1].get_text() and unicodedata.category(var[vid][-1].get_text()[0]) in ["Lm", "Mn", "Sk"]: # 文字修饰符 + mod = var[vid][-1].width + else: # 加载文字 + ch = new[ptr] + fcur_ = None + try: + if fcur_ is None and self.fontmap["tiro"].to_unichr(ord(ch)) == ch: 
+ fcur_ = "tiro" # 默认拉丁字体 + except Exception: + pass + if fcur_ is None: + fcur_ = self.noto_name # 默认非拉丁字体 + if fcur_ == self.noto_name: # FIXME: change to CONST + adv = self.noto.char_lengths(ch, size)[0] + else: + adv = self.fontmap[fcur_].char_width(ord(ch)) * size + ptr += 1 + if ( # 输出文字缓冲区 + fcur_ != fcur # 1. 字体更新 + or vy_regex # 2. 插入公式 + or x + adv > x1 + 0.1 * size # 3. 到达右边界(可能一整行都被符号化,这里需要考虑浮点误差) + ): + if cstk: + ops_vals.append({ + "type": OpType.TEXT, + "font": fcur, + "size": size, + "x": tx, + "dy": 0, + "rtxt": raw_string(fcur, cstk), + "lidx": lidx + }) + cstk = "" + if brk and x + adv > x1 + 0.1 * size: # 到达右边界且原文段落存在换行 + x = x0 + lidx += 1 + if vy_regex: # 插入公式 + fix = 0 + if fcur is not None: # 段落内公式修正纵向偏移 + fix = varf[vid] + for vch in var[vid]: # 排版公式字符 + vc = chr(vch.cid) + ops_vals.append({ + "type": OpType.TEXT, + "font": self.fontid[vch.font], + "size": vch.size, + "x": x + vch.x0 - var[vid][0].x0, + "dy": fix + vch.y0 - var[vid][0].y0, + "rtxt": raw_string(self.fontid[vch.font], vc), + "lidx": lidx + }) + if log.isEnabledFor(logging.DEBUG): + lstk.append(LTLine(0.1, (_x, _y), (x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0))) + _x, _y = x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0 + for l in varl[vid]: # 排版公式线条 + if l.linewidth < 5: # hack 有的文档会用粗线条当图片背景 + ops_vals.append({ + "type": OpType.LINE, + "x": l.pts[0][0] + x - var[vid][0].x0, + "dy": l.pts[0][1] + fix - var[vid][0].y0, + "linewidth": l.linewidth, + "xlen": l.pts[1][0] - l.pts[0][0], + "ylen": l.pts[1][1] - l.pts[0][1], + "lidx": lidx + }) + else: # 插入文字缓冲区 + if not cstk: # 单行开头 + tx = x + if x == x0 and ch == " ": # 消除段落换行空格 + adv = 0 + else: + cstk += ch + else: + cstk += ch + adv -= mod # 文字修饰符 + fcur = fcur_ + x += adv + if log.isEnabledFor(logging.DEBUG): + lstk.append(LTLine(0.1, (_x, _y), (x, y))) + _x, _y = x, y + # 处理结尾 + if cstk: + ops_vals.append({ + "type": OpType.TEXT, + "font": fcur, + "size": size, + "x": tx, + "dy": 0, + 
"rtxt": raw_string(fcur, cstk), + "lidx": lidx + }) + + line_height = default_line_height + + while (lidx + 1) * size * line_height > height and line_height >= 1: + line_height -= 0.05 + + for vals in ops_vals: + if vals["type"] == OpType.TEXT: + ops_list.append(gen_op_txt(vals["font"], vals["size"], vals["x"], vals["dy"] + y - vals["lidx"] * size * line_height, vals["rtxt"])) + elif vals["type"] == OpType.LINE: + ops_list.append(gen_op_line(vals["x"], vals["dy"] + y - vals["lidx"] * size * line_height, vals["xlen"], vals["ylen"], vals["linewidth"])) + + for l in lstk: # 排版全局线条 + if l.linewidth < 5: # hack 有的文档会用粗线条当图片背景 + ops_list.append(gen_op_line(l.pts[0][0], l.pts[0][1], l.pts[1][0] - l.pts[0][0], l.pts[1][1] - l.pts[0][1], l.linewidth)) + + ops = f"BT {''.join(ops_list)}ET " + return ops + + +class OpType(Enum): + TEXT = "text" + LINE = "line" diff --git a/pdf2zh/doclayout.py b/pdf2zh/doclayout.py index 047773b8..feb6cafd 100644 --- a/pdf2zh/doclayout.py +++ b/pdf2zh/doclayout.py @@ -1,181 +1,181 @@ -import abc -import os.path - -import cv2 -import numpy as np -import ast -import onnx -import onnxruntime -from huggingface_hub import hf_hub_download - -from pdf2zh.config import ConfigManager - - -class DocLayoutModel(abc.ABC): - @staticmethod - def load_onnx(): - model = OnnxModel.from_pretrained( - repo_id="wybxc/DocLayout-YOLO-DocStructBench-onnx", - filename="doclayout_yolo_docstructbench_imgsz1024.onnx", - ) - return model - - @staticmethod - def load_available(): - return DocLayoutModel.load_onnx() - - @property - @abc.abstractmethod - def stride(self) -> int: - """Stride of the model input.""" - pass - - @abc.abstractmethod - def predict(self, image, imgsz=1024, **kwargs) -> list: - """ - Predict the layout of a document page. - - Args: - image: The image of the document page. - imgsz: Resize the image to this size. Must be a multiple of the stride. - **kwargs: Additional arguments. 
- """ - pass - - -class YoloResult: - """Helper class to store detection results from ONNX model.""" - - def __init__(self, boxes, names): - self.boxes = [YoloBox(data=d) for d in boxes] - self.boxes.sort(key=lambda x: x.conf, reverse=True) - self.names = names - - -class YoloBox: - """Helper class to store detection results from ONNX model.""" - - def __init__(self, data): - self.xyxy = data[:4] - self.conf = data[-2] - self.cls = data[-1] - - -class OnnxModel(DocLayoutModel): - def __init__(self, model_path: str): - self.model_path = model_path - - model = onnx.load(model_path) - metadata = {d.key: d.value for d in model.metadata_props} - self._stride = ast.literal_eval(metadata["stride"]) - self._names = ast.literal_eval(metadata["names"]) - - self.model = onnxruntime.InferenceSession(model.SerializeToString()) - - @staticmethod - def from_pretrained(repo_id: str, filename: str): - if ConfigManager.get("USE_MODELSCOPE", "0") == "1": - repo_mapping = { - # Edit here to add more models - "wybxc/DocLayout-YOLO-DocStructBench-onnx": "AI-ModelScope/DocLayout-YOLO-DocStructBench-onnx" - } - from modelscope import snapshot_download - - model_dir = snapshot_download(repo_mapping[repo_id]) - pth = os.path.join(model_dir, filename) - else: - pth = hf_hub_download(repo_id=repo_id, filename=filename, etag_timeout=1) - return OnnxModel(pth) - - @property - def stride(self): - return self._stride - - def resize_and_pad_image(self, image, new_shape): - """ - Resize and pad the image to the specified size, ensuring dimensions are multiples of stride. 
- - Parameters: - - image: Input image - - new_shape: Target size (integer or (height, width) tuple) - - stride: Padding alignment stride, default 32 - - Returns: - - Processed image - """ - if isinstance(new_shape, int): - new_shape = (new_shape, new_shape) - - h, w = image.shape[:2] - new_h, new_w = new_shape - - # Calculate scaling ratio - r = min(new_h / h, new_w / w) - resized_h, resized_w = int(round(h * r)), int(round(w * r)) - - # Resize image - image = cv2.resize( - image, (resized_w, resized_h), interpolation=cv2.INTER_LINEAR - ) - - # Calculate padding size and align to stride multiple - pad_w = (new_w - resized_w) % self.stride - pad_h = (new_h - resized_h) % self.stride - top, bottom = pad_h // 2, pad_h - pad_h // 2 - left, right = pad_w // 2, pad_w - pad_w // 2 - - # Add padding - image = cv2.copyMakeBorder( - image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114) - ) - - return image - - def scale_boxes(self, img1_shape, boxes, img0_shape): - """ - Rescales bounding boxes (in the format of xyxy by default) from the shape of the image they were originally - specified in (img1_shape) to the shape of a different image (img0_shape). - - Args: - img1_shape (tuple): The shape of the image that the bounding boxes are for, - in the format of (height, width). - boxes (torch.Tensor): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2) - img0_shape (tuple): the shape of the target image, in the format of (height, width). 
- - Returns: - boxes (torch.Tensor): The scaled bounding boxes, in the format of (x1, y1, x2, y2) - """ - - # Calculate scaling ratio - gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]) - - # Calculate padding size - pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1) - pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1) - - # Remove padding and scale boxes - boxes[..., :4] = (boxes[..., :4] - [pad_x, pad_y, pad_x, pad_y]) / gain - return boxes - - def predict(self, image, imgsz=1024, **kwargs): - # Preprocess input image - orig_h, orig_w = image.shape[:2] - pix = self.resize_and_pad_image(image, new_shape=imgsz) - pix = np.transpose(pix, (2, 0, 1)) # CHW - pix = np.expand_dims(pix, axis=0) # BCHW - pix = pix.astype(np.float32) / 255.0 # Normalize to [0, 1] - new_h, new_w = pix.shape[2:] - - # Run inference - preds = self.model.run(None, {"images": pix})[0] - - # Postprocess predictions - preds = preds[preds[..., 4] > 0.25] - preds[..., :4] = self.scale_boxes( - (new_h, new_w), preds[..., :4], (orig_h, orig_w) - ) - return [YoloResult(boxes=preds, names=self._names)] - - -class ModelInstance: - value: OnnxModel = None +import abc +import os.path + +import cv2 +import numpy as np +import ast +import onnx +import onnxruntime +from huggingface_hub import hf_hub_download + +from pdf2zh.config import ConfigManager + + +class DocLayoutModel(abc.ABC): + @staticmethod + def load_onnx(): + model = OnnxModel.from_pretrained( + repo_id="wybxc/DocLayout-YOLO-DocStructBench-onnx", + filename="doclayout_yolo_docstructbench_imgsz1024.onnx", + ) + return model + + @staticmethod + def load_available(): + return DocLayoutModel.load_onnx() + + @property + @abc.abstractmethod + def stride(self) -> int: + """Stride of the model input.""" + pass + + @abc.abstractmethod + def predict(self, image, imgsz=1024, **kwargs) -> list: + """ + Predict the layout of a document page. + + Args: + image: The image of the document page. 
+ imgsz: Resize the image to this size. Must be a multiple of the stride. + **kwargs: Additional arguments. + """ + pass + + +class YoloResult: + """Helper class to store detection results from ONNX model.""" + + def __init__(self, boxes, names): + self.boxes = [YoloBox(data=d) for d in boxes] + self.boxes.sort(key=lambda x: x.conf, reverse=True) + self.names = names + + +class YoloBox: + """Helper class to store detection results from ONNX model.""" + + def __init__(self, data): + self.xyxy = data[:4] + self.conf = data[-2] + self.cls = data[-1] + + +class OnnxModel(DocLayoutModel): + def __init__(self, model_path: str): + self.model_path = model_path + + model = onnx.load(model_path) + metadata = {d.key: d.value for d in model.metadata_props} + self._stride = ast.literal_eval(metadata["stride"]) + self._names = ast.literal_eval(metadata["names"]) + + self.model = onnxruntime.InferenceSession(model.SerializeToString()) + + @staticmethod + def from_pretrained(repo_id: str, filename: str): + if ConfigManager.get("USE_MODELSCOPE", "0") == "1": + repo_mapping = { + # Edit here to add more models + "wybxc/DocLayout-YOLO-DocStructBench-onnx": "AI-ModelScope/DocLayout-YOLO-DocStructBench-onnx" + } + from modelscope import snapshot_download + + model_dir = snapshot_download(repo_mapping[repo_id]) + pth = os.path.join(model_dir, filename) + else: + pth = hf_hub_download(repo_id=repo_id, filename=filename, etag_timeout=1) + return OnnxModel(pth) + + @property + def stride(self): + return self._stride + + def resize_and_pad_image(self, image, new_shape): + """ + Resize and pad the image to the specified size, ensuring dimensions are multiples of stride. 
+ + Parameters: + - image: Input image + - new_shape: Target size (integer or (height, width) tuple) + - stride: Padding alignment stride, default 32 + + Returns: + - Processed image + """ + if isinstance(new_shape, int): + new_shape = (new_shape, new_shape) + + h, w = image.shape[:2] + new_h, new_w = new_shape + + # Calculate scaling ratio + r = min(new_h / h, new_w / w) + resized_h, resized_w = int(round(h * r)), int(round(w * r)) + + # Resize image + image = cv2.resize( + image, (resized_w, resized_h), interpolation=cv2.INTER_LINEAR + ) + + # Calculate padding size and align to stride multiple + pad_w = (new_w - resized_w) % self.stride + pad_h = (new_h - resized_h) % self.stride + top, bottom = pad_h // 2, pad_h - pad_h // 2 + left, right = pad_w // 2, pad_w - pad_w // 2 + + # Add padding + image = cv2.copyMakeBorder( + image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114) + ) + + return image + + def scale_boxes(self, img1_shape, boxes, img0_shape): + """ + Rescales bounding boxes (in the format of xyxy by default) from the shape of the image they were originally + specified in (img1_shape) to the shape of a different image (img0_shape). + + Args: + img1_shape (tuple): The shape of the image that the bounding boxes are for, + in the format of (height, width). + boxes (torch.Tensor): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2) + img0_shape (tuple): the shape of the target image, in the format of (height, width). 
+ + Returns: + boxes (torch.Tensor): The scaled bounding boxes, in the format of (x1, y1, x2, y2) + """ + + # Calculate scaling ratio + gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]) + + # Calculate padding size + pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1) + pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1) + + # Remove padding and scale boxes + boxes[..., :4] = (boxes[..., :4] - [pad_x, pad_y, pad_x, pad_y]) / gain + return boxes + + def predict(self, image, imgsz=1024, **kwargs): + # Preprocess input image + orig_h, orig_w = image.shape[:2] + pix = self.resize_and_pad_image(image, new_shape=imgsz) + pix = np.transpose(pix, (2, 0, 1)) # CHW + pix = np.expand_dims(pix, axis=0) # BCHW + pix = pix.astype(np.float32) / 255.0 # Normalize to [0, 1] + new_h, new_w = pix.shape[2:] + + # Run inference + preds = self.model.run(None, {"images": pix})[0] + + # Postprocess predictions + preds = preds[preds[..., 4] > 0.25] + preds[..., :4] = self.scale_boxes( + (new_h, new_w), preds[..., :4], (orig_h, orig_w) + ) + return [YoloResult(boxes=preds, names=self._names)] + + +class ModelInstance: + value: OnnxModel = None diff --git a/pdf2zh/gui.py b/pdf2zh/gui.py index 6f31e997..24e24ac4 100644 --- a/pdf2zh/gui.py +++ b/pdf2zh/gui.py @@ -1,730 +1,694 @@ -import asyncio -import cgi -import os -import shutil -import uuid -from asyncio import CancelledError -from pathlib import Path - -import gradio as gr -import requests -import tqdm -from gradio_pdf import PDF -from string import Template - -from pdf2zh import __version__ -from pdf2zh.high_level import translate -from pdf2zh.doclayout import ModelInstance -from pdf2zh.config import ConfigManager -from pdf2zh.translator import ( - AnythingLLMTranslator, - AzureOpenAITranslator, - AzureTranslator, - BaseTranslator, - BingTranslator, - DeepLTranslator, - DeepLXTranslator, - DifyTranslator, - ArgosTranslator, - GeminiTranslator, - GoogleTranslator, - ModelScopeTranslator, - 
OllamaTranslator, - OpenAITranslator, - SiliconTranslator, - TencentTranslator, - XinferenceTranslator, - ZhipuTranslator, - GorkTranslator, - GroqTranslator, - DeepseekTranslator, - OpenAIlikedTranslator, -) - -# The following variables associate strings with translators -service_map: dict[str, BaseTranslator] = { - "Google": GoogleTranslator, - "Bing": BingTranslator, - "DeepL": DeepLTranslator, - "DeepLX": DeepLXTranslator, - "Ollama": OllamaTranslator, - "Xinference": XinferenceTranslator, - "AzureOpenAI": AzureOpenAITranslator, - "OpenAI": OpenAITranslator, - "Zhipu": ZhipuTranslator, - "ModelScope": ModelScopeTranslator, - "Silicon": SiliconTranslator, - "Gemini": GeminiTranslator, - "Azure": AzureTranslator, - "Tencent": TencentTranslator, - "Dify": DifyTranslator, - "AnythingLLM": AnythingLLMTranslator, - "Argos Translate": ArgosTranslator, - "Gork": GorkTranslator, - "Groq": GroqTranslator, - "DeepSeek": DeepseekTranslator, - "OpenAI-liked": OpenAIlikedTranslator, -} - -# The following variables associate strings with specific languages -lang_map = { - "Simplified Chinese": "zh", - "Traditional Chinese": "zh-TW", - "English": "en", - "French": "fr", - "German": "de", - "Japanese": "ja", - "Korean": "ko", - "Russian": "ru", - "Spanish": "es", - "Italian": "it", -} - -# The following variable associate strings with page ranges -page_map = { - "All": None, - "First": [0], - "First 5 pages": list(range(0, 5)), - "Others": None, -} - -# Check if this is a public demo, which has resource limits -flag_demo = False - -# Limit resources -if ConfigManager.get("PDF2ZH_DEMO"): - flag_demo = True - service_map = { - "Google": GoogleTranslator, - } - page_map = { - "First": [0], - "First 20 pages": list(range(0, 20)), - } - client_key = ConfigManager.get("PDF2ZH_CLIENT_KEY") - server_key = ConfigManager.get("PDF2ZH_SERVER_KEY") - - -# Public demo control -def verify_recaptcha(response): - """ - This function verifies the reCAPTCHA response. 
- """ - recaptcha_url = "https://www.google.com/recaptcha/api/siteverify" - print("reCAPTCHA", server_key, response) - data = {"secret": server_key, "response": response} - result = requests.post(recaptcha_url, data=data).json() - print("reCAPTCHA", result.get("success")) - return result.get("success") - - -def download_with_limit(url: str, save_path: str, size_limit: int) -> str: - """ - This function downloads a file from a URL and saves it to a specified path. - - Inputs: - - url: The URL to download the file from - - save_path: The path to save the file to - - size_limit: The maximum size of the file to download - - Returns: - - The path of the downloaded file - """ - chunk_size = 1024 - total_size = 0 - with requests.get(url, stream=True, timeout=10) as response: - response.raise_for_status() - content = response.headers.get("Content-Disposition") - try: # filename from header - _, params = cgi.parse_header(content) - filename = params["filename"] - except Exception: # filename from url - filename = os.path.basename(url) - with open(save_path / filename, "wb") as file: - for chunk in response.iter_content(chunk_size=chunk_size): - total_size += len(chunk) - if size_limit and total_size > size_limit: - raise gr.Error("Exceeds file size limit") - file.write(chunk) - return save_path / filename - - -def stop_translate_file(state: dict) -> None: - """ - This function stops the translation process. - - Inputs: - - state: The state of the translation process - - Returns:- None - """ - session_id = state["session_id"] - if session_id is None: - return - if session_id in cancellation_event_map: - cancellation_event_map[session_id].set() - - -def translate_file( - file_type, - file_input, - link_input, - service, - lang_from, - lang_to, - page_range, - page_input, - prompt, - threads, - recaptcha_response, - state, - progress=gr.Progress(), - *envs, -): - """ - This function translates a PDF file from one language to another. 
- - Inputs: - - file_type: The type of file to translate - - file_input: The file to translate - - link_input: The link to the file to translate - - service: The translation service to use - - lang_from: The language to translate from - - lang_to: The language to translate to - - page_range: The range of pages to translate - - page_input: The input for the page range - - prompt: The custom prompt for the llm - - threads: The number of threads to use - - recaptcha_response: The reCAPTCHA response - - state: The state of the translation process - - progress: The progress bar - - envs: The environment variables - - Returns: - - The translated file - - The translated file - - The translated file - - The progress bar - - The progress bar - - The progress bar - """ - session_id = uuid.uuid4() - state["session_id"] = session_id - cancellation_event_map[session_id] = asyncio.Event() - # Translate PDF content using selected service. - if flag_demo and not verify_recaptcha(recaptcha_response): - raise gr.Error("reCAPTCHA fail") - - progress(0, desc="Starting translation...") - - output = Path("pdf2zh_files") - output.mkdir(parents=True, exist_ok=True) - - if file_type == "File": - if not file_input: - raise gr.Error("No input") - file_path = shutil.copy(file_input, output) - else: - if not link_input: - raise gr.Error("No input") - file_path = download_with_limit( - link_input, - output, - 5 * 1024 * 1024 if flag_demo else None, - ) - - filename = os.path.splitext(os.path.basename(file_path))[0] - file_raw = output / f"{filename}.pdf" - file_mono = output / f"{filename}-mono.pdf" - file_dual = output / f"{filename}-dual.pdf" - - translator = service_map[service] - if page_range != "Others": - selected_page = page_map[page_range] - else: - selected_page = [] - for p in page_input.split(","): - if "-" in p: - start, end = p.split("-") - selected_page.extend(range(int(start) - 1, int(end))) - else: - selected_page.append(int(p) - 1) - lang_from = lang_map[lang_from] - lang_to = 
lang_map[lang_to] - - _envs = {} - for i, env in enumerate(translator.envs.items()): - _envs[env[0]] = envs[i] - - print(f"Files before translation: {os.listdir(output)}") - - def progress_bar(t: tqdm.tqdm): - progress(t.n / t.total, desc="Translating...") - - try: - threads = int(threads) - except ValueError: - threads = 1 - - param = { - "files": [str(file_raw)], - "pages": selected_page, - "lang_in": lang_from, - "lang_out": lang_to, - "service": f"{translator.name}", - "output": output, - "thread": int(threads), - "callback": progress_bar, - "cancellation_event": cancellation_event_map[session_id], - "envs": _envs, - "prompt": Template(prompt), - "model": ModelInstance.value, - } - try: - translate(**param) - except CancelledError: - del cancellation_event_map[session_id] - raise gr.Error("Translation cancelled") - print(f"Files after translation: {os.listdir(output)}") - - if not file_mono.exists() or not file_dual.exists(): - raise gr.Error("No output") - - progress(1.0, desc="Translation complete!") - - return ( - str(file_mono), - str(file_mono), - str(file_dual), - gr.update(visible=True), - gr.update(visible=True), - gr.update(visible=True), - ) - - -# Global setup -custom_blue = gr.themes.Color( - c50="#E8F3FF", - c100="#BEDAFF", - c200="#94BFFF", - c300="#6AA1FF", - c400="#4080FF", - c500="#165DFF", # Primary color - c600="#0E42D2", - c700="#0A2BA6", - c800="#061D79", - c900="#03114D", - c950="#020B33", -) - -custom_css = """ - .secondary-text {color: #999 !important;} - footer {visibility: hidden} - .env-warning {color: #dd5500 !important;} - .env-success {color: #559900 !important;} - - /* Add dashed border to input-file class */ - .input-file { - border: 1.2px dashed #165DFF !important; - border-radius: 6px !important; - } - - .progress-bar-wrap { - border-radius: 8px !important; - } - - .progress-bar { - border-radius: 8px !important; - } - - .pdf-canvas canvas { - width: 100%; - } - """ - -demo_recaptcha = """ - - - """ - -tech_details_string = 
f""" - Technical details - - GitHub: Byaidu/PDFMathTranslate
- - GUI by: Rongxin
- - Version: {__version__} - """ -cancellation_event_map = {} - - -# The following code creates the GUI -with gr.Blocks( - title="PDFMathTranslate - PDF Translation with preserved formats", - theme=gr.themes.Default( - primary_hue=custom_blue, spacing_size="md", radius_size="lg" - ), - css=custom_css, - head=demo_recaptcha if flag_demo else "", -) as demo: - gr.Markdown( - "# [PDFMathTranslate @ GitHub](https://github.com/Byaidu/PDFMathTranslate)" - ) - - with gr.Row(): - with gr.Column(scale=1): - gr.Markdown("## File | < 5 MB" if flag_demo else "## File") - file_type = gr.Radio( - choices=["File", "Link"], - label="Type", - value="File", - ) - file_input = gr.File( - label="File", - file_count="single", - file_types=[".pdf"], - type="filepath", - elem_classes=["input-file"], - ) - link_input = gr.Textbox( - label="Link", - visible=False, - interactive=True, - ) - gr.Markdown("## Option") - service = gr.Dropdown( - label="Service", - choices=service_map.keys(), - value="Google", - ) - envs = [] - for i in range(3): - envs.append( - gr.Textbox( - visible=False, - interactive=True, - ) - ) - with gr.Row(): - lang_from = gr.Dropdown( - label="Translate from", - choices=lang_map.keys(), - value=ConfigManager.get("PDF2ZH_LANG_FROM", "English"), - ) - lang_to = gr.Dropdown( - label="Translate to", - choices=lang_map.keys(), - value=ConfigManager.get("PDF2ZH_LANG_TO", "Simplified Chinese"), - ) - page_range = gr.Radio( - choices=page_map.keys(), - label="Pages", - value=list(page_map.keys())[0], - ) - - page_input = gr.Textbox( - label="Page range", - visible=False, - interactive=True, - ) - - with gr.Accordion("Open for More Experimental Options!", open=False): - gr.Markdown("#### Experimental") - threads = gr.Textbox( - label="number of threads", interactive=True, value="1" - ) - prompt = gr.Textbox( - label="Custom Prompt for llm", interactive=True, visible=False - ) - envs.append(prompt) - - def on_select_service(service, evt: gr.EventData): - translator = 
service_map[service] - _envs = [] - for i in range(4): - _envs.append(gr.update(visible=False, value="")) - for i, env in enumerate(translator.envs.items()): - _envs[i] = gr.update( - visible=True, - label=env[0], - value=ConfigManager.get_env_by_translatername( - translator, env[0], env[1] - ), - ) - _envs[-1] = gr.update(visible=translator.CustomPrompt) - return _envs - - def on_select_filetype(file_type): - return ( - gr.update(visible=file_type == "File"), - gr.update(visible=file_type == "Link"), - ) - - def on_select_page(choice): - if choice == "Others": - return gr.update(visible=True) - else: - return gr.update(visible=False) - - output_title = gr.Markdown("## Translated", visible=False) - output_file_mono = gr.File( - label="Download Translation (Mono)", visible=False - ) - output_file_dual = gr.File( - label="Download Translation (Dual)", visible=False - ) - recaptcha_response = gr.Textbox( - label="reCAPTCHA Response", elem_id="verify", visible=False - ) - recaptcha_box = gr.HTML('
') - translate_btn = gr.Button("Translate", variant="primary") - cancellation_btn = gr.Button("Cancel", variant="secondary") - tech_details_tog = gr.Markdown( - tech_details_string, - elem_classes=["secondary-text"], - ) - page_range.select(on_select_page, page_range, page_input) - service.select( - on_select_service, - service, - envs, - ) - file_type.select( - on_select_filetype, - file_type, - [file_input, link_input], - js=( - f""" - (a,b)=>{{ - try{{ - grecaptcha.render('recaptcha-box',{{ - 'sitekey':'{client_key}', - 'callback':'onVerify' - }}); - }}catch(error){{}} - return [a]; - }} - """ - if flag_demo - else "" - ), - ) - - with gr.Column(scale=2): - gr.Markdown("## Preview") - preview = PDF(label="Document Preview", visible=True, height=2000) - - # Event handlers - file_input.upload( - lambda x: x, - inputs=file_input, - outputs=preview, - js=( - f""" - (a,b)=>{{ - try{{ - grecaptcha.render('recaptcha-box',{{ - 'sitekey':'{client_key}', - 'callback':'onVerify' - }}); - }}catch(error){{}} - return [a]; - }} - """ - if flag_demo - else "" - ), - ) - - state = gr.State({"session_id": None}) - - translate_btn.click( - translate_file, - inputs=[ - file_type, - file_input, - link_input, - service, - lang_from, - lang_to, - page_range, - page_input, - prompt, - threads, - recaptcha_response, - state, - *envs, - ], - outputs=[ - output_file_mono, - preview, - output_file_dual, - output_file_mono, - output_file_dual, - output_title, - ], - ).then(lambda: None, js="()=>{grecaptcha.reset()}" if flag_demo else "") - - cancellation_btn.click( - stop_translate_file, - inputs=[state], - ) - - -def parse_user_passwd(file_path: str) -> tuple: - """ - Parse the user name and password from the file. - - Inputs: - - file_path: The file path to read. - Outputs: - - tuple_list: The list of tuples of user name and password. 
- - content: The content of the file - """ - tuple_list = [] - content = "" - if not file_path: - return tuple_list, content - if len(file_path) == 2: - try: - with open(file_path[1], "r", encoding="utf-8") as file: - content = file.read() - except FileNotFoundError: - print(f"Error: File '{file_path[1]}' not found.") - try: - with open(file_path[0], "r", encoding="utf-8") as file: - tuple_list = [ - tuple(line.strip().split(",")) for line in file if line.strip() - ] - except FileNotFoundError: - print(f"Error: File '{file_path[0]}' not found.") - return tuple_list, content - - -def setup_gui( - share: bool = False, auth_file: list = ["", ""], server_port=7860, electron=False -) -> None: - """ - Setup the GUI with the given parameters. - - Inputs: - - share: Whether to share the GUI. - - auth_file: The file path to read the user name and password. - - Outputs: - - None - """ - server_port = ConfigManager.get("gradio_port", server_port) - if electron: - try: - demo.launch( - server_name="0.0.0.0", - debug=True, - inbrowser=False, - share=share, - server_port=server_port, - ) - except Exception: - print( - "Error launching GUI using 0.0.0.0.\nThis may be caused by global mode of proxy software." - ) - try: - demo.launch( - server_name="127.0.0.1", - debug=True, - inbrowser=False, - share=share, - server_port=server_port, - ) - except Exception: - print( - "Error launching GUI using 127.0.0.1.\nThis may be caused by global mode of proxy software." - ) - demo.launch( - debug=True, - inbrowser=False, - share=True, - server_port=server_port, - ) - else: - user_list, html = parse_user_passwd(auth_file) - if flag_demo: - demo.launch(server_name="0.0.0.0", max_file_size="5mb", inbrowser=True) - else: - if len(user_list) == 0: - try: - demo.launch( - server_name="0.0.0.0", - debug=True, - inbrowser=True, - share=share, - server_port=server_port, - ) - except Exception: - print( - "Error launching GUI using 0.0.0.0.\nThis may be caused by global mode of proxy software." 
- ) - try: - demo.launch( - server_name="127.0.0.1", - debug=True, - inbrowser=True, - share=share, - server_port=server_port, - ) - except Exception: - print( - "Error launching GUI using 127.0.0.1.\nThis may be caused by global mode of proxy software." - ) - demo.launch( - debug=True, - inbrowser=True, - share=True, - server_port=server_port, - ) - else: - try: - demo.launch( - server_name="0.0.0.0", - debug=True, - inbrowser=True, - share=share, - auth=user_list, - auth_message=html, - server_port=server_port, - ) - except Exception: - print( - "Error launching GUI using 0.0.0.0.\nThis may be caused by global mode of proxy software." - ) - try: - demo.launch( - server_name="127.0.0.1", - debug=True, - inbrowser=True, - share=share, - auth=user_list, - auth_message=html, - server_port=server_port, - ) - except Exception: - print( - "Error launching GUI using 127.0.0.1.\nThis may be caused by global mode of proxy software." - ) - demo.launch( - debug=True, - inbrowser=True, - share=True, - auth=user_list, - auth_message=html, - server_port=server_port, - ) - - -# For auto-reloading while developing -if __name__ == "__main__": - setup_gui() +import asyncio +import cgi +import os +import shutil +import uuid +from asyncio import CancelledError +from pathlib import Path + +import gradio as gr +import requests +import tqdm +from gradio_pdf import PDF +from string import Template + +from pdf2zh import __version__ +from pdf2zh.high_level import translate +from pdf2zh.doclayout import ModelInstance +from pdf2zh.config import ConfigManager +from pdf2zh.translator import ( + AnythingLLMTranslator, + AzureOpenAITranslator, + AzureTranslator, + BaseTranslator, + BingTranslator, + DeepLTranslator, + DeepLXTranslator, + DifyTranslator, + ArgosTranslator, + GeminiTranslator, + GoogleTranslator, + ModelScopeTranslator, + OllamaTranslator, + OpenAITranslator, + SiliconTranslator, + TencentTranslator, + XinferenceTranslator, + ZhipuTranslator, + GorkTranslator, + GroqTranslator, + 
DeepseekTranslator, + OpenAIlikedTranslator, +) + +# The following variables associate strings with translators +service_map: dict[str, BaseTranslator] = { + "Google": GoogleTranslator, + "Bing": BingTranslator, + "DeepL": DeepLTranslator, + "DeepLX": DeepLXTranslator, + "Ollama": OllamaTranslator, + "Xinference": XinferenceTranslator, + "AzureOpenAI": AzureOpenAITranslator, + "OpenAI": OpenAITranslator, + "Zhipu": ZhipuTranslator, + "ModelScope": ModelScopeTranslator, + "Silicon": SiliconTranslator, + "Gemini": GeminiTranslator, + "Azure": AzureTranslator, + "Tencent": TencentTranslator, + "Dify": DifyTranslator, + "AnythingLLM": AnythingLLMTranslator, + "Argos Translate": ArgosTranslator, + "Gork": GorkTranslator, + "Groq": GroqTranslator, + "DeepSeek": DeepseekTranslator, + "OpenAI-liked": OpenAIlikedTranslator, +} + +# The following variables associate strings with specific languages +lang_map = { + "Simplified Chinese": "zh", + "Traditional Chinese": "zh-TW", + "English": "en", + "French": "fr", + "German": "de", + "Japanese": "ja", + "Korean": "ko", + "Russian": "ru", + "Spanish": "es", + "Italian": "it", +} + +# The following variable associate strings with page ranges +page_map = { + "All": None, + "First": [0], + "First 5 pages": list(range(0, 5)), + "Others": None, +} + +# Check if this is a public demo, which has resource limits +flag_demo = False + +# Limit resources +if ConfigManager.get("PDF2ZH_DEMO"): + flag_demo = True + service_map = { + "Google": GoogleTranslator, + } + page_map = { + "First": [0], + "First 20 pages": list(range(0, 20)), + } + client_key = ConfigManager.get("PDF2ZH_CLIENT_KEY") + server_key = ConfigManager.get("PDF2ZH_SERVER_KEY") + + +# Public demo control +def verify_recaptcha(response): + """ + This function verifies the reCAPTCHA response. 
+ """ + recaptcha_url = "https://www.google.com/recaptcha/api/siteverify" + print("reCAPTCHA", server_key, response) + data = {"secret": server_key, "response": response} + result = requests.post(recaptcha_url, data=data).json() + print("reCAPTCHA", result.get("success")) + return result.get("success") + + +def download_with_limit(url: str, save_path: str, size_limit: int) -> str: + """ + This function downloads a file from a URL and saves it to a specified path. + + Inputs: + - url: The URL to download the file from + - save_path: The path to save the file to + - size_limit: The maximum size of the file to download + + Returns: + - The path of the downloaded file + """ + chunk_size = 1024 + total_size = 0 + with requests.get(url, stream=True, timeout=10) as response: + response.raise_for_status() + content = response.headers.get("Content-Disposition") + try: # filename from header + _, params = cgi.parse_header(content) + filename = params["filename"] + except Exception: # filename from url + filename = os.path.basename(url) + with open(save_path / filename, "wb") as file: + for chunk in response.iter_content(chunk_size=chunk_size): + total_size += len(chunk) + if size_limit and total_size > size_limit: + raise gr.Error("Exceeds file size limit") + file.write(chunk) + return save_path / filename + + +def stop_translate_file(state: dict) -> None: + """ + This function stops the translation process. + + Inputs: + - state: The state of the translation process + + Returns:- None + """ + session_id = state["session_id"] + if session_id is None: + return + if session_id in cancellation_event_map: + cancellation_event_map[session_id].set() + + +def translate_file( + file_type, + file_input, + link_input, + service, + lang_from, + lang_to, + page_range, + page_input, + prompt, + threads, + recaptcha_response, + state, + progress=gr.Progress(), + *envs, +): + """ + This function translates a PDF file from one language to another. 
+ + Inputs: + - file_type: The type of file to translate + - file_input: The file to translate + - link_input: The link to the file to translate + - service: The translation service to use + - lang_from: The language to translate from + - lang_to: The language to translate to + - page_range: The range of pages to translate + - page_input: The input for the page range + - prompt: The custom prompt for the llm + - threads: The number of threads to use + - recaptcha_response: The reCAPTCHA response + - state: The state of the translation process + - progress: The progress bar + - envs: The environment variables + + Returns: + - The translated file + - The translated file + - The translated file + - The progress bar + - The progress bar + - The progress bar + """ + session_id = uuid.uuid4() + state["session_id"] = session_id + cancellation_event_map[session_id] = asyncio.Event() + # Translate PDF content using selected service. + if flag_demo and not verify_recaptcha(recaptcha_response): + raise gr.Error("reCAPTCHA fail") + + progress(0, desc="Starting translation...") + + output = Path("pdf2zh_files") + output.mkdir(parents=True, exist_ok=True) + + if file_type == "File": + if not file_input: + raise gr.Error("No input") + file_path = shutil.copy(file_input, output) + else: + if not link_input: + raise gr.Error("No input") + file_path = download_with_limit( + link_input, + output, + 5 * 1024 * 1024 if flag_demo else None, + ) + + filename = os.path.splitext(os.path.basename(file_path))[0] + file_raw = output / f"{filename}.pdf" + file_mono = output / f"{filename}-mono.pdf" + file_dual = output / f"{filename}-dual.pdf" + + translator = service_map[service] + if page_range != "Others": + selected_page = page_map[page_range] + else: + selected_page = [] + for p in page_input.split(","): + if "-" in p: + start, end = p.split("-") + selected_page.extend(range(int(start) - 1, int(end))) + else: + selected_page.append(int(p) - 1) + lang_from = lang_map[lang_from] + lang_to = 
lang_map[lang_to] + + _envs = {} + for i, env in enumerate(translator.envs.items()): + _envs[env[0]] = envs[i] + + print(f"Files before translation: {os.listdir(output)}") + + def progress_bar(t: tqdm.tqdm): + progress(t.n / t.total, desc="Translating...") + + try: + threads = int(threads) + except ValueError: + threads = 1 + + param = { + "files": [str(file_raw)], + "pages": selected_page, + "lang_in": lang_from, + "lang_out": lang_to, + "service": f"{translator.name}", + "output": output, + "thread": int(threads), + "callback": progress_bar, + "cancellation_event": cancellation_event_map[session_id], + "envs": _envs, + "prompt": Template(prompt), + "model": ModelInstance.value, + } + try: + translate(**param) + except CancelledError: + del cancellation_event_map[session_id] + raise gr.Error("Translation cancelled") + print(f"Files after translation: {os.listdir(output)}") + + if not file_mono.exists() or not file_dual.exists(): + raise gr.Error("No output") + + progress(1.0, desc="Translation complete!") + + return ( + str(file_mono), + str(file_mono), + str(file_dual), + gr.update(visible=True), + gr.update(visible=True), + gr.update(visible=True), + ) + + +# Global setup +custom_blue = gr.themes.Color( + c50="#E8F3FF", + c100="#BEDAFF", + c200="#94BFFF", + c300="#6AA1FF", + c400="#4080FF", + c500="#165DFF", # Primary color + c600="#0E42D2", + c700="#0A2BA6", + c800="#061D79", + c900="#03114D", + c950="#020B33", +) + +custom_css = """ + .secondary-text {color: #999 !important;} + footer {visibility: hidden} + .env-warning {color: #dd5500 !important;} + .env-success {color: #559900 !important;} + + /* Add dashed border to input-file class */ + .input-file { + border: 1.2px dashed #165DFF !important; + border-radius: 6px !important; + } + + .progress-bar-wrap { + border-radius: 8px !important; + } + + .progress-bar { + border-radius: 8px !important; + } + + .pdf-canvas canvas { + width: 100%; + } + """ + +demo_recaptcha = """ + + + """ + +tech_details_string = 
f""" + Technical details + - GitHub: Byaidu/PDFMathTranslate
+ - GUI by: Rongxin
+ - Version: {__version__} + """ +cancellation_event_map = {} + + +# The following code creates the GUI +with gr.Blocks( + title="PDFMathTranslate - PDF Translation with preserved formats", + theme=gr.themes.Default( + primary_hue=custom_blue, spacing_size="md", radius_size="lg" + ), + css=custom_css, + head=demo_recaptcha if flag_demo else "", +) as demo: + gr.Markdown( + "# [PDFMathTranslate @ GitHub](https://github.com/Byaidu/PDFMathTranslate)" + ) + + with gr.Row(): + with gr.Column(scale=1): + gr.Markdown("## File | < 5 MB" if flag_demo else "## File") + file_type = gr.Radio( + choices=["File", "Link"], + label="Type", + value="File", + ) + file_input = gr.File( + label="File", + file_count="single", + file_types=[".pdf"], + type="filepath", + elem_classes=["input-file"], + ) + link_input = gr.Textbox( + label="Link", + visible=False, + interactive=True, + ) + gr.Markdown("## Option") + service = gr.Dropdown( + label="Service", + choices=service_map.keys(), + value="Google", + ) + envs = [] + for i in range(3): + envs.append( + gr.Textbox( + visible=False, + interactive=True, + ) + ) + with gr.Row(): + lang_from = gr.Dropdown( + label="Translate from", + choices=lang_map.keys(), + value=ConfigManager.get("PDF2ZH_LANG_FROM", "English"), + ) + lang_to = gr.Dropdown( + label="Translate to", + choices=lang_map.keys(), + value=ConfigManager.get("PDF2ZH_LANG_TO", "Simplified Chinese"), + ) + page_range = gr.Radio( + choices=page_map.keys(), + label="Pages", + value=list(page_map.keys())[0], + ) + + page_input = gr.Textbox( + label="Page range", + visible=False, + interactive=True, + ) + + with gr.Accordion("Open for More Experimental Options!", open=False): + gr.Markdown("#### Experimental") + threads = gr.Textbox( + label="number of threads", interactive=True, value="1" + ) + prompt = gr.Textbox( + label="Custom Prompt for llm", interactive=True, visible=False + ) + envs.append(prompt) + + def on_select_service(service, evt: gr.EventData): + translator = 
service_map[service] + _envs = [] + for i in range(4): + _envs.append(gr.update(visible=False, value="")) + for i, env in enumerate(translator.envs.items()): + _envs[i] = gr.update( + visible=True, + label=env[0], + value=ConfigManager.get_env_by_translatername( + translator, env[0], env[1] + ), + ) + _envs[-1] = gr.update(visible=translator.CustomPrompt) + return _envs + + def on_select_filetype(file_type): + return ( + gr.update(visible=file_type == "File"), + gr.update(visible=file_type == "Link"), + ) + + def on_select_page(choice): + if choice == "Others": + return gr.update(visible=True) + else: + return gr.update(visible=False) + + output_title = gr.Markdown("## Translated", visible=False) + output_file_mono = gr.File( + label="Download Translation (Mono)", visible=False + ) + output_file_dual = gr.File( + label="Download Translation (Dual)", visible=False + ) + recaptcha_response = gr.Textbox( + label="reCAPTCHA Response", elem_id="verify", visible=False + ) + recaptcha_box = gr.HTML('
') + translate_btn = gr.Button("Translate", variant="primary") + cancellation_btn = gr.Button("Cancel", variant="secondary") + tech_details_tog = gr.Markdown( + tech_details_string, + elem_classes=["secondary-text"], + ) + page_range.select(on_select_page, page_range, page_input) + service.select( + on_select_service, + service, + envs, + ) + file_type.select( + on_select_filetype, + file_type, + [file_input, link_input], + js=( + f""" + (a,b)=>{{ + try{{ + grecaptcha.render('recaptcha-box',{{ + 'sitekey':'{client_key}', + 'callback':'onVerify' + }}); + }}catch(error){{}} + return [a]; + }} + """ + if flag_demo + else "" + ), + ) + + with gr.Column(scale=2): + gr.Markdown("## Preview") + preview = PDF(label="Document Preview", visible=True, height=2000) + + # Event handlers + file_input.upload( + lambda x: x, + inputs=file_input, + outputs=preview, + js=( + f""" + (a,b)=>{{ + try{{ + grecaptcha.render('recaptcha-box',{{ + 'sitekey':'{client_key}', + 'callback':'onVerify' + }}); + }}catch(error){{}} + return [a]; + }} + """ + if flag_demo + else "" + ), + ) + + state = gr.State({"session_id": None}) + + translate_btn.click( + translate_file, + inputs=[ + file_type, + file_input, + link_input, + service, + lang_from, + lang_to, + page_range, + page_input, + prompt, + threads, + recaptcha_response, + state, + *envs, + ], + outputs=[ + output_file_mono, + preview, + output_file_dual, + output_file_mono, + output_file_dual, + output_title, + ], + ).then(lambda: None, js="()=>{grecaptcha.reset()}" if flag_demo else "") + + cancellation_btn.click( + stop_translate_file, + inputs=[state], + ) + + +def parse_user_passwd(file_path: str) -> tuple: + """ + Parse the user name and password from the file. + + Inputs: + - file_path: The file path to read. + Outputs: + - tuple_list: The list of tuples of user name and password. 
+ - content: The content of the file + """ + tuple_list = [] + content = "" + if not file_path: + return tuple_list, content + if len(file_path) == 2: + try: + with open(file_path[1], "r", encoding="utf-8") as file: + content = file.read() + except FileNotFoundError: + print(f"Error: File '{file_path[1]}' not found.") + try: + with open(file_path[0], "r", encoding="utf-8") as file: + tuple_list = [ + tuple(line.strip().split(",")) for line in file if line.strip() + ] + except FileNotFoundError: + print(f"Error: File '{file_path[0]}' not found.") + return tuple_list, content + + +def setup_gui( + share: bool = False, auth_file: list = ["", ""], server_port=7860 +) -> None: + """ + Setup the GUI with the given parameters. + + Inputs: + - share: Whether to share the GUI. + - auth_file: The file path to read the user name and password. + + Outputs: + - None + """ + user_list, html = parse_user_passwd(auth_file) + if flag_demo: + demo.launch(server_name="0.0.0.0", max_file_size="5mb", inbrowser=True) + else: + if len(user_list) == 0: + try: + demo.launch( + server_name="0.0.0.0", + debug=True, + inbrowser=True, + share=share, + server_port=server_port, + ) + except Exception: + print( + "Error launching GUI using 0.0.0.0.\nThis may be caused by global mode of proxy software." + ) + try: + demo.launch( + server_name="127.0.0.1", + debug=True, + inbrowser=True, + share=share, + server_port=server_port, + ) + except Exception: + print( + "Error launching GUI using 127.0.0.1.\nThis may be caused by global mode of proxy software." + ) + demo.launch( + debug=True, inbrowser=True, share=True, server_port=server_port + ) + else: + try: + demo.launch( + server_name="0.0.0.0", + debug=True, + inbrowser=True, + share=share, + auth=user_list, + auth_message=html, + server_port=server_port, + ) + except Exception: + print( + "Error launching GUI using 0.0.0.0.\nThis may be caused by global mode of proxy software." 
+ ) + try: + demo.launch( + server_name="127.0.0.1", + debug=True, + inbrowser=True, + share=share, + auth=user_list, + auth_message=html, + server_port=server_port, + ) + except Exception: + print( + "Error launching GUI using 127.0.0.1.\nThis may be caused by global mode of proxy software." + ) + demo.launch( + debug=True, + inbrowser=True, + share=True, + auth=user_list, + auth_message=html, + server_port=server_port, + ) + + +# For auto-reloading while developing +if __name__ == "__main__": + setup_gui() diff --git a/pdf2zh/high_level.py b/pdf2zh/high_level.py index 31125d09..9c5cee8e 100644 --- a/pdf2zh/high_level.py +++ b/pdf2zh/high_level.py @@ -1,397 +1,397 @@ -"""Functions that can be used for the most common use-cases for pdf2zh.six""" - -import asyncio -import io -import os -import sys -import tempfile -import urllib.request -from asyncio import CancelledError -from pathlib import Path -from typing import Any, BinaryIO, List, Optional, Dict - -import numpy as np -import requests -import tqdm -from pdfminer.pdfdocument import PDFDocument -from pdfminer.pdfexceptions import PDFValueError -from pdfminer.pdfinterp import PDFResourceManager -from pdfminer.pdfpage import PDFPage -from pdfminer.pdfparser import PDFParser -from pymupdf import Document, Font - -from pdf2zh.converter import TranslateConverter -from pdf2zh.doclayout import OnnxModel -from pdf2zh.pdfinterp import PDFPageInterpreterEx - -from pdf2zh.config import ConfigManager - -NOTO_NAME = "noto" - -noto_list = [ - "am", # Amharic - "ar", # Arabic - "bn", # Bengali - "bg", # Bulgarian - "chr", # Cherokee - "el", # Greek - "gu", # Gujarati - "iw", # Hebrew - "hi", # Hindi - "kn", # Kannada - "ml", # Malayalam - "mr", # Marathi - "ru", # Russian - "sr", # Serbian - "ta", # Tamil - "te", # Telugu - "th", # Thai - "ur", # Urdu - "uk", # Ukrainian -] - - -def check_files(files: List[str]) -> List[str]: - files = [ - f for f in files if not f.startswith("http://") - ] # exclude online files, http - files 
= [ - f for f in files if not f.startswith("https://") - ] # exclude online files, https - missing_files = [file for file in files if not os.path.exists(file)] - return missing_files - - -def translate_patch( - inf: BinaryIO, - pages: Optional[list[int]] = None, - vfont: str = "", - vchar: str = "", - thread: int = 0, - doc_zh: Document = None, - lang_in: str = "", - lang_out: str = "", - service: str = "", - noto_name: str = "", - noto: Font = None, - callback: object = None, - cancellation_event: asyncio.Event = None, - model: OnnxModel = None, - envs: Dict = None, - prompt: List = None, - **kwarg: Any, -) -> None: - rsrcmgr = PDFResourceManager() - layout = {} - device = TranslateConverter( - rsrcmgr, - vfont, - vchar, - thread, - layout, - lang_in, - lang_out, - service, - noto_name, - noto, - envs, - prompt, - ) - - assert device is not None - obj_patch = {} - interpreter = PDFPageInterpreterEx(rsrcmgr, device, obj_patch) - if pages: - total_pages = len(pages) - else: - total_pages = doc_zh.page_count - - parser = PDFParser(inf) - doc = PDFDocument(parser) - with tqdm.tqdm(total=total_pages) as progress: - for pageno, page in enumerate(PDFPage.create_pages(doc)): - if cancellation_event and cancellation_event.is_set(): - raise CancelledError("task cancelled") - if pages and (pageno not in pages): - continue - progress.update() - if callback: - callback(progress) - page.pageno = pageno - pix = doc_zh[page.pageno].get_pixmap() - image = np.fromstring(pix.samples, np.uint8).reshape( - pix.height, pix.width, 3 - )[:, :, ::-1] - page_layout = model.predict(image, imgsz=int(pix.height / 32) * 32)[0] - # kdtree 是不可能 kdtree 的,不如直接渲染成图片,用空间换时间 - box = np.ones((pix.height, pix.width)) - h, w = box.shape - vcls = ["abandon", "figure", "table", "isolate_formula", "formula_caption"] - for i, d in enumerate(page_layout.boxes): - if page_layout.names[int(d.cls)] not in vcls: - x0, y0, x1, y1 = d.xyxy.squeeze() - x0, y0, x1, y1 = ( - np.clip(int(x0 - 1), 0, w - 1), - 
np.clip(int(h - y1 - 1), 0, h - 1), - np.clip(int(x1 + 1), 0, w - 1), - np.clip(int(h - y0 + 1), 0, h - 1), - ) - box[y0:y1, x0:x1] = i + 2 - for i, d in enumerate(page_layout.boxes): - if page_layout.names[int(d.cls)] in vcls: - x0, y0, x1, y1 = d.xyxy.squeeze() - x0, y0, x1, y1 = ( - np.clip(int(x0 - 1), 0, w - 1), - np.clip(int(h - y1 - 1), 0, h - 1), - np.clip(int(x1 + 1), 0, w - 1), - np.clip(int(h - y0 + 1), 0, h - 1), - ) - box[y0:y1, x0:x1] = 0 - layout[page.pageno] = box - # 新建一个 xref 存放新指令流 - page.page_xref = doc_zh.get_new_xref() # hack 插入页面的新 xref - doc_zh.update_object(page.page_xref, "<<>>") - doc_zh.update_stream(page.page_xref, b"") - doc_zh[page.pageno].set_contents(page.page_xref) - interpreter.process_page(page) - - device.close() - return obj_patch - - -def translate_stream( - stream: bytes, - pages: Optional[list[int]] = None, - lang_in: str = "", - lang_out: str = "", - service: str = "", - thread: int = 0, - vfont: str = "", - vchar: str = "", - callback: object = None, - cancellation_event: asyncio.Event = None, - model: OnnxModel = None, - envs: Dict = None, - prompt: List = None, - **kwarg: Any, -): - font_list = [("tiro", None)] - - font_path = download_remote_fonts(lang_out.lower()) - noto_name = NOTO_NAME - noto = Font(noto_name, font_path) - font_list.append((noto_name, font_path)) - - doc_en = Document(stream=stream) - stream = io.BytesIO() - doc_en.save(stream) - doc_zh = Document(stream=stream) - page_count = doc_zh.page_count - # font_list = [("GoNotoKurrent-Regular.ttf", font_path), ("tiro", None)] - font_id = {} - for page in doc_zh: - for font in font_list: - font_id[font[0]] = page.insert_font(font[0], font[1]) - xreflen = doc_zh.xref_length() - for xref in range(1, xreflen): - for label in ["Resources/", ""]: # 可能是基于 xobj 的 res - try: # xref 读写可能出错 - font_res = doc_zh.xref_get_key(xref, f"{label}Font") - if font_res[0] == "dict": - for font in font_list: - font_exist = doc_zh.xref_get_key(xref, f"{label}Font/{font[0]}") - if 
font_exist[0] == "null": - doc_zh.xref_set_key( - xref, - f"{label}Font/{font[0]}", - f"{font_id[font[0]]} 0 R", - ) - except Exception: - pass - - fp = io.BytesIO() - - doc_zh.save(fp) - obj_patch: dict = translate_patch(fp, **locals()) - - for obj_id, ops_new in obj_patch.items(): - # ops_old=doc_en.xref_stream(obj_id) - # print(obj_id) - # print(ops_old) - # print(ops_new.encode()) - doc_zh.update_stream(obj_id, ops_new.encode()) - - doc_en.insert_file(doc_zh) - for id in range(page_count): - doc_en.move_page(page_count + id, id * 2 + 1) - - doc_zh.subset_fonts(fallback=True) - doc_en.subset_fonts(fallback=True) - return ( - doc_zh.write(deflate=True, garbage=3, use_objstms=1), - doc_en.write(deflate=True, garbage=3, use_objstms=1), - ) - - -def convert_to_pdfa(input_path, output_path): - """ - Convert PDF to PDF/A format - - Args: - input_path: Path to source PDF file - output_path: Path to save PDF/A file - """ - from pikepdf import Dictionary, Name, Pdf - - # Open the PDF file - pdf = Pdf.open(input_path) - - # Add PDF/A conformance metadata - metadata = { - "pdfa_part": "2", - "pdfa_conformance": "B", - "title": pdf.docinfo.get("/Title", ""), - "author": pdf.docinfo.get("/Author", ""), - "creator": "PDF Math Translate", - } - - with pdf.open_metadata() as meta: - meta.load_from_docinfo(pdf.docinfo) - meta["pdfaid:part"] = metadata["pdfa_part"] - meta["pdfaid:conformance"] = metadata["pdfa_conformance"] - - # Create OutputIntent dictionary - output_intent = Dictionary( - { - "/Type": Name("/OutputIntent"), - "/S": Name("/GTS_PDFA1"), - "/OutputConditionIdentifier": "sRGB IEC61966-2.1", - "/RegistryName": "http://www.color.org", - "/Info": "sRGB IEC61966-2.1", - } - ) - - # Add output intent to PDF root - if "/OutputIntents" not in pdf.Root: - pdf.Root.OutputIntents = [output_intent] - else: - pdf.Root.OutputIntents.append(output_intent) - - # Save as PDF/A - pdf.save(output_path, linearize=True) - pdf.close() - - -def translate( - files: list[str], - output: 
str = "", - pages: Optional[list[int]] = None, - lang_in: str = "", - lang_out: str = "", - service: str = "", - thread: int = 0, - vfont: str = "", - vchar: str = "", - callback: object = None, - compatible: bool = False, - cancellation_event: asyncio.Event = None, - model: OnnxModel = None, - envs: Dict = None, - prompt: List = None, - **kwarg: Any, -): - if not files: - raise PDFValueError("No files to process.") - - missing_files = check_files(files) - - if missing_files: - print("The following files do not exist:", file=sys.stderr) - for file in missing_files: - print(f" {file}", file=sys.stderr) - raise PDFValueError("Some files do not exist.") - - result_files = [] - - for file in files: - if type(file) is str and ( - file.startswith("http://") or file.startswith("https://") - ): - print("Online files detected, downloading...") - try: - r = requests.get(file, allow_redirects=True) - if r.status_code == 200: - with tempfile.NamedTemporaryFile( - suffix=".pdf", delete=False - ) as tmp_file: - print(f"Writing the file: {file}...") - tmp_file.write(r.content) - file = tmp_file.name - else: - r.raise_for_status() - except Exception as e: - raise PDFValueError( - f"Errors occur in downloading the PDF file. 
Please check the link(s).\nError:\n{e}" - ) - filename = os.path.splitext(os.path.basename(file))[0] - - # If the commandline has specified converting to PDF/A format - # --compatible / -cp - if compatible: - with tempfile.NamedTemporaryFile( - suffix="-pdfa.pdf", delete=False - ) as tmp_pdfa: - print(f"Converting {file} to PDF/A format...") - convert_to_pdfa(file, tmp_pdfa.name) - doc_raw = open(tmp_pdfa.name, "rb") - os.unlink(tmp_pdfa.name) - else: - doc_raw = open(file, "rb") - s_raw = doc_raw.read() - doc_raw.close() - - if file.startswith(tempfile.gettempdir()): - os.unlink(file) - s_mono, s_dual = translate_stream( - s_raw, - **locals(), - ) - file_mono = Path(output) / f"{filename}-mono.pdf" - file_dual = Path(output) / f"{filename}-dual.pdf" - doc_mono = open(file_mono, "wb") - doc_dual = open(file_dual, "wb") - doc_mono.write(s_mono) - doc_dual.write(s_dual) - doc_mono.close() - doc_dual.close() - result_files.append((str(file_mono), str(file_dual))) - - return result_files - - -def download_remote_fonts(lang: str): - URL_PREFIX = "https://github.com/timelic/source-han-serif/releases/download/main/" - LANG_NAME_MAP = { - **{la: "GoNotoKurrent-Regular.ttf" for la in noto_list}, - **{ - la: f"SourceHanSerif{region}-Regular.ttf" - for region, langs in { - "CN": ["zh-cn", "zh-hans", "zh"], - "TW": ["zh-tw", "zh-hant"], - "JP": ["ja"], - "KR": ["ko"], - }.items() - for la in langs - }, - } - font_name = LANG_NAME_MAP.get(lang, "GoNotoKurrent-Regular.ttf") - - # docker - font_path = ConfigManager.get("NOTO_FONT_PATH", Path("/app", font_name).as_posix()) - if not Path(font_path).exists(): - font_path = Path(tempfile.gettempdir(), font_name).as_posix() - if not Path(font_path).exists(): - print(f"Downloading {font_name}...") - urllib.request.urlretrieve(f"{URL_PREFIX}{font_name}", font_path) - - return font_path +"""Functions that can be used for the most common use-cases for pdf2zh.six""" + +import asyncio +import io +import os +import sys +import tempfile 
+import urllib.request +from asyncio import CancelledError +from pathlib import Path +from typing import Any, BinaryIO, List, Optional, Dict + +import numpy as np +import requests +import tqdm +from pdfminer.pdfdocument import PDFDocument +from pdfminer.pdfexceptions import PDFValueError +from pdfminer.pdfinterp import PDFResourceManager +from pdfminer.pdfpage import PDFPage +from pdfminer.pdfparser import PDFParser +from pymupdf import Document, Font + +from pdf2zh.converter import TranslateConverter +from pdf2zh.doclayout import OnnxModel +from pdf2zh.pdfinterp import PDFPageInterpreterEx + +from pdf2zh.config import ConfigManager + +NOTO_NAME = "noto" + +noto_list = [ + "am", # Amharic + "ar", # Arabic + "bn", # Bengali + "bg", # Bulgarian + "chr", # Cherokee + "el", # Greek + "gu", # Gujarati + "iw", # Hebrew + "hi", # Hindi + "kn", # Kannada + "ml", # Malayalam + "mr", # Marathi + "ru", # Russian + "sr", # Serbian + "ta", # Tamil + "te", # Telugu + "th", # Thai + "ur", # Urdu + "uk", # Ukrainian +] + + +def check_files(files: List[str]) -> List[str]: + files = [ + f for f in files if not f.startswith("http://") + ] # exclude online files, http + files = [ + f for f in files if not f.startswith("https://") + ] # exclude online files, https + missing_files = [file for file in files if not os.path.exists(file)] + return missing_files + + +def translate_patch( + inf: BinaryIO, + pages: Optional[list[int]] = None, + vfont: str = "", + vchar: str = "", + thread: int = 0, + doc_zh: Document = None, + lang_in: str = "", + lang_out: str = "", + service: str = "", + noto_name: str = "", + noto: Font = None, + callback: object = None, + cancellation_event: asyncio.Event = None, + model: OnnxModel = None, + envs: Dict = None, + prompt: List = None, + **kwarg: Any, +) -> None: + rsrcmgr = PDFResourceManager() + layout = {} + device = TranslateConverter( + rsrcmgr, + vfont, + vchar, + thread, + layout, + lang_in, + lang_out, + service, + noto_name, + noto, + envs, + prompt, 
+ ) + + assert device is not None + obj_patch = {} + interpreter = PDFPageInterpreterEx(rsrcmgr, device, obj_patch) + if pages: + total_pages = len(pages) + else: + total_pages = doc_zh.page_count + + parser = PDFParser(inf) + doc = PDFDocument(parser) + with tqdm.tqdm(total=total_pages) as progress: + for pageno, page in enumerate(PDFPage.create_pages(doc)): + if cancellation_event and cancellation_event.is_set(): + raise CancelledError("task cancelled") + if pages and (pageno not in pages): + continue + progress.update() + if callback: + callback(progress) + page.pageno = pageno + pix = doc_zh[page.pageno].get_pixmap() + image = np.fromstring(pix.samples, np.uint8).reshape( + pix.height, pix.width, 3 + )[:, :, ::-1] + page_layout = model.predict(image, imgsz=int(pix.height / 32) * 32)[0] + # kdtree 是不可能 kdtree 的,不如直接渲染成图片,用空间换时间 + box = np.ones((pix.height, pix.width)) + h, w = box.shape + vcls = ["abandon", "figure", "table", "isolate_formula", "formula_caption"] + for i, d in enumerate(page_layout.boxes): + if page_layout.names[int(d.cls)] not in vcls: + x0, y0, x1, y1 = d.xyxy.squeeze() + x0, y0, x1, y1 = ( + np.clip(int(x0 - 1), 0, w - 1), + np.clip(int(h - y1 - 1), 0, h - 1), + np.clip(int(x1 + 1), 0, w - 1), + np.clip(int(h - y0 + 1), 0, h - 1), + ) + box[y0:y1, x0:x1] = i + 2 + for i, d in enumerate(page_layout.boxes): + if page_layout.names[int(d.cls)] in vcls: + x0, y0, x1, y1 = d.xyxy.squeeze() + x0, y0, x1, y1 = ( + np.clip(int(x0 - 1), 0, w - 1), + np.clip(int(h - y1 - 1), 0, h - 1), + np.clip(int(x1 + 1), 0, w - 1), + np.clip(int(h - y0 + 1), 0, h - 1), + ) + box[y0:y1, x0:x1] = 0 + layout[page.pageno] = box + # 新建一个 xref 存放新指令流 + page.page_xref = doc_zh.get_new_xref() # hack 插入页面的新 xref + doc_zh.update_object(page.page_xref, "<<>>") + doc_zh.update_stream(page.page_xref, b"") + doc_zh[page.pageno].set_contents(page.page_xref) + interpreter.process_page(page) + + device.close() + return obj_patch + + +def translate_stream( + stream: bytes, + pages: 
Optional[list[int]] = None, + lang_in: str = "", + lang_out: str = "", + service: str = "", + thread: int = 0, + vfont: str = "", + vchar: str = "", + callback: object = None, + cancellation_event: asyncio.Event = None, + model: OnnxModel = None, + envs: Dict = None, + prompt: List = None, + **kwarg: Any, +): + font_list = [("tiro", None)] + + font_path = download_remote_fonts(lang_out.lower()) + noto_name = NOTO_NAME + noto = Font(noto_name, font_path) + font_list.append((noto_name, font_path)) + + doc_en = Document(stream=stream) + stream = io.BytesIO() + doc_en.save(stream) + doc_zh = Document(stream=stream) + page_count = doc_zh.page_count + # font_list = [("GoNotoKurrent-Regular.ttf", font_path), ("tiro", None)] + font_id = {} + for page in doc_zh: + for font in font_list: + font_id[font[0]] = page.insert_font(font[0], font[1]) + xreflen = doc_zh.xref_length() + for xref in range(1, xreflen): + for label in ["Resources/", ""]: # 可能是基于 xobj 的 res + try: # xref 读写可能出错 + font_res = doc_zh.xref_get_key(xref, f"{label}Font") + if font_res[0] == "dict": + for font in font_list: + font_exist = doc_zh.xref_get_key(xref, f"{label}Font/{font[0]}") + if font_exist[0] == "null": + doc_zh.xref_set_key( + xref, + f"{label}Font/{font[0]}", + f"{font_id[font[0]]} 0 R", + ) + except Exception: + pass + + fp = io.BytesIO() + + doc_zh.save(fp) + obj_patch: dict = translate_patch(fp, **locals()) + + for obj_id, ops_new in obj_patch.items(): + # ops_old=doc_en.xref_stream(obj_id) + # print(obj_id) + # print(ops_old) + # print(ops_new.encode()) + doc_zh.update_stream(obj_id, ops_new.encode()) + + doc_en.insert_file(doc_zh) + for id in range(page_count): + doc_en.move_page(page_count + id, id * 2 + 1) + + doc_zh.subset_fonts(fallback=True) + doc_en.subset_fonts(fallback=True) + return ( + doc_zh.write(deflate=True, garbage=3, use_objstms=1), + doc_en.write(deflate=True, garbage=3, use_objstms=1), + ) + + +def convert_to_pdfa(input_path, output_path): + """ + Convert PDF to PDF/A 
format + + Args: + input_path: Path to source PDF file + output_path: Path to save PDF/A file + """ + from pikepdf import Dictionary, Name, Pdf + + # Open the PDF file + pdf = Pdf.open(input_path) + + # Add PDF/A conformance metadata + metadata = { + "pdfa_part": "2", + "pdfa_conformance": "B", + "title": pdf.docinfo.get("/Title", ""), + "author": pdf.docinfo.get("/Author", ""), + "creator": "PDF Math Translate", + } + + with pdf.open_metadata() as meta: + meta.load_from_docinfo(pdf.docinfo) + meta["pdfaid:part"] = metadata["pdfa_part"] + meta["pdfaid:conformance"] = metadata["pdfa_conformance"] + + # Create OutputIntent dictionary + output_intent = Dictionary( + { + "/Type": Name("/OutputIntent"), + "/S": Name("/GTS_PDFA1"), + "/OutputConditionIdentifier": "sRGB IEC61966-2.1", + "/RegistryName": "http://www.color.org", + "/Info": "sRGB IEC61966-2.1", + } + ) + + # Add output intent to PDF root + if "/OutputIntents" not in pdf.Root: + pdf.Root.OutputIntents = [output_intent] + else: + pdf.Root.OutputIntents.append(output_intent) + + # Save as PDF/A + pdf.save(output_path, linearize=True) + pdf.close() + + +def translate( + files: list[str], + output: str = "", + pages: Optional[list[int]] = None, + lang_in: str = "", + lang_out: str = "", + service: str = "", + thread: int = 0, + vfont: str = "", + vchar: str = "", + callback: object = None, + compatible: bool = False, + cancellation_event: asyncio.Event = None, + model: OnnxModel = None, + envs: Dict = None, + prompt: List = None, + **kwarg: Any, +): + if not files: + raise PDFValueError("No files to process.") + + missing_files = check_files(files) + + if missing_files: + print("The following files do not exist:", file=sys.stderr) + for file in missing_files: + print(f" {file}", file=sys.stderr) + raise PDFValueError("Some files do not exist.") + + result_files = [] + + for file in files: + if type(file) is str and ( + file.startswith("http://") or file.startswith("https://") + ): + print("Online files detected, 
downloading...") + try: + r = requests.get(file, allow_redirects=True) + if r.status_code == 200: + with tempfile.NamedTemporaryFile( + suffix=".pdf", delete=False + ) as tmp_file: + print(f"Writing the file: {file}...") + tmp_file.write(r.content) + file = tmp_file.name + else: + r.raise_for_status() + except Exception as e: + raise PDFValueError( + f"Errors occur in downloading the PDF file. Please check the link(s).\nError:\n{e}" + ) + filename = os.path.splitext(os.path.basename(file))[0] + + # If the commandline has specified converting to PDF/A format + # --compatible / -cp + if compatible: + with tempfile.NamedTemporaryFile( + suffix="-pdfa.pdf", delete=False + ) as tmp_pdfa: + print(f"Converting {file} to PDF/A format...") + convert_to_pdfa(file, tmp_pdfa.name) + doc_raw = open(tmp_pdfa.name, "rb") + os.unlink(tmp_pdfa.name) + else: + doc_raw = open(file, "rb") + s_raw = doc_raw.read() + doc_raw.close() + + if file.startswith(tempfile.gettempdir()): + os.unlink(file) + s_mono, s_dual = translate_stream( + s_raw, + **locals(), + ) + file_mono = Path(output) / f"{filename}-mono.pdf" + file_dual = Path(output) / f"{filename}-dual.pdf" + doc_mono = open(file_mono, "wb") + doc_dual = open(file_dual, "wb") + doc_mono.write(s_mono) + doc_dual.write(s_dual) + doc_mono.close() + doc_dual.close() + result_files.append((str(file_mono), str(file_dual))) + + return result_files + + +def download_remote_fonts(lang: str): + URL_PREFIX = "https://github.com/timelic/source-han-serif/releases/download/main/" + LANG_NAME_MAP = { + **{la: "GoNotoKurrent-Regular.ttf" for la in noto_list}, + **{ + la: f"SourceHanSerif{region}-Regular.ttf" + for region, langs in { + "CN": ["zh-cn", "zh-hans", "zh"], + "TW": ["zh-tw", "zh-hant"], + "JP": ["ja"], + "KR": ["ko"], + }.items() + for la in langs + }, + } + font_name = LANG_NAME_MAP.get(lang, "GoNotoKurrent-Regular.ttf") + + # docker + font_path = ConfigManager.get("NOTO_FONT_PATH", Path("/app", font_name).as_posix()) + if not 
Path(font_path).exists(): + font_path = Path(tempfile.gettempdir(), font_name).as_posix() + if not Path(font_path).exists(): + print(f"Downloading {font_name}...") + urllib.request.urlretrieve(f"{URL_PREFIX}{font_name}", font_path) + + return font_path diff --git a/pdf2zh/pdf2zh.py b/pdf2zh/pdf2zh.py index 36cf8704..49c76dd4 100644 --- a/pdf2zh/pdf2zh.py +++ b/pdf2zh/pdf2zh.py @@ -1,315 +1,269 @@ -#!/usr/bin/env python3 -"""A command line tool for extracting text and images from PDF and -output it to plain text, html, xml or tags. -""" - -from __future__ import annotations - -import argparse -import logging -import sys -from string import Template -from typing import List, Optional - -from pdf2zh import __version__, log -from pdf2zh.high_level import translate -from pdf2zh.doclayout import OnnxModel, ModelInstance -import os - -from pdf2zh.config import ConfigManager - - -def create_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser(description=__doc__, add_help=True) - parser.add_argument( - "files", - type=str, - default=None, - nargs="*", - help="One or more paths to PDF files.", - ) - parser.add_argument( - "--version", - "-v", - action="version", - version=f"pdf2zh v{__version__}", - ) - parser.add_argument( - "--debug", - "-d", - default=False, - action="store_true", - help="Use debug logging level.", - ) - parse_params = parser.add_argument_group( - "Parser", - description="Used during PDF parsing", - ) - parse_params.add_argument( - "--pages", - "-p", - type=str, - help="The list of page numbers to parse.", - ) - parse_params.add_argument( - "--vfont", - "-f", - type=str, - default="", - help="The regex to math font name of formula.", - ) - parse_params.add_argument( - "--vchar", - "-c", - type=str, - default="", - help="The regex to math character of formula.", - ) - parse_params.add_argument( - "--lang-in", - "-li", - type=str, - default="en", - help="The code of source language.", - ) - parse_params.add_argument( - "--lang-out", - 
"-lo", - type=str, - default="zh", - help="The code of target language.", - ) - parse_params.add_argument( - "--service", - "-s", - type=str, - default="google", - help="The service to use for translation.", - ) - parse_params.add_argument( - "--output", - "-o", - type=str, - default="", - help="Output directory for files.", - ) - parse_params.add_argument( - "--thread", - "-t", - type=int, - default=4, - help="The number of threads to execute translation.", - ) - parse_params.add_argument( - "--interactive", - "-i", - action="store_true", - help="Interact with GUI.", - ) - parse_params.add_argument( - "--share", - action="store_true", - help="Enable Gradio Share", - ) - parse_params.add_argument( - "--flask", - action="store_true", - help="flask", - ) - parse_params.add_argument( - "--celery", - action="store_true", - help="celery", - ) - parse_params.add_argument( - "--authorized", - type=str, - nargs="+", - help="user name and password.", - ) - parse_params.add_argument( - "--prompt", - type=str, - help="user custom prompt.", - ) - - parse_params.add_argument( - "--compatible", - "-cp", - action="store_true", - help="Convert the PDF file into PDF/A format to improve compatibility.", - ) - - parse_params.add_argument( - "--onnx", - type=str, - help="custom onnx model path.", - ) - - parse_params.add_argument( - "--serverport", - type=int, - help="custom WebUI port.", - ) - - parse_params.add_argument( - "--dir", - action="store_true", - help="translate directory.", - ) - - parse_params.add_argument( - "--config", - type=str, - help="config file.", - ) - - parse_params.add_argument( - "--electron", - action="store_true", - ) - - parse_params.add_argument( - "--dry_run", - action="store_true", - ) - - return parser - - -def parse_args(args: Optional[List[str]]) -> argparse.Namespace: - parsed_args = create_parser().parse_args(args=args) - - if parsed_args.pages: - pages = [] - for p in parsed_args.pages.split(","): - if "-" in p: - start, end = p.split("-") - 
pages.extend(range(int(start) - 1, int(end))) - else: - pages.append(int(p) - 1) - parsed_args.pages = pages - - return parsed_args - - -def find_all_files_in_directory(directory_path): - """ - Recursively search all PDF files in the given directory and return their paths as a list. - - :param directory_path: str, the path to the directory to search - :return: list of PDF file paths - """ - # Check if the provided path is a directory - if not os.path.isdir(directory_path): - raise ValueError(f"The provided path '{directory_path}' is not a directory.") - - file_paths = [] - - # Walk through the directory recursively - for root, _, files in os.walk(directory_path): - for file in files: - # Check if the file is a PDF - if file.lower().endswith(".pdf"): - # Append the full file path to the list - file_paths.append(os.path.join(root, file)) - - return file_paths - - -def dry_run(): - # 定义文件路径 - base_dir = os.getcwd() # 获取当前路径 - config_dir = os.path.join(base_dir, "userdata") - config_file = os.path.join(config_dir, "config.json") - - # 确保目录存在 - if not os.path.exists(config_dir): - os.makedirs(config_dir) # 创建目录 - ConfigManager.custome_config(config_file) - - # 检查配置文件是否存在 - if not os.path.exists(config_file): - ConfigManager.get("gradio_port", 12366) - pdf2zh_path = os.path.join(base_dir, "pdf2zh_dist", "Scripts", "pdf2zh.exe") - if os.path.exists(pdf2zh_path): - ConfigManager.get("pdf2zh_path", pdf2zh_path) - else: - import shutil - - pdf2zh_path = shutil.which("pdf2zh") - if pdf2zh_path: - ConfigManager.get("pdf2zh_path", pdf2zh_path) - else: - raise ValueError("pdf2zh not found.") - - -def main(args: Optional[List[str]] = None) -> int: - logging.basicConfig() - - parsed_args = parse_args(args) - - if parsed_args.dry_run: - dry_run() - return 0 - - if parsed_args.config: - ConfigManager.custome_config(parsed_args.config) - - if parsed_args.debug: - log.setLevel(logging.DEBUG) - - if parsed_args.onnx: - ModelInstance.value = OnnxModel(parsed_args.onnx) - else: - 
ModelInstance.value = OnnxModel.load_available() - - if parsed_args.interactive: - from pdf2zh.gui import setup_gui - - if not parsed_args.electron: - if parsed_args.serverport: - setup_gui( - parsed_args.share, - parsed_args.authorized, - int(parsed_args.serverport), - ) - else: - setup_gui(parsed_args.share, parsed_args.authorized) - else: - setup_gui(parsed_args.share, parsed_args.authorized, electron=True) - return 0 - - if parsed_args.flask: - from pdf2zh.backend import flask_app - - flask_app.run(port=11008) - return 0 - - if parsed_args.celery: - from pdf2zh.backend import celery_app - - celery_app.start(argv=sys.argv[2:]) - return 0 - - if parsed_args.prompt: - try: - with open(parsed_args.prompt, "r", encoding="utf-8") as file: - content = file.read() - parsed_args.prompt = Template(content) - except Exception: - raise ValueError("prompt error.") - - print(parsed_args) - if parsed_args.dir: - untranlate_file = find_all_files_in_directory(parsed_args.files[0]) - parsed_args.files = untranlate_file - translate(model=ModelInstance.value, **vars(parsed_args)) - return 0 - - translate(model=ModelInstance.value, **vars(parsed_args)) - return 0 - - -if __name__ == "__main__": - sys.exit(main()) +#!/usr/bin/env python3 +"""A command line tool for extracting text and images from PDF and +output it to plain text, html, xml or tags. 
+""" + +from __future__ import annotations + +import argparse +import logging +import sys +from string import Template +from typing import List, Optional + +from pdf2zh import __version__, log +from pdf2zh.high_level import translate +from pdf2zh.doclayout import OnnxModel, ModelInstance +import os + +from pdf2zh.config import ConfigManager + + +def create_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description=__doc__, add_help=True) + parser.add_argument( + "files", + type=str, + default=None, + nargs="*", + help="One or more paths to PDF files.", + ) + parser.add_argument( + "--version", + "-v", + action="version", + version=f"pdf2zh v{__version__}", + ) + parser.add_argument( + "--debug", + "-d", + default=False, + action="store_true", + help="Use debug logging level.", + ) + parse_params = parser.add_argument_group( + "Parser", + description="Used during PDF parsing", + ) + parse_params.add_argument( + "--pages", + "-p", + type=str, + help="The list of page numbers to parse.", + ) + parse_params.add_argument( + "--vfont", + "-f", + type=str, + default="", + help="The regex to math font name of formula.", + ) + parse_params.add_argument( + "--vchar", + "-c", + type=str, + default="", + help="The regex to math character of formula.", + ) + parse_params.add_argument( + "--lang-in", + "-li", + type=str, + default="en", + help="The code of source language.", + ) + parse_params.add_argument( + "--lang-out", + "-lo", + type=str, + default="zh", + help="The code of target language.", + ) + parse_params.add_argument( + "--service", + "-s", + type=str, + default="google", + help="The service to use for translation.", + ) + parse_params.add_argument( + "--output", + "-o", + type=str, + default="", + help="Output directory for files.", + ) + parse_params.add_argument( + "--thread", + "-t", + type=int, + default=4, + help="The number of threads to execute translation.", + ) + parse_params.add_argument( + "--interactive", + "-i", + 
action="store_true", + help="Interact with GUI.", + ) + parse_params.add_argument( + "--share", + action="store_true", + help="Enable Gradio Share", + ) + parse_params.add_argument( + "--flask", + action="store_true", + help="flask", + ) + parse_params.add_argument( + "--celery", + action="store_true", + help="celery", + ) + parse_params.add_argument( + "--authorized", + type=str, + nargs="+", + help="user name and password.", + ) + parse_params.add_argument( + "--prompt", + type=str, + help="user custom prompt.", + ) + + parse_params.add_argument( + "--compatible", + "-cp", + action="store_true", + help="Convert the PDF file into PDF/A format to improve compatibility.", + ) + + parse_params.add_argument( + "--onnx", + type=str, + help="custom onnx model path.", + ) + + parse_params.add_argument( + "--serverport", + type=int, + help="custom WebUI port.", + ) + + parse_params.add_argument( + "--dir", + action="store_true", + help="translate directory.", + ) + + parse_params.add_argument( + "--config", + type=str, + help="config file.", + ) + + return parser + + +def parse_args(args: Optional[List[str]]) -> argparse.Namespace: + parsed_args = create_parser().parse_args(args=args) + + if parsed_args.pages: + pages = [] + for p in parsed_args.pages.split(","): + if "-" in p: + start, end = p.split("-") + pages.extend(range(int(start) - 1, int(end))) + else: + pages.append(int(p) - 1) + parsed_args.pages = pages + + return parsed_args + + +def find_all_files_in_directory(directory_path): + """ + Recursively search all PDF files in the given directory and return their paths as a list. 
+ + :param directory_path: str, the path to the directory to search + :return: list of PDF file paths + """ + # Check if the provided path is a directory + if not os.path.isdir(directory_path): + raise ValueError(f"The provided path '{directory_path}' is not a directory.") + + file_paths = [] + + # Walk through the directory recursively + for root, _, files in os.walk(directory_path): + for file in files: + # Check if the file is a PDF + if file.lower().endswith(".pdf"): + # Append the full file path to the list + file_paths.append(os.path.join(root, file)) + + return file_paths + + +def main(args: Optional[List[str]] = None) -> int: + logging.basicConfig() + + parsed_args = parse_args(args) + + if parsed_args.config: + ConfigManager.custome_config(parsed_args.config) + + if parsed_args.debug: + log.setLevel(logging.DEBUG) + + if parsed_args.onnx: + ModelInstance.value = OnnxModel(parsed_args.onnx) + else: + ModelInstance.value = OnnxModel.load_available() + + if parsed_args.interactive: + from pdf2zh.gui import setup_gui + + if parsed_args.serverport: + setup_gui( + parsed_args.share, parsed_args.authorized, int(parsed_args.serverport) + ) + else: + setup_gui(parsed_args.share, parsed_args.authorized) + return 0 + + if parsed_args.flask: + from pdf2zh.backend import flask_app + + flask_app.run(port=11008) + return 0 + + if parsed_args.celery: + from pdf2zh.backend import celery_app + + celery_app.start(argv=sys.argv[2:]) + return 0 + + if parsed_args.prompt: + try: + with open(parsed_args.prompt, "r", encoding="utf-8") as file: + content = file.read() + parsed_args.prompt = Template(content) + except Exception: + raise ValueError("prompt error.") + + print(parsed_args) + if parsed_args.dir: + untranlate_file = find_all_files_in_directory(parsed_args.files[0]) + parsed_args.files = untranlate_file + translate(model=ModelInstance.value, **vars(parsed_args)) + return 0 + + translate(model=ModelInstance.value, **vars(parsed_args)) + return 0 + + +if __name__ == 
"__main__": + sys.exit(main()) diff --git a/pdf2zh/pdfinterp.py b/pdf2zh/pdfinterp.py index 79db0c4a..62ff4ed3 100644 --- a/pdf2zh/pdfinterp.py +++ b/pdf2zh/pdfinterp.py @@ -1,364 +1,364 @@ -import logging -from typing import Any, Dict, Optional, Sequence, Tuple, cast -import numpy as np - -from pdfminer import settings -from pdfminer.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace -from pdfminer.pdfdevice import PDFDevice -from pdfminer.pdfinterp import ( - PDFPageInterpreter, - PDFResourceManager, - PDFContentParser, - PDFInterpreterError, - Color, - PDFStackT, - LITERAL_FORM, - LITERAL_IMAGE, -) -from pdfminer.pdffont import PDFFont -from pdfminer.pdfpage import PDFPage -from pdfminer.pdftypes import ( - PDFObjRef, - dict_value, - list_value, - resolve1, - stream_value, -) -from pdfminer.psexceptions import PSEOF -from pdfminer.psparser import ( - PSKeyword, - keyword_name, - literal_name, -) -from pdfminer.utils import ( - MATRIX_IDENTITY, - Matrix, - Rect, - mult_matrix, - apply_matrix_pt, -) - -log = logging.getLogger(__name__) - - -def safe_float(o: Any) -> Optional[float]: - try: - return float(o) - except (TypeError, ValueError): - return None - - -class PDFPageInterpreterEx(PDFPageInterpreter): - """Processor for the content of a PDF page - - Reference: PDF Reference, Appendix A, Operator Summary - """ - - def __init__( - self, rsrcmgr: PDFResourceManager, device: PDFDevice, obj_patch - ) -> None: - self.rsrcmgr = rsrcmgr - self.device = device - self.obj_patch = obj_patch - - def dup(self) -> "PDFPageInterpreterEx": - return self.__class__(self.rsrcmgr, self.device, self.obj_patch) - - def init_resources(self, resources: Dict[object, object]) -> None: - # 重载设置 fontid 和 descent - """Prepare the fonts and XObjects listed in the Resource attribute.""" - self.resources = resources - self.fontmap: Dict[object, PDFFont] = {} - self.fontid: Dict[PDFFont, object] = {} - self.xobjmap = {} - self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy() 
- if not resources: - return - - def get_colorspace(spec: object) -> Optional[PDFColorSpace]: - if isinstance(spec, list): - name = literal_name(spec[0]) - else: - name = literal_name(spec) - if name == "ICCBased" and isinstance(spec, list) and len(spec) >= 2: - return PDFColorSpace(name, stream_value(spec[1])["N"]) - elif name == "DeviceN" and isinstance(spec, list) and len(spec) >= 2: - return PDFColorSpace(name, len(list_value(spec[1]))) - else: - return PREDEFINED_COLORSPACE.get(name) - - for k, v in dict_value(resources).items(): - # log.debug("Resource: %r: %r", k, v) - if k == "Font": - for fontid, spec in dict_value(v).items(): - objid = None - if isinstance(spec, PDFObjRef): - objid = spec.objid - spec = dict_value(spec) - self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec) - self.fontmap[fontid].descent = 0 # hack fix descent - self.fontid[self.fontmap[fontid]] = fontid - elif k == "ColorSpace": - for csid, spec in dict_value(v).items(): - colorspace = get_colorspace(resolve1(spec)) - if colorspace is not None: - self.csmap[csid] = colorspace - elif k == "ProcSet": - self.rsrcmgr.get_procset(list_value(v)) - elif k == "XObject": - for xobjid, xobjstrm in dict_value(v).items(): - self.xobjmap[xobjid] = xobjstrm - - def do_S(self) -> None: - # 重载过滤非公式线条 - """Stroke path""" - - def is_black(color: Color) -> bool: - if isinstance(color, Tuple): - return sum(color) == 0 - else: - return color == 0 - - if ( - len(self.curpath) == 2 - and self.curpath[0][0] == "m" - and self.curpath[1][0] == "l" - and apply_matrix_pt(self.ctm, self.curpath[0][-2:])[1] - == apply_matrix_pt(self.ctm, self.curpath[1][-2:])[1] - and is_black(self.graphicstate.scolor) - ): # 独立直线,水平,黑色 - # print(apply_matrix_pt(self.ctm,self.curpath[0][-2:]),apply_matrix_pt(self.ctm,self.curpath[1][-2:]),self.graphicstate.scolor) - self.device.paint_path(self.graphicstate, True, False, False, self.curpath) - self.curpath = [] - return "n" - else: - self.curpath = [] - - 
############################################################ - # 重载过滤非公式线条(F/B) - def do_f(self) -> None: - """Fill path using nonzero winding number rule""" - # self.device.paint_path(self.graphicstate, False, True, False, self.curpath) - self.curpath = [] - - def do_F(self) -> None: - """Fill path using nonzero winding number rule (obsolete)""" - - def do_f_a(self) -> None: - """Fill path using even-odd rule""" - # self.device.paint_path(self.graphicstate, False, True, True, self.curpath) - self.curpath = [] - - def do_B(self) -> None: - """Fill and stroke path using nonzero winding number rule""" - # self.device.paint_path(self.graphicstate, True, True, False, self.curpath) - self.curpath = [] - - def do_B_a(self) -> None: - """Fill and stroke path using even-odd rule""" - # self.device.paint_path(self.graphicstate, True, True, True, self.curpath) - self.curpath = [] - - ############################################################ - # 重载返回调用参数(SCN) - def do_SCN(self) -> None: - """Set color for stroking operations.""" - if self.scs: - n = self.scs.ncomponents - else: - if settings.STRICT: - raise PDFInterpreterError("No colorspace specified!") - n = 1 - args = self.pop(n) - self.graphicstate.scolor = cast(Color, args) - return args - - def do_scn(self) -> None: - """Set color for nonstroking operations""" - if self.ncs: - n = self.ncs.ncomponents - else: - if settings.STRICT: - raise PDFInterpreterError("No colorspace specified!") - n = 1 - args = self.pop(n) - self.graphicstate.ncolor = cast(Color, args) - return args - - def do_SC(self) -> None: - """Set color for stroking operations""" - return self.do_SCN() - - def do_sc(self) -> None: - """Set color for nonstroking operations""" - return self.do_scn() - - def do_Do(self, xobjid_arg: PDFStackT) -> None: - # 重载设置 xobj 的 obj_patch - """Invoke named XObject""" - xobjid = literal_name(xobjid_arg) - try: - xobj = stream_value(self.xobjmap[xobjid]) - except KeyError: - if settings.STRICT: - raise 
PDFInterpreterError("Undefined xobject id: %r" % xobjid) - return - # log.debug("Processing xobj: %r", xobj) - subtype = xobj.get("Subtype") - if subtype is LITERAL_FORM and "BBox" in xobj: - interpreter = self.dup() - bbox = cast(Rect, list_value(xobj["BBox"])) - matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY))) - # According to PDF reference 1.7 section 4.9.1, XObjects in - # earlier PDFs (prior to v1.2) use the page's Resources entry - # instead of having their own Resources entry. - xobjres = xobj.get("Resources") - if xobjres: - resources = dict_value(xobjres) - else: - resources = self.resources.copy() - self.device.begin_figure(xobjid, bbox, matrix) - ctm = mult_matrix(matrix, self.ctm) - ops_base = interpreter.render_contents( - resources, - [xobj], - ctm=ctm, - ) - try: # 有的时候 form 字体加不上这里会烂掉 - self.device.fontid = interpreter.fontid - self.device.fontmap = interpreter.fontmap - ops_new = self.device.end_figure(xobjid) - ctm_inv = np.linalg.inv(np.array(ctm[:4]).reshape(2, 2)) - np_version = np.__version__ - if np_version.split(".")[0] >= "2": - pos_inv = -np.asmatrix(ctm[4:]) * ctm_inv - else: - pos_inv = -np.mat(ctm[4:]) * ctm_inv - a, b, c, d = ctm_inv.reshape(4).tolist() - e, f = pos_inv.tolist()[0] - self.obj_patch[self.xobjmap[xobjid].objid] = ( - f"q {ops_base}Q {a} {b} {c} {d} {e} {f} cm {ops_new}" - ) - except Exception: - pass - elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj: - self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY) - self.device.render_image(xobjid, xobj) - self.device.end_figure(xobjid) - else: - # unsupported xobject type. 
- pass - - def process_page(self, page: PDFPage) -> None: - # 重载设置 page 的 obj_patch - # log.debug("Processing page: %r", page) - # print(page.mediabox,page.cropbox) - # (x0, y0, x1, y1) = page.mediabox - (x0, y0, x1, y1) = page.cropbox - if page.rotate == 90: - ctm = (0, -1, 1, 0, -y0, x1) - elif page.rotate == 180: - ctm = (-1, 0, 0, -1, x1, y1) - elif page.rotate == 270: - ctm = (0, 1, -1, 0, y1, -x0) - else: - ctm = (1, 0, 0, 1, -x0, -y0) - self.device.begin_page(page, ctm) - ops_base = self.render_contents(page.resources, page.contents, ctm=ctm) - self.device.fontid = self.fontid - self.device.fontmap = self.fontmap - ops_new = self.device.end_page(page) - # 上面渲染的时候会根据 cropbox 减掉页面偏移得到真实坐标,这里输出的时候需要用 cm 把页面偏移加回来 - self.obj_patch[page.page_xref] = ( - f"q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}" # ops_base 里可能有图,需要让 ops_new 里的文字覆盖在上面,使用 q/Q 重置位置矩阵 - ) - for obj in page.contents: - self.obj_patch[obj.objid] = "" - - def render_contents( - self, - resources: Dict[object, object], - streams: Sequence[object], - ctm: Matrix = MATRIX_IDENTITY, - ) -> None: - # 重载返回指令流 - """Render the content streams. - - This method may be called recursively. 
- """ - # log.debug( - # "render_contents: resources=%r, streams=%r, ctm=%r", - # resources, - # streams, - # ctm, - # ) - self.init_resources(resources) - self.init_state(ctm) - return self.execute(list_value(streams)) - - def execute(self, streams: Sequence[object]) -> None: - # 重载返回指令流 - ops = "" - try: - parser = PDFContentParser(streams) - except PSEOF: - # empty page - return - while True: - try: - (_, obj) = parser.nextobject() - except PSEOF: - break - if isinstance(obj, PSKeyword): - name = keyword_name(obj) - method = "do_%s" % name.replace("*", "_a").replace('"', "_w").replace( - "'", - "_q", - ) - if hasattr(self, method): - func = getattr(self, method) - nargs = func.__code__.co_argcount - 1 - if nargs: - args = self.pop(nargs) - # log.debug("exec: %s %r", name, args) - if len(args) == nargs: - func(*args) - if not ( - name[0] == "T" - or name in ['"', "'", "EI", "MP", "DP", "BMC", "BDC"] - ): # 过滤 T 系列文字指令,因为 EI 的参数是 obj 所以也需要过滤(只在少数文档中画横线时使用),过滤 marked 系列指令 - p = " ".join( - [ - ( - f"{x:f}" - if isinstance(x, float) - else str(x).replace("'", "") - ) - for x in args - ] - ) - ops += f"{p} {name} " - else: - # log.debug("exec: %s", name) - targs = func() - if targs is None: - targs = [] - if not (name[0] == "T" or name in ["BI", "ID", "EMC"]): - p = " ".join( - [ - ( - f"{x:f}" - if isinstance(x, float) - else str(x).replace("'", "") - ) - for x in targs - ] - ) - ops += f"{p} {name} " - elif settings.STRICT: - error_msg = "Unknown operator: %r" % name - raise PDFInterpreterError(error_msg) - else: - self.push(obj) - # print('REV DATA',ops) - return ops +import logging +from typing import Any, Dict, Optional, Sequence, Tuple, cast +import numpy as np + +from pdfminer import settings +from pdfminer.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace +from pdfminer.pdfdevice import PDFDevice +from pdfminer.pdfinterp import ( + PDFPageInterpreter, + PDFResourceManager, + PDFContentParser, + PDFInterpreterError, + Color, + PDFStackT, + LITERAL_FORM, + 
LITERAL_IMAGE, +) +from pdfminer.pdffont import PDFFont +from pdfminer.pdfpage import PDFPage +from pdfminer.pdftypes import ( + PDFObjRef, + dict_value, + list_value, + resolve1, + stream_value, +) +from pdfminer.psexceptions import PSEOF +from pdfminer.psparser import ( + PSKeyword, + keyword_name, + literal_name, +) +from pdfminer.utils import ( + MATRIX_IDENTITY, + Matrix, + Rect, + mult_matrix, + apply_matrix_pt, +) + +log = logging.getLogger(__name__) + + +def safe_float(o: Any) -> Optional[float]: + try: + return float(o) + except (TypeError, ValueError): + return None + + +class PDFPageInterpreterEx(PDFPageInterpreter): + """Processor for the content of a PDF page + + Reference: PDF Reference, Appendix A, Operator Summary + """ + + def __init__( + self, rsrcmgr: PDFResourceManager, device: PDFDevice, obj_patch + ) -> None: + self.rsrcmgr = rsrcmgr + self.device = device + self.obj_patch = obj_patch + + def dup(self) -> "PDFPageInterpreterEx": + return self.__class__(self.rsrcmgr, self.device, self.obj_patch) + + def init_resources(self, resources: Dict[object, object]) -> None: + # 重载设置 fontid 和 descent + """Prepare the fonts and XObjects listed in the Resource attribute.""" + self.resources = resources + self.fontmap: Dict[object, PDFFont] = {} + self.fontid: Dict[PDFFont, object] = {} + self.xobjmap = {} + self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy() + if not resources: + return + + def get_colorspace(spec: object) -> Optional[PDFColorSpace]: + if isinstance(spec, list): + name = literal_name(spec[0]) + else: + name = literal_name(spec) + if name == "ICCBased" and isinstance(spec, list) and len(spec) >= 2: + return PDFColorSpace(name, stream_value(spec[1])["N"]) + elif name == "DeviceN" and isinstance(spec, list) and len(spec) >= 2: + return PDFColorSpace(name, len(list_value(spec[1]))) + else: + return PREDEFINED_COLORSPACE.get(name) + + for k, v in dict_value(resources).items(): + # log.debug("Resource: %r: %r", k, v) + if k == 
"Font": + for fontid, spec in dict_value(v).items(): + objid = None + if isinstance(spec, PDFObjRef): + objid = spec.objid + spec = dict_value(spec) + self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec) + self.fontmap[fontid].descent = 0 # hack fix descent + self.fontid[self.fontmap[fontid]] = fontid + elif k == "ColorSpace": + for csid, spec in dict_value(v).items(): + colorspace = get_colorspace(resolve1(spec)) + if colorspace is not None: + self.csmap[csid] = colorspace + elif k == "ProcSet": + self.rsrcmgr.get_procset(list_value(v)) + elif k == "XObject": + for xobjid, xobjstrm in dict_value(v).items(): + self.xobjmap[xobjid] = xobjstrm + + def do_S(self) -> None: + # 重载过滤非公式线条 + """Stroke path""" + + def is_black(color: Color) -> bool: + if isinstance(color, Tuple): + return sum(color) == 0 + else: + return color == 0 + + if ( + len(self.curpath) == 2 + and self.curpath[0][0] == "m" + and self.curpath[1][0] == "l" + and apply_matrix_pt(self.ctm, self.curpath[0][-2:])[1] + == apply_matrix_pt(self.ctm, self.curpath[1][-2:])[1] + and is_black(self.graphicstate.scolor) + ): # 独立直线,水平,黑色 + # print(apply_matrix_pt(self.ctm,self.curpath[0][-2:]),apply_matrix_pt(self.ctm,self.curpath[1][-2:]),self.graphicstate.scolor) + self.device.paint_path(self.graphicstate, True, False, False, self.curpath) + self.curpath = [] + return "n" + else: + self.curpath = [] + + ############################################################ + # 重载过滤非公式线条(F/B) + def do_f(self) -> None: + """Fill path using nonzero winding number rule""" + # self.device.paint_path(self.graphicstate, False, True, False, self.curpath) + self.curpath = [] + + def do_F(self) -> None: + """Fill path using nonzero winding number rule (obsolete)""" + + def do_f_a(self) -> None: + """Fill path using even-odd rule""" + # self.device.paint_path(self.graphicstate, False, True, True, self.curpath) + self.curpath = [] + + def do_B(self) -> None: + """Fill and stroke path using nonzero winding number rule""" + # 
self.device.paint_path(self.graphicstate, True, True, False, self.curpath) + self.curpath = [] + + def do_B_a(self) -> None: + """Fill and stroke path using even-odd rule""" + # self.device.paint_path(self.graphicstate, True, True, True, self.curpath) + self.curpath = [] + + ############################################################ + # 重载返回调用参数(SCN) + def do_SCN(self) -> None: + """Set color for stroking operations.""" + if self.scs: + n = self.scs.ncomponents + else: + if settings.STRICT: + raise PDFInterpreterError("No colorspace specified!") + n = 1 + args = self.pop(n) + self.graphicstate.scolor = cast(Color, args) + return args + + def do_scn(self) -> None: + """Set color for nonstroking operations""" + if self.ncs: + n = self.ncs.ncomponents + else: + if settings.STRICT: + raise PDFInterpreterError("No colorspace specified!") + n = 1 + args = self.pop(n) + self.graphicstate.ncolor = cast(Color, args) + return args + + def do_SC(self) -> None: + """Set color for stroking operations""" + return self.do_SCN() + + def do_sc(self) -> None: + """Set color for nonstroking operations""" + return self.do_scn() + + def do_Do(self, xobjid_arg: PDFStackT) -> None: + # 重载设置 xobj 的 obj_patch + """Invoke named XObject""" + xobjid = literal_name(xobjid_arg) + try: + xobj = stream_value(self.xobjmap[xobjid]) + except KeyError: + if settings.STRICT: + raise PDFInterpreterError("Undefined xobject id: %r" % xobjid) + return + # log.debug("Processing xobj: %r", xobj) + subtype = xobj.get("Subtype") + if subtype is LITERAL_FORM and "BBox" in xobj: + interpreter = self.dup() + bbox = cast(Rect, list_value(xobj["BBox"])) + matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY))) + # According to PDF reference 1.7 section 4.9.1, XObjects in + # earlier PDFs (prior to v1.2) use the page's Resources entry + # instead of having their own Resources entry. 
+ xobjres = xobj.get("Resources") + if xobjres: + resources = dict_value(xobjres) + else: + resources = self.resources.copy() + self.device.begin_figure(xobjid, bbox, matrix) + ctm = mult_matrix(matrix, self.ctm) + ops_base = interpreter.render_contents( + resources, + [xobj], + ctm=ctm, + ) + try: # 有的时候 form 字体加不上这里会烂掉 + self.device.fontid = interpreter.fontid + self.device.fontmap = interpreter.fontmap + ops_new = self.device.end_figure(xobjid) + ctm_inv = np.linalg.inv(np.array(ctm[:4]).reshape(2, 2)) + np_version = np.__version__ + if np_version.split(".")[0] >= "2": + pos_inv = -np.asmatrix(ctm[4:]) * ctm_inv + else: + pos_inv = -np.mat(ctm[4:]) * ctm_inv + a, b, c, d = ctm_inv.reshape(4).tolist() + e, f = pos_inv.tolist()[0] + self.obj_patch[self.xobjmap[xobjid].objid] = ( + f"q {ops_base}Q {a} {b} {c} {d} {e} {f} cm {ops_new}" + ) + except Exception: + pass + elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj: + self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY) + self.device.render_image(xobjid, xobj) + self.device.end_figure(xobjid) + else: + # unsupported xobject type. 
+ pass + + def process_page(self, page: PDFPage) -> None: + # 重载设置 page 的 obj_patch + # log.debug("Processing page: %r", page) + # print(page.mediabox,page.cropbox) + # (x0, y0, x1, y1) = page.mediabox + (x0, y0, x1, y1) = page.cropbox + if page.rotate == 90: + ctm = (0, -1, 1, 0, -y0, x1) + elif page.rotate == 180: + ctm = (-1, 0, 0, -1, x1, y1) + elif page.rotate == 270: + ctm = (0, 1, -1, 0, y1, -x0) + else: + ctm = (1, 0, 0, 1, -x0, -y0) + self.device.begin_page(page, ctm) + ops_base = self.render_contents(page.resources, page.contents, ctm=ctm) + self.device.fontid = self.fontid + self.device.fontmap = self.fontmap + ops_new = self.device.end_page(page) + # 上面渲染的时候会根据 cropbox 减掉页面偏移得到真实坐标,这里输出的时候需要用 cm 把页面偏移加回来 + self.obj_patch[page.page_xref] = ( + f"q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}" # ops_base 里可能有图,需要让 ops_new 里的文字覆盖在上面,使用 q/Q 重置位置矩阵 + ) + for obj in page.contents: + self.obj_patch[obj.objid] = "" + + def render_contents( + self, + resources: Dict[object, object], + streams: Sequence[object], + ctm: Matrix = MATRIX_IDENTITY, + ) -> None: + # 重载返回指令流 + """Render the content streams. + + This method may be called recursively. 
+ """ + # log.debug( + # "render_contents: resources=%r, streams=%r, ctm=%r", + # resources, + # streams, + # ctm, + # ) + self.init_resources(resources) + self.init_state(ctm) + return self.execute(list_value(streams)) + + def execute(self, streams: Sequence[object]) -> None: + # 重载返回指令流 + ops = "" + try: + parser = PDFContentParser(streams) + except PSEOF: + # empty page + return + while True: + try: + (_, obj) = parser.nextobject() + except PSEOF: + break + if isinstance(obj, PSKeyword): + name = keyword_name(obj) + method = "do_%s" % name.replace("*", "_a").replace('"', "_w").replace( + "'", + "_q", + ) + if hasattr(self, method): + func = getattr(self, method) + nargs = func.__code__.co_argcount - 1 + if nargs: + args = self.pop(nargs) + # log.debug("exec: %s %r", name, args) + if len(args) == nargs: + func(*args) + if not ( + name[0] == "T" + or name in ['"', "'", "EI", "MP", "DP", "BMC", "BDC"] + ): # 过滤 T 系列文字指令,因为 EI 的参数是 obj 所以也需要过滤(只在少数文档中画横线时使用),过滤 marked 系列指令 + p = " ".join( + [ + ( + f"{x:f}" + if isinstance(x, float) + else str(x).replace("'", "") + ) + for x in args + ] + ) + ops += f"{p} {name} " + else: + # log.debug("exec: %s", name) + targs = func() + if targs is None: + targs = [] + if not (name[0] == "T" or name in ["BI", "ID", "EMC"]): + p = " ".join( + [ + ( + f"{x:f}" + if isinstance(x, float) + else str(x).replace("'", "") + ) + for x in targs + ] + ) + ops += f"{p} {name} " + elif settings.STRICT: + error_msg = "Unknown operator: %r" % name + raise PDFInterpreterError(error_msg) + else: + self.push(obj) + # print('REV DATA',ops) + return ops diff --git a/pdf2zh/translator.py b/pdf2zh/translator.py index 2c3693a2..89e461ba 100644 --- a/pdf2zh/translator.py +++ b/pdf2zh/translator.py @@ -1,806 +1,806 @@ -import html -import logging -import os -import re -import unicodedata -from copy import copy -import deepl -import ollama -import openai -import xinference_client -import requests -from pdf2zh.cache import TranslationCache -from 
azure.ai.translation.text import TextTranslationClient -from azure.core.credentials import AzureKeyCredential -from tencentcloud.common import credential -from tencentcloud.tmt.v20180321.tmt_client import TmtClient -from tencentcloud.tmt.v20180321.models import TextTranslateRequest -from tencentcloud.tmt.v20180321.models import TextTranslateResponse -import argostranslate.package -import argostranslate.translate - -import json -from pdf2zh.config import ConfigManager - - -def remove_control_characters(s): - return "".join(ch for ch in s if unicodedata.category(ch)[0] != "C") - - -class BaseTranslator: - name = "base" - envs = {} - lang_map = {} - CustomPrompt = False - ignore_cache = False - - def __init__(self, lang_in, lang_out, model): - lang_in = self.lang_map.get(lang_in.lower(), lang_in) - lang_out = self.lang_map.get(lang_out.lower(), lang_out) - self.lang_in = lang_in - self.lang_out = lang_out - self.model = model - - self.cache = TranslationCache( - self.name, - { - "lang_in": lang_in, - "lang_out": lang_out, - "model": model, - }, - ) - - def set_envs(self, envs): - # Detach from self.__class__.envs - # Cannot use self.envs = copy(self.__class__.envs) - # because if set_envs called twice, the second call will override the first call - self.envs = copy(self.envs) - if ConfigManager.get_translator_by_name(self.name): - self.envs = ConfigManager.get_translator_by_name(self.name) - needUpdate = False - for key in self.envs: - if key in os.environ: - self.envs[key] = os.environ[key] - needUpdate = True - if needUpdate: - ConfigManager.set_translator_by_name(self.name, self.envs) - if envs is not None: - for key in envs: - self.envs[key] = envs[key] - ConfigManager.set_translator_by_name(self.name, self.envs) - - def add_cache_impact_parameters(self, k: str, v): - """ - Add parameters that affect the translation quality to distinguish the translation effects under different parameters. 
- :param k: key - :param v: value - """ - self.cache.add_params(k, v) - - def translate(self, text, ignore_cache=False): - """ - Translate the text, and the other part should call this method. - :param text: text to translate - :return: translated text - """ - if not (self.ignore_cache or ignore_cache): - cache = self.cache.get(text) - if cache is not None: - return cache - - translation = self.do_translate(text) - self.cache.set(text, translation) - return translation - - def do_translate(self, text): - """ - Actual translate text, override this method - :param text: text to translate - :return: translated text - """ - raise NotImplementedError - - def prompt(self, text, prompt): - if prompt: - context = { - "lang_in": self.lang_in, - "lang_out": self.lang_out, - "text": text, - } - return eval(prompt.safe_substitute(context)) - else: - return [ - { - "role": "system", - "content": "You are a professional,authentic machine translation engine.", - }, - { - "role": "user", - "content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation {{v*}} unchanged. 
Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:", # noqa: E501 - }, - ] - - def __str__(self): - return f"{self.name} {self.lang_in} {self.lang_out} {self.model}" - - -class GoogleTranslator(BaseTranslator): - name = "google" - lang_map = {"zh": "zh-CN"} - - def __init__(self, lang_in, lang_out, model, **kwargs): - super().__init__(lang_in, lang_out, model) - self.session = requests.Session() - self.endpoint = "http://translate.google.com/m" - self.headers = { - "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)" # noqa: E501 - } - - def do_translate(self, text): - text = text[:5000] # google translate max length - response = self.session.get( - self.endpoint, - params={"tl": self.lang_out, "sl": self.lang_in, "q": text}, - headers=self.headers, - ) - re_result = re.findall( - r'(?s)class="(?:t0|result-container)">(.*?)<', response.text - ) - if response.status_code == 400: - result = "IRREPARABLE TRANSLATION ERROR" - else: - response.raise_for_status() - result = html.unescape(re_result[0]) - return remove_control_characters(result) - - -class BingTranslator(BaseTranslator): - # https://github.com/immersive-translate/old-immersive-translate/blob/6df13da22664bea2f51efe5db64c63aca59c4e79/src/background/translationService.js - name = "bing" - lang_map = {"zh": "zh-Hans"} - - def __init__(self, lang_in, lang_out, model, **kwargs): - super().__init__(lang_in, lang_out, model) - self.session = requests.Session() - self.endpoint = "https://www.bing.com/translator" - self.headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0", # noqa: E501 - } - - def find_sid(self): - response = self.session.get(self.endpoint) - response.raise_for_status() - url = response.url[:-10] - ig = re.findall(r"\"ig\":\"(.*?)\"", response.text)[0] - iid = 
re.findall(r"data-iid=\"(.*?)\"", response.text)[-1] - key, token = re.findall( - r"params_AbusePreventionHelper\s=\s\[(.*?),\"(.*?)\",", response.text - )[0] - return url, ig, iid, key, token - - def do_translate(self, text): - text = text[:1000] # bing translate max length - url, ig, iid, key, token = self.find_sid() - response = self.session.post( - f"{url}ttranslatev3?IG={ig}&IID={iid}", - data={ - "fromLang": self.lang_in, - "to": self.lang_out, - "text": text, - "token": token, - "key": key, - }, - headers=self.headers, - ) - response.raise_for_status() - return response.json()[0]["translations"][0]["text"] - - -class DeepLTranslator(BaseTranslator): - # https://github.com/DeepLcom/deepl-python - name = "deepl" - envs = { - "DEEPL_AUTH_KEY": None, - } - lang_map = {"zh": "zh-Hans"} - - def __init__(self, lang_in, lang_out, model, envs=None, **kwargs): - self.set_envs(envs) - super().__init__(lang_in, lang_out, model) - auth_key = self.envs["DEEPL_AUTH_KEY"] - self.client = deepl.Translator(auth_key) - - def do_translate(self, text): - response = self.client.translate_text( - text, target_lang=self.lang_out, source_lang=self.lang_in - ) - return response.text - - -class DeepLXTranslator(BaseTranslator): - # https://deeplx.owo.network/endpoints/free.html - name = "deeplx" - envs = { - "DEEPLX_ENDPOINT": "https://api.deepl.com/translate", - "DEEPLX_ACCESS_TOKEN": None, - } - lang_map = {"zh": "zh-Hans"} - - def __init__(self, lang_in, lang_out, model, envs=None, **kwargs): - self.set_envs(envs) - super().__init__(lang_in, lang_out, model) - self.endpoint = self.envs["DEEPLX_ENDPOINT"] - self.session = requests.Session() - auth_key = self.envs["DEEPLX_ACCESS_TOKEN"] - if auth_key: - self.endpoint = f"{self.endpoint}?token={auth_key}" - - def do_translate(self, text): - response = self.session.post( - self.endpoint, - json={ - "source_lang": self.lang_in, - "target_lang": self.lang_out, - "text": text, - }, - ) - response.raise_for_status() - return 
response.json()["data"] - - -class OllamaTranslator(BaseTranslator): - # https://github.com/ollama/ollama-python - name = "ollama" - envs = { - "OLLAMA_HOST": "http://127.0.0.1:11434", - "OLLAMA_MODEL": "gemma2", - } - CustomPrompt = True - - def __init__(self, lang_in, lang_out, model, envs=None, prompt=None): - self.set_envs(envs) - if not model: - model = self.envs["OLLAMA_MODEL"] - super().__init__(lang_in, lang_out, model) - self.options = {"temperature": 0} # 随机采样可能会打断公式标记 - self.client = ollama.Client() - self.prompttext = prompt - self.add_cache_impact_parameters("temperature", self.options["temperature"]) - if prompt: - self.add_cache_impact_parameters("prompt", prompt.template) - - def do_translate(self, text): - maxlen = max(2000, len(text) * 5) - for model in self.model.split(";"): - try: - response = "" - stream = self.client.chat( - model=model, - options=self.options, - messages=self.prompt(text, self.prompttext), - stream=True, - ) - for chunk in stream: - chunk = chunk["message"]["content"] - response += chunk - if len(response) > maxlen: - raise Exception("Response too long") - return response.strip() - except Exception as e: - print(e) - raise Exception("All models failed") - - -class XinferenceTranslator(BaseTranslator): - # https://github.com/xorbitsai/inference - name = "xinference" - envs = { - "XINFERENCE_HOST": "http://127.0.0.1:9997", - "XINFERENCE_MODEL": "gemma-2-it", - } - CustomPrompt = True - - def __init__(self, lang_in, lang_out, model, envs=None, prompt=None): - self.set_envs(envs) - if not model: - model = self.envs["XINFERENCE_MODEL"] - super().__init__(lang_in, lang_out, model) - self.options = {"temperature": 0} # 随机采样可能会打断公式标记 - self.client = xinference_client.RESTfulClient(self.envs["XINFERENCE_HOST"]) - self.prompttext = prompt - self.add_cache_impact_parameters("temperature", self.options["temperature"]) - if prompt: - self.add_cache_impact_parameters("prompt", prompt.template) - - def do_translate(self, text): - maxlen = 
max(2000, len(text) * 5) - for model in self.model.split(";"): - try: - xf_model = self.client.get_model(model) - xf_prompt = self.prompt(text, self.prompttext) - xf_prompt = [ - { - "role": "user", - "content": xf_prompt[0]["content"] - + "\n" - + xf_prompt[1]["content"], - } - ] - response = xf_model.chat( - generate_config=self.options, - messages=xf_prompt, - ) - - response = response["choices"][0]["message"]["content"].replace( - "", "" - ) - if len(response) > maxlen: - raise Exception("Response too long") - return response.strip() - except Exception as e: - print(e) - raise Exception("All models failed") - - -class OpenAITranslator(BaseTranslator): - # https://github.com/openai/openai-python - name = "openai" - envs = { - "OPENAI_BASE_URL": "https://api.openai.com/v1", - "OPENAI_API_KEY": None, - "OPENAI_MODEL": "gpt-4o-mini", - } - CustomPrompt = True - - def __init__( - self, - lang_in, - lang_out, - model, - base_url=None, - api_key=None, - envs=None, - prompt=None, - ): - self.set_envs(envs) - if not model: - model = self.envs["OPENAI_MODEL"] - super().__init__(lang_in, lang_out, model) - self.options = {"temperature": 0} # 随机采样可能会打断公式标记 - self.client = openai.OpenAI( - base_url=base_url or self.envs["OPENAI_BASE_URL"], - api_key=api_key or self.envs["OPENAI_API_KEY"], - ) - self.prompttext = prompt - self.add_cache_impact_parameters("temperature", self.options["temperature"]) - if prompt: - self.add_cache_impact_parameters("prompt", prompt.template) - - def do_translate(self, text) -> str: - response = self.client.chat.completions.create( - model=self.model, - **self.options, - messages=self.prompt(text, self.prompttext), - ) - return response.choices[0].message.content.strip() - - -class AzureOpenAITranslator(BaseTranslator): - name = "azure-openai" - envs = { - "AZURE_OPENAI_BASE_URL": None, # e.g. 
"https://xxx.openai.azure.com" - "AZURE_OPENAI_API_KEY": None, - "AZURE_OPENAI_MODEL": "gpt-4o-mini", - } - CustomPrompt = True - - def __init__( - self, - lang_in, - lang_out, - model, - base_url=None, - api_key=None, - envs=None, - prompt=None, - ): - self.set_envs(envs) - base_url = self.envs["AZURE_OPENAI_BASE_URL"] - if not model: - model = self.envs["AZURE_OPENAI_MODEL"] - super().__init__(lang_in, lang_out, model) - self.options = {"temperature": 0} - self.client = openai.AzureOpenAI( - azure_endpoint=base_url, - azure_deployment=model, - api_version="2024-06-01", - api_key=api_key, - ) - self.prompttext = prompt - self.add_cache_impact_parameters("temperature", self.options["temperature"]) - if prompt: - self.add_cache_impact_parameters("prompt", prompt.template) - - def do_translate(self, text) -> str: - response = self.client.chat.completions.create( - model=self.model, - **self.options, - messages=self.prompt(text, self.prompttext), - ) - return response.choices[0].message.content.strip() - - -class ModelScopeTranslator(OpenAITranslator): - name = "modelscope" - envs = { - "MODELSCOPE_BASE_URL": "https://api-inference.modelscope.cn/v1", - "MODELSCOPE_API_KEY": None, - "MODELSCOPE_MODEL": "Qwen/Qwen2.5-32B-Instruct", - } - CustomPrompt = True - - def __init__( - self, - lang_in, - lang_out, - model, - base_url=None, - api_key=None, - envs=None, - prompt=None, - ): - self.set_envs(envs) - base_url = "https://api-inference.modelscope.cn/v1" - api_key = self.envs["MODELSCOPE_API_KEY"] - if not model: - model = self.envs["MODELSCOPE_MODEL"] - super().__init__(lang_in, lang_out, model, base_url=base_url, api_key=api_key) - self.prompttext = prompt - if prompt: - self.add_cache_impact_parameters("prompt", prompt.template) - - -class ZhipuTranslator(OpenAITranslator): - # https://bigmodel.cn/dev/api/thirdparty-frame/openai-sdk - name = "zhipu" - envs = { - "ZHIPU_API_KEY": None, - "ZHIPU_MODEL": "glm-4-flash", - } - CustomPrompt = True - - def __init__(self, 
lang_in, lang_out, model, envs=None, prompt=None): - self.set_envs(envs) - base_url = "https://open.bigmodel.cn/api/paas/v4" - api_key = self.envs["ZHIPU_API_KEY"] - if not model: - model = self.envs["ZHIPU_MODEL"] - super().__init__(lang_in, lang_out, model, base_url=base_url, api_key=api_key) - self.prompttext = prompt - if prompt: - self.add_cache_impact_parameters("prompt", prompt.template) - - def do_translate(self, text) -> str: - try: - response = self.client.chat.completions.create( - model=self.model, - **self.options, - messages=self.prompt(text, self.prompttext), - ) - except openai.BadRequestError as e: - if ( - json.loads(response.choices[0].message.content.strip())["error"]["code"] - == "1301" - ): - return "IRREPARABLE TRANSLATION ERROR" - raise e - return response.choices[0].message.content.strip() - - -class SiliconTranslator(OpenAITranslator): - # https://docs.siliconflow.cn/quickstart - name = "silicon" - envs = { - "SILICON_API_KEY": None, - "SILICON_MODEL": "Qwen/Qwen2.5-7B-Instruct", - } - CustomPrompt = True - - def __init__(self, lang_in, lang_out, model, envs=None, prompt=None): - self.set_envs(envs) - base_url = "https://api.siliconflow.cn/v1" - api_key = self.envs["SILICON_API_KEY"] - if not model: - model = self.envs["SILICON_MODEL"] - super().__init__(lang_in, lang_out, model, base_url=base_url, api_key=api_key) - self.prompttext = prompt - if prompt: - self.add_cache_impact_parameters("prompt", prompt.template) - - -class GeminiTranslator(OpenAITranslator): - # https://ai.google.dev/gemini-api/docs/openai - name = "gemini" - envs = { - "GEMINI_API_KEY": None, - "GEMINI_MODEL": "gemini-1.5-flash", - } - CustomPrompt = True - - def __init__(self, lang_in, lang_out, model, envs=None, prompt=None): - self.set_envs(envs) - base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" - api_key = self.envs["GEMINI_API_KEY"] - if not model: - model = self.envs["GEMINI_MODEL"] - super().__init__(lang_in, lang_out, model, 
base_url=base_url, api_key=api_key) - self.prompttext = prompt - if prompt: - self.add_cache_impact_parameters("prompt", prompt.template) - - -class AzureTranslator(BaseTranslator): - # https://github.com/Azure/azure-sdk-for-python - name = "azure" - envs = { - "AZURE_ENDPOINT": "https://api.translator.azure.cn", - "AZURE_API_KEY": None, - } - lang_map = {"zh": "zh-Hans"} - - def __init__(self, lang_in, lang_out, model, envs=None, **kwargs): - self.set_envs(envs) - super().__init__(lang_in, lang_out, model) - endpoint = self.envs["AZURE_ENDPOINT"] - api_key = self.envs["AZURE_API_KEY"] - credential = AzureKeyCredential(api_key) - self.client = TextTranslationClient( - endpoint=endpoint, credential=credential, region="chinaeast2" - ) - # https://github.com/Azure/azure-sdk-for-python/issues/9422 - logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy") - logger.setLevel(logging.WARNING) - - def do_translate(self, text) -> str: - response = self.client.translate( - body=[text], - from_language=self.lang_in, - to_language=[self.lang_out], - ) - translated_text = response[0].translations[0].text - return translated_text - - -class TencentTranslator(BaseTranslator): - # https://github.com/TencentCloud/tencentcloud-sdk-python - name = "tencent" - envs = { - "TENCENTCLOUD_SECRET_ID": None, - "TENCENTCLOUD_SECRET_KEY": None, - } - - def __init__(self, lang_in, lang_out, model, envs=None, **kwargs): - self.set_envs(envs) - super().__init__(lang_in, lang_out, model) - cred = credential.DefaultCredentialProvider().get_credential() - self.client = TmtClient(cred, "ap-beijing") - self.req = TextTranslateRequest() - self.req.Source = self.lang_in - self.req.Target = self.lang_out - self.req.ProjectId = 0 - - def do_translate(self, text): - self.req.SourceText = text - resp: TextTranslateResponse = self.client.TextTranslate(self.req) - return resp.TargetText - - -class AnythingLLMTranslator(BaseTranslator): - name = "anythingllm" - envs = { - 
"AnythingLLM_URL": None, - "AnythingLLM_APIKEY": None, - } - CustomPrompt = True - - def __init__(self, lang_out, lang_in, model, envs=None, prompt=None): - self.set_envs(envs) - super().__init__(lang_out, lang_in, model) - self.api_url = self.envs["AnythingLLM_URL"] - self.api_key = self.envs["AnythingLLM_APIKEY"] - self.headers = { - "accept": "application/json", - "Authorization": f"Bearer {self.api_key}", - "Content-Type": "application/json", - } - self.prompttext = prompt - if prompt: - self.add_cache_impact_parameters("prompt", prompt.template) - - def do_translate(self, text): - messages = self.prompt(text, self.prompttext) - payload = { - "message": messages, - "mode": "chat", - "sessionId": "translation_expert", - } - - response = requests.post( - self.api_url, headers=self.headers, data=json.dumps(payload) - ) - response.raise_for_status() - data = response.json() - - if "textResponse" in data: - return data["textResponse"].strip() - - -class DifyTranslator(BaseTranslator): - name = "dify" - envs = { - "DIFY_API_URL": None, # 填写实际 Dify API 地址 - "DIFY_API_KEY": None, # 替换为实际 API 密钥 - } - - def __init__(self, lang_out, lang_in, model, envs=None, **kwargs): - self.set_envs(envs) - super().__init__(lang_out, lang_in, model) - self.api_url = self.envs["DIFY_API_URL"] - self.api_key = self.envs["DIFY_API_KEY"] - - def do_translate(self, text): - headers = { - "Authorization": f"Bearer {self.api_key}", - "Content-Type": "application/json", - } - - payload = { - "inputs": { - "lang_out": self.lang_out, - "lang_in": self.lang_in, - "text": text, - }, - "response_mode": "blocking", - "user": "translator-service", - } - - # 向 Dify 服务器发送请求 - response = requests.post( - self.api_url, headers=headers, data=json.dumps(payload) - ) - response.raise_for_status() - response_data = response.json() - - # 解析响应 - return response_data.get("data", {}).get("outputs", {}).get("text", []) - - -class ArgosTranslator(BaseTranslator): - name = "argos" - - def __init__(self, lang_in, 
lang_out, model, **kwargs): - super().__init__(lang_in, lang_out, model) - lang_in = self.lang_map.get(lang_in.lower(), lang_in) - lang_out = self.lang_map.get(lang_out.lower(), lang_out) - self.lang_in = lang_in - self.lang_out = lang_out - argostranslate.package.update_package_index() - available_packages = argostranslate.package.get_available_packages() - try: - available_package = list( - filter( - lambda x: x.from_code == self.lang_in - and x.to_code == self.lang_out, - available_packages, - ) - )[0] - except Exception: - raise ValueError( - "lang_in and lang_out pair not supported by Argos Translate." - ) - download_path = available_package.download() - argostranslate.package.install_from_path(download_path) - - def translate(self, text): - # Translate - installed_languages = argostranslate.translate.get_installed_languages() - from_lang = list(filter(lambda x: x.code == self.lang_in, installed_languages))[ - 0 - ] - to_lang = list(filter(lambda x: x.code == self.lang_out, installed_languages))[ - 0 - ] - translation = from_lang.get_translation(to_lang) - translatedText = translation.translate(text) - return translatedText - - -class GorkTranslator(OpenAITranslator): - # https://docs.x.ai/docs/overview#getting-started - name = "grok" - envs = { - "GORK_API_KEY": None, - "GORK_MODEL": "grok-2-1212", - } - CustomPrompt = True - - def __init__(self, lang_in, lang_out, model, envs=None, prompt=None): - self.set_envs(envs) - base_url = "https://api.x.ai/v1" - api_key = self.envs["GORK_API_KEY"] - if not model: - model = self.envs["GORK_MODEL"] - super().__init__(lang_in, lang_out, model, base_url=base_url, api_key=api_key) - self.prompttext = prompt - if prompt: - self.add_cache_impact_parameters("prompt", prompt.template) - - -class GroqTranslator(OpenAITranslator): - name = "groq" - envs = { - "GROQ_API_KEY": None, - "GROQ_MODEL": "llama-3-3-70b-versatile", - } - CustomPrompt = True - - def __init__(self, lang_in, lang_out, model, envs=None, prompt=None): - 
self.set_envs(envs) - base_url = "https://api.groq.com/openai/v1" - api_key = self.envs["GROQ_API_KEY"] - if not model: - model = self.envs["GROQ_MODEL"] - super().__init__(lang_in, lang_out, model, base_url=base_url, api_key=api_key) - self.prompttext = prompt - if prompt: - self.add_cache_impact_parameters("prompt", prompt.template) - - -class DeepseekTranslator(OpenAITranslator): - name = "deepseek" - envs = { - "DEEPSEEK_API_KEY": None, - "DEEPSEEK_MODEL": "deepseek-chat", - } - CustomPrompt = True - - def __init__(self, lang_in, lang_out, model, envs=None, prompt=None): - self.set_envs(envs) - base_url = "https://api.deepseek.com/v1" - api_key = self.envs["DEEPSEEK_API_KEY"] - if not model: - model = self.envs["DEEPSEEK_MODEL"] - super().__init__(lang_in, lang_out, model, base_url=base_url, api_key=api_key) - self.prompttext = prompt - if prompt: - self.add_cache_impact_parameters("prompt", prompt.template) - - -class OpenAIlikedTranslator(OpenAITranslator): - name = "openailiked" - envs = { - "OPENAILIKED_BASE_URL": None, - "OPENAILIKED_API_KEY": None, - "OPENAILIKED_MODEL": None, - } - CustomPrompt = True - - def __init__(self, lang_in, lang_out, model, envs=None, prompt=None): - self.set_envs(envs) - if self.envs["OPENAILIKED_BASE_URL"]: - base_url = self.envs["OPENAILIKED_BASE_URL"] - else: - raise ValueError("The OPENAILIKED_BASE_URL is missing.") - if not model: - if self.envs["OPENAILIKED_MODEL"]: - model = self.envs["OPENAILIKED_MODEL"] - else: - raise ValueError("The OPENAILIKED_MODEL is missing.") - if self.envs["OPENAILIKED_API_KEY"] is None: - api_key = "openailiked" - else: - api_key = self.envs["OPENAILIKED_API_KEY"] - super().__init__(lang_in, lang_out, model, base_url=base_url, api_key=api_key) - self.prompttext = prompt - if prompt: - self.add_cache_impact_parameters("prompt", prompt.template) +import html +import logging +import os +import re +import unicodedata +from copy import copy +import deepl +import ollama +import openai +import 
xinference_client +import requests +from pdf2zh.cache import TranslationCache +from azure.ai.translation.text import TextTranslationClient +from azure.core.credentials import AzureKeyCredential +from tencentcloud.common import credential +from tencentcloud.tmt.v20180321.tmt_client import TmtClient +from tencentcloud.tmt.v20180321.models import TextTranslateRequest +from tencentcloud.tmt.v20180321.models import TextTranslateResponse +import argostranslate.package +import argostranslate.translate + +import json +from pdf2zh.config import ConfigManager + + +def remove_control_characters(s): + return "".join(ch for ch in s if unicodedata.category(ch)[0] != "C") + + +class BaseTranslator: + name = "base" + envs = {} + lang_map = {} + CustomPrompt = False + ignore_cache = False + + def __init__(self, lang_in, lang_out, model): + lang_in = self.lang_map.get(lang_in.lower(), lang_in) + lang_out = self.lang_map.get(lang_out.lower(), lang_out) + self.lang_in = lang_in + self.lang_out = lang_out + self.model = model + + self.cache = TranslationCache( + self.name, + { + "lang_in": lang_in, + "lang_out": lang_out, + "model": model, + }, + ) + + def set_envs(self, envs): + # Detach from self.__class__.envs + # Cannot use self.envs = copy(self.__class__.envs) + # because if set_envs called twice, the second call will override the first call + self.envs = copy(self.envs) + if ConfigManager.get_translator_by_name(self.name): + self.envs = ConfigManager.get_translator_by_name(self.name) + needUpdate = False + for key in self.envs: + if key in os.environ: + self.envs[key] = os.environ[key] + needUpdate = True + if needUpdate: + ConfigManager.set_translator_by_name(self.name, self.envs) + if envs is not None: + for key in envs: + self.envs[key] = envs[key] + ConfigManager.set_translator_by_name(self.name, self.envs) + + def add_cache_impact_parameters(self, k: str, v): + """ + Add parameters that affect the translation quality to distinguish the translation effects under different 
parameters. + :param k: key + :param v: value + """ + self.cache.add_params(k, v) + + def translate(self, text, ignore_cache=False): + """ + Translate the text, and the other part should call this method. + :param text: text to translate + :return: translated text + """ + if not (self.ignore_cache or ignore_cache): + cache = self.cache.get(text) + if cache is not None: + return cache + + translation = self.do_translate(text) + self.cache.set(text, translation) + return translation + + def do_translate(self, text): + """ + Actual translate text, override this method + :param text: text to translate + :return: translated text + """ + raise NotImplementedError + + def prompt(self, text, prompt): + if prompt: + context = { + "lang_in": self.lang_in, + "lang_out": self.lang_out, + "text": text, + } + return eval(prompt.safe_substitute(context)) + else: + return [ + { + "role": "system", + "content": "You are a professional,authentic machine translation engine.", + }, + { + "role": "user", + "content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation {{v*}} unchanged. 
Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:", # noqa: E501 + }, + ] + + def __str__(self): + return f"{self.name} {self.lang_in} {self.lang_out} {self.model}" + + +class GoogleTranslator(BaseTranslator): + name = "google" + lang_map = {"zh": "zh-CN"} + + def __init__(self, lang_in, lang_out, model, **kwargs): + super().__init__(lang_in, lang_out, model) + self.session = requests.Session() + self.endpoint = "http://translate.google.com/m" + self.headers = { + "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)" # noqa: E501 + } + + def do_translate(self, text): + text = text[:5000] # google translate max length + response = self.session.get( + self.endpoint, + params={"tl": self.lang_out, "sl": self.lang_in, "q": text}, + headers=self.headers, + ) + re_result = re.findall( + r'(?s)class="(?:t0|result-container)">(.*?)<', response.text + ) + if response.status_code == 400: + result = "IRREPARABLE TRANSLATION ERROR" + else: + response.raise_for_status() + result = html.unescape(re_result[0]) + return remove_control_characters(result) + + +class BingTranslator(BaseTranslator): + # https://github.com/immersive-translate/old-immersive-translate/blob/6df13da22664bea2f51efe5db64c63aca59c4e79/src/background/translationService.js + name = "bing" + lang_map = {"zh": "zh-Hans"} + + def __init__(self, lang_in, lang_out, model, **kwargs): + super().__init__(lang_in, lang_out, model) + self.session = requests.Session() + self.endpoint = "https://www.bing.com/translator" + self.headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0", # noqa: E501 + } + + def find_sid(self): + response = self.session.get(self.endpoint) + response.raise_for_status() + url = response.url[:-10] + ig = re.findall(r"\"ig\":\"(.*?)\"", response.text)[0] + iid = 
re.findall(r"data-iid=\"(.*?)\"", response.text)[-1] + key, token = re.findall( + r"params_AbusePreventionHelper\s=\s\[(.*?),\"(.*?)\",", response.text + )[0] + return url, ig, iid, key, token + + def do_translate(self, text): + text = text[:1000] # bing translate max length + url, ig, iid, key, token = self.find_sid() + response = self.session.post( + f"{url}ttranslatev3?IG={ig}&IID={iid}", + data={ + "fromLang": self.lang_in, + "to": self.lang_out, + "text": text, + "token": token, + "key": key, + }, + headers=self.headers, + ) + response.raise_for_status() + return response.json()[0]["translations"][0]["text"] + + +class DeepLTranslator(BaseTranslator): + # https://github.com/DeepLcom/deepl-python + name = "deepl" + envs = { + "DEEPL_AUTH_KEY": None, + } + lang_map = {"zh": "zh-Hans"} + + def __init__(self, lang_in, lang_out, model, envs=None, **kwargs): + self.set_envs(envs) + super().__init__(lang_in, lang_out, model) + auth_key = self.envs["DEEPL_AUTH_KEY"] + self.client = deepl.Translator(auth_key) + + def do_translate(self, text): + response = self.client.translate_text( + text, target_lang=self.lang_out, source_lang=self.lang_in + ) + return response.text + + +class DeepLXTranslator(BaseTranslator): + # https://deeplx.owo.network/endpoints/free.html + name = "deeplx" + envs = { + "DEEPLX_ENDPOINT": "https://api.deepl.com/translate", + "DEEPLX_ACCESS_TOKEN": None, + } + lang_map = {"zh": "zh-Hans"} + + def __init__(self, lang_in, lang_out, model, envs=None, **kwargs): + self.set_envs(envs) + super().__init__(lang_in, lang_out, model) + self.endpoint = self.envs["DEEPLX_ENDPOINT"] + self.session = requests.Session() + auth_key = self.envs["DEEPLX_ACCESS_TOKEN"] + if auth_key: + self.endpoint = f"{self.endpoint}?token={auth_key}" + + def do_translate(self, text): + response = self.session.post( + self.endpoint, + json={ + "source_lang": self.lang_in, + "target_lang": self.lang_out, + "text": text, + }, + ) + response.raise_for_status() + return 
response.json()["data"] + + +class OllamaTranslator(BaseTranslator): + # https://github.com/ollama/ollama-python + name = "ollama" + envs = { + "OLLAMA_HOST": "http://127.0.0.1:11434", + "OLLAMA_MODEL": "gemma2", + } + CustomPrompt = True + + def __init__(self, lang_in, lang_out, model, envs=None, prompt=None): + self.set_envs(envs) + if not model: + model = self.envs["OLLAMA_MODEL"] + super().__init__(lang_in, lang_out, model) + self.options = {"temperature": 0} # 随机采样可能会打断公式标记 + self.client = ollama.Client() + self.prompttext = prompt + self.add_cache_impact_parameters("temperature", self.options["temperature"]) + if prompt: + self.add_cache_impact_parameters("prompt", prompt.template) + + def do_translate(self, text): + maxlen = max(2000, len(text) * 5) + for model in self.model.split(";"): + try: + response = "" + stream = self.client.chat( + model=model, + options=self.options, + messages=self.prompt(text, self.prompttext), + stream=True, + ) + for chunk in stream: + chunk = chunk["message"]["content"] + response += chunk + if len(response) > maxlen: + raise Exception("Response too long") + return response.strip() + except Exception as e: + print(e) + raise Exception("All models failed") + + +class XinferenceTranslator(BaseTranslator): + # https://github.com/xorbitsai/inference + name = "xinference" + envs = { + "XINFERENCE_HOST": "http://127.0.0.1:9997", + "XINFERENCE_MODEL": "gemma-2-it", + } + CustomPrompt = True + + def __init__(self, lang_in, lang_out, model, envs=None, prompt=None): + self.set_envs(envs) + if not model: + model = self.envs["XINFERENCE_MODEL"] + super().__init__(lang_in, lang_out, model) + self.options = {"temperature": 0} # 随机采样可能会打断公式标记 + self.client = xinference_client.RESTfulClient(self.envs["XINFERENCE_HOST"]) + self.prompttext = prompt + self.add_cache_impact_parameters("temperature", self.options["temperature"]) + if prompt: + self.add_cache_impact_parameters("prompt", prompt.template) + + def do_translate(self, text): + maxlen = 
max(2000, len(text) * 5) + for model in self.model.split(";"): + try: + xf_model = self.client.get_model(model) + xf_prompt = self.prompt(text, self.prompttext) + xf_prompt = [ + { + "role": "user", + "content": xf_prompt[0]["content"] + + "\n" + + xf_prompt[1]["content"], + } + ] + response = xf_model.chat( + generate_config=self.options, + messages=xf_prompt, + ) + + response = response["choices"][0]["message"]["content"].replace( + "", "" + ) + if len(response) > maxlen: + raise Exception("Response too long") + return response.strip() + except Exception as e: + print(e) + raise Exception("All models failed") + + +class OpenAITranslator(BaseTranslator): + # https://github.com/openai/openai-python + name = "openai" + envs = { + "OPENAI_BASE_URL": "https://api.openai.com/v1", + "OPENAI_API_KEY": None, + "OPENAI_MODEL": "gpt-4o-mini", + } + CustomPrompt = True + + def __init__( + self, + lang_in, + lang_out, + model, + base_url=None, + api_key=None, + envs=None, + prompt=None, + ): + self.set_envs(envs) + if not model: + model = self.envs["OPENAI_MODEL"] + super().__init__(lang_in, lang_out, model) + self.options = {"temperature": 0} # 随机采样可能会打断公式标记 + self.client = openai.OpenAI( + base_url=base_url or self.envs["OPENAI_BASE_URL"], + api_key=api_key or self.envs["OPENAI_API_KEY"], + ) + self.prompttext = prompt + self.add_cache_impact_parameters("temperature", self.options["temperature"]) + if prompt: + self.add_cache_impact_parameters("prompt", prompt.template) + + def do_translate(self, text) -> str: + response = self.client.chat.completions.create( + model=self.model, + **self.options, + messages=self.prompt(text, self.prompttext), + ) + return response.choices[0].message.content.strip() + + +class AzureOpenAITranslator(BaseTranslator): + name = "azure-openai" + envs = { + "AZURE_OPENAI_BASE_URL": None, # e.g. 
"https://xxx.openai.azure.com" + "AZURE_OPENAI_API_KEY": None, + "AZURE_OPENAI_MODEL": "gpt-4o-mini", + } + CustomPrompt = True + + def __init__( + self, + lang_in, + lang_out, + model, + base_url=None, + api_key=None, + envs=None, + prompt=None, + ): + self.set_envs(envs) + base_url = self.envs["AZURE_OPENAI_BASE_URL"] + if not model: + model = self.envs["AZURE_OPENAI_MODEL"] + super().__init__(lang_in, lang_out, model) + self.options = {"temperature": 0} + self.client = openai.AzureOpenAI( + azure_endpoint=base_url, + azure_deployment=model, + api_version="2024-06-01", + api_key=api_key, + ) + self.prompttext = prompt + self.add_cache_impact_parameters("temperature", self.options["temperature"]) + if prompt: + self.add_cache_impact_parameters("prompt", prompt.template) + + def do_translate(self, text) -> str: + response = self.client.chat.completions.create( + model=self.model, + **self.options, + messages=self.prompt(text, self.prompttext), + ) + return response.choices[0].message.content.strip() + + +class ModelScopeTranslator(OpenAITranslator): + name = "modelscope" + envs = { + "MODELSCOPE_BASE_URL": "https://api-inference.modelscope.cn/v1", + "MODELSCOPE_API_KEY": None, + "MODELSCOPE_MODEL": "Qwen/Qwen2.5-32B-Instruct", + } + CustomPrompt = True + + def __init__( + self, + lang_in, + lang_out, + model, + base_url=None, + api_key=None, + envs=None, + prompt=None, + ): + self.set_envs(envs) + base_url = "https://api-inference.modelscope.cn/v1" + api_key = self.envs["MODELSCOPE_API_KEY"] + if not model: + model = self.envs["MODELSCOPE_MODEL"] + super().__init__(lang_in, lang_out, model, base_url=base_url, api_key=api_key) + self.prompttext = prompt + if prompt: + self.add_cache_impact_parameters("prompt", prompt.template) + + +class ZhipuTranslator(OpenAITranslator): + # https://bigmodel.cn/dev/api/thirdparty-frame/openai-sdk + name = "zhipu" + envs = { + "ZHIPU_API_KEY": None, + "ZHIPU_MODEL": "glm-4-flash", + } + CustomPrompt = True + + def __init__(self, 
lang_in, lang_out, model, envs=None, prompt=None): + self.set_envs(envs) + base_url = "https://open.bigmodel.cn/api/paas/v4" + api_key = self.envs["ZHIPU_API_KEY"] + if not model: + model = self.envs["ZHIPU_MODEL"] + super().__init__(lang_in, lang_out, model, base_url=base_url, api_key=api_key) + self.prompttext = prompt + if prompt: + self.add_cache_impact_parameters("prompt", prompt.template) + + def do_translate(self, text) -> str: + try: + response = self.client.chat.completions.create( + model=self.model, + **self.options, + messages=self.prompt(text, self.prompttext), + ) + except openai.BadRequestError as e: + if ( + json.loads(response.choices[0].message.content.strip())["error"]["code"] + == "1301" + ): + return "IRREPARABLE TRANSLATION ERROR" + raise e + return response.choices[0].message.content.strip() + + +class SiliconTranslator(OpenAITranslator): + # https://docs.siliconflow.cn/quickstart + name = "silicon" + envs = { + "SILICON_API_KEY": None, + "SILICON_MODEL": "Qwen/Qwen2.5-7B-Instruct", + } + CustomPrompt = True + + def __init__(self, lang_in, lang_out, model, envs=None, prompt=None): + self.set_envs(envs) + base_url = "https://api.siliconflow.cn/v1" + api_key = self.envs["SILICON_API_KEY"] + if not model: + model = self.envs["SILICON_MODEL"] + super().__init__(lang_in, lang_out, model, base_url=base_url, api_key=api_key) + self.prompttext = prompt + if prompt: + self.add_cache_impact_parameters("prompt", prompt.template) + + +class GeminiTranslator(OpenAITranslator): + # https://ai.google.dev/gemini-api/docs/openai + name = "gemini" + envs = { + "GEMINI_API_KEY": None, + "GEMINI_MODEL": "gemini-1.5-flash", + } + CustomPrompt = True + + def __init__(self, lang_in, lang_out, model, envs=None, prompt=None): + self.set_envs(envs) + base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" + api_key = self.envs["GEMINI_API_KEY"] + if not model: + model = self.envs["GEMINI_MODEL"] + super().__init__(lang_in, lang_out, model, 
base_url=base_url, api_key=api_key) + self.prompttext = prompt + if prompt: + self.add_cache_impact_parameters("prompt", prompt.template) + + +class AzureTranslator(BaseTranslator): + # https://github.com/Azure/azure-sdk-for-python + name = "azure" + envs = { + "AZURE_ENDPOINT": "https://api.translator.azure.cn", + "AZURE_API_KEY": None, + } + lang_map = {"zh": "zh-Hans"} + + def __init__(self, lang_in, lang_out, model, envs=None, **kwargs): + self.set_envs(envs) + super().__init__(lang_in, lang_out, model) + endpoint = self.envs["AZURE_ENDPOINT"] + api_key = self.envs["AZURE_API_KEY"] + credential = AzureKeyCredential(api_key) + self.client = TextTranslationClient( + endpoint=endpoint, credential=credential, region="chinaeast2" + ) + # https://github.com/Azure/azure-sdk-for-python/issues/9422 + logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy") + logger.setLevel(logging.WARNING) + + def do_translate(self, text) -> str: + response = self.client.translate( + body=[text], + from_language=self.lang_in, + to_language=[self.lang_out], + ) + translated_text = response[0].translations[0].text + return translated_text + + +class TencentTranslator(BaseTranslator): + # https://github.com/TencentCloud/tencentcloud-sdk-python + name = "tencent" + envs = { + "TENCENTCLOUD_SECRET_ID": None, + "TENCENTCLOUD_SECRET_KEY": None, + } + + def __init__(self, lang_in, lang_out, model, envs=None, **kwargs): + self.set_envs(envs) + super().__init__(lang_in, lang_out, model) + cred = credential.DefaultCredentialProvider().get_credential() + self.client = TmtClient(cred, "ap-beijing") + self.req = TextTranslateRequest() + self.req.Source = self.lang_in + self.req.Target = self.lang_out + self.req.ProjectId = 0 + + def do_translate(self, text): + self.req.SourceText = text + resp: TextTranslateResponse = self.client.TextTranslate(self.req) + return resp.TargetText + + +class AnythingLLMTranslator(BaseTranslator): + name = "anythingllm" + envs = { + 
"AnythingLLM_URL": None, + "AnythingLLM_APIKEY": None, + } + CustomPrompt = True + + def __init__(self, lang_out, lang_in, model, envs=None, prompt=None): + self.set_envs(envs) + super().__init__(lang_out, lang_in, model) + self.api_url = self.envs["AnythingLLM_URL"] + self.api_key = self.envs["AnythingLLM_APIKEY"] + self.headers = { + "accept": "application/json", + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + } + self.prompttext = prompt + if prompt: + self.add_cache_impact_parameters("prompt", prompt.template) + + def do_translate(self, text): + messages = self.prompt(text, self.prompttext) + payload = { + "message": messages, + "mode": "chat", + "sessionId": "translation_expert", + } + + response = requests.post( + self.api_url, headers=self.headers, data=json.dumps(payload) + ) + response.raise_for_status() + data = response.json() + + if "textResponse" in data: + return data["textResponse"].strip() + + +class DifyTranslator(BaseTranslator): + name = "dify" + envs = { + "DIFY_API_URL": None, # 填写实际 Dify API 地址 + "DIFY_API_KEY": None, # 替换为实际 API 密钥 + } + + def __init__(self, lang_out, lang_in, model, envs=None, **kwargs): + self.set_envs(envs) + super().__init__(lang_out, lang_in, model) + self.api_url = self.envs["DIFY_API_URL"] + self.api_key = self.envs["DIFY_API_KEY"] + + def do_translate(self, text): + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + } + + payload = { + "inputs": { + "lang_out": self.lang_out, + "lang_in": self.lang_in, + "text": text, + }, + "response_mode": "blocking", + "user": "translator-service", + } + + # 向 Dify 服务器发送请求 + response = requests.post( + self.api_url, headers=headers, data=json.dumps(payload) + ) + response.raise_for_status() + response_data = response.json() + + # 解析响应 + return response_data.get("data", {}).get("outputs", {}).get("text", []) + + +class ArgosTranslator(BaseTranslator): + name = "argos" + + def __init__(self, lang_in, 
lang_out, model, **kwargs): + super().__init__(lang_in, lang_out, model) + lang_in = self.lang_map.get(lang_in.lower(), lang_in) + lang_out = self.lang_map.get(lang_out.lower(), lang_out) + self.lang_in = lang_in + self.lang_out = lang_out + argostranslate.package.update_package_index() + available_packages = argostranslate.package.get_available_packages() + try: + available_package = list( + filter( + lambda x: x.from_code == self.lang_in + and x.to_code == self.lang_out, + available_packages, + ) + )[0] + except Exception: + raise ValueError( + "lang_in and lang_out pair not supported by Argos Translate." + ) + download_path = available_package.download() + argostranslate.package.install_from_path(download_path) + + def translate(self, text): + # Translate + installed_languages = argostranslate.translate.get_installed_languages() + from_lang = list(filter(lambda x: x.code == self.lang_in, installed_languages))[ + 0 + ] + to_lang = list(filter(lambda x: x.code == self.lang_out, installed_languages))[ + 0 + ] + translation = from_lang.get_translation(to_lang) + translatedText = translation.translate(text) + return translatedText + + +class GorkTranslator(OpenAITranslator): + # https://docs.x.ai/docs/overview#getting-started + name = "grok" + envs = { + "GORK_API_KEY": None, + "GORK_MODEL": "grok-2-1212", + } + CustomPrompt = True + + def __init__(self, lang_in, lang_out, model, envs=None, prompt=None): + self.set_envs(envs) + base_url = "https://api.x.ai/v1" + api_key = self.envs["GORK_API_KEY"] + if not model: + model = self.envs["GORK_MODEL"] + super().__init__(lang_in, lang_out, model, base_url=base_url, api_key=api_key) + self.prompttext = prompt + if prompt: + self.add_cache_impact_parameters("prompt", prompt.template) + + +class GroqTranslator(OpenAITranslator): + name = "groq" + envs = { + "GROQ_API_KEY": None, + "GROQ_MODEL": "llama-3-3-70b-versatile", + } + CustomPrompt = True + + def __init__(self, lang_in, lang_out, model, envs=None, prompt=None): + 
self.set_envs(envs) + base_url = "https://api.groq.com/openai/v1" + api_key = self.envs["GROQ_API_KEY"] + if not model: + model = self.envs["GROQ_MODEL"] + super().__init__(lang_in, lang_out, model, base_url=base_url, api_key=api_key) + self.prompttext = prompt + if prompt: + self.add_cache_impact_parameters("prompt", prompt.template) + + +class DeepseekTranslator(OpenAITranslator): + name = "deepseek" + envs = { + "DEEPSEEK_API_KEY": None, + "DEEPSEEK_MODEL": "deepseek-chat", + } + CustomPrompt = True + + def __init__(self, lang_in, lang_out, model, envs=None, prompt=None): + self.set_envs(envs) + base_url = "https://api.deepseek.com/v1" + api_key = self.envs["DEEPSEEK_API_KEY"] + if not model: + model = self.envs["DEEPSEEK_MODEL"] + super().__init__(lang_in, lang_out, model, base_url=base_url, api_key=api_key) + self.prompttext = prompt + if prompt: + self.add_cache_impact_parameters("prompt", prompt.template) + + +class OpenAIlikedTranslator(OpenAITranslator): + name = "openailiked" + envs = { + "OPENAILIKED_BASE_URL": None, + "OPENAILIKED_API_KEY": None, + "OPENAILIKED_MODEL": None, + } + CustomPrompt = True + + def __init__(self, lang_in, lang_out, model, envs=None, prompt=None): + self.set_envs(envs) + if self.envs["OPENAILIKED_BASE_URL"]: + base_url = self.envs["OPENAILIKED_BASE_URL"] + else: + raise ValueError("The OPENAILIKED_BASE_URL is missing.") + if not model: + if self.envs["OPENAILIKED_MODEL"]: + model = self.envs["OPENAILIKED_MODEL"] + else: + raise ValueError("The OPENAILIKED_MODEL is missing.") + if self.envs["OPENAILIKED_API_KEY"] is None: + api_key = "openailiked" + else: + api_key = self.envs["OPENAILIKED_API_KEY"] + super().__init__(lang_in, lang_out, model, base_url=base_url, api_key=api_key) + self.prompttext = prompt + if prompt: + self.add_cache_impact_parameters("prompt", prompt.template) diff --git a/pyproject.toml b/pyproject.toml index 07d837a9..b4b5976b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,64 +1,64 @@ -[project] 
-name = "pdf2zh" -version = "1.8.8" -description = "Latex PDF Translator" -authors = [{ name = "Byaidu", email = "byaidux@gmail.com" }] -license = "AGPL-3.0" -readme = "README.md" -requires-python = ">=3.9,<3.13" -classifiers = [ - "Programming Language :: Python :: 3", - "Operating System :: OS Independent", -] -dependencies = [ - "requests", - "pymupdf", - "tqdm", - "tenacity", - "numpy", - "ollama", - "xinference-client", - "deepl", - "openai", - "azure-ai-translation-text<=1.0.1", - "gradio", - "huggingface_hub", - "onnx", - "onnxruntime", - "opencv-python-headless", - "tencentcloud-sdk-python", - "pdfminer.six>=20240706", - "gradio_pdf>=0.0.21", - "pikepdf", - "peewee>=3.17.8", - "argostranslate", - "fontTools" -] - -[project.optional-dependencies] -dev = [ - "black", - "flake8", - "pre-commit", - "pytest", - "build" -] -backend = [ - "flask", - "celery", - "redis" -] - -[project.urls] -Homepage = "https://github.com/Byaidu/PDFMathTranslate" - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[project.scripts] -pdf2zh = "pdf2zh.pdf2zh:main" - -[tool.flake8] -ignore = ["E203", "E261", "E501", "W503", "E741"] -max-line-length = 88 +[project] +name = "pdf2zh" +version = "1.8.8" +description = "Latex PDF Translator" +authors = [{ name = "Byaidu", email = "byaidux@gmail.com" }] +license = "AGPL-3.0" +readme = "README.md" +requires-python = ">=3.9,<3.13" +classifiers = [ + "Programming Language :: Python :: 3", + "Operating System :: OS Independent", +] +dependencies = [ + "requests", + "pymupdf", + "tqdm", + "tenacity", + "numpy", + "ollama", + "xinference-client", + "deepl", + "openai", + "azure-ai-translation-text<=1.0.1", + "gradio", + "huggingface_hub", + "onnx", + "onnxruntime", + "opencv-python-headless", + "tencentcloud-sdk-python", + "pdfminer.six>=20240706", + "gradio_pdf>=0.0.21", + "pikepdf", + "peewee>=3.17.8", + "argostranslate", + "fontTools" +] + +[project.optional-dependencies] +dev = [ + "black", + "flake8", + 
"pre-commit", + "pytest", + "build" +] +backend = [ + "flask", + "celery", + "redis" +] + +[project.urls] +Homepage = "https://github.com/Byaidu/PDFMathTranslate" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project.scripts] +pdf2zh = "pdf2zh.pdf2zh:main" + +[tool.flake8] +ignore = ["E203", "E261", "E501", "W503", "E741"] +max-line-length = 88 diff --git a/script/setup.bat b/script/setup.bat index 5a22e425..3f86d25f 100644 --- a/script/setup.bat +++ b/script/setup.bat @@ -1,31 +1,26 @@ -@echo off -setlocal enabledelayedexpansion - -set PYTHON_URL=https://www.python.org/ftp/python/3.12.7/python-3.12.7-embed-amd64.zip -set PIP_URL=https://bootstrap.pypa.io/get-pip.py -set HF_ENDPOINT=https://hf-mirror.com -set PIP_MIRROR=https://mirrors.aliyun.com/pypi/simple - -if not exist pdf2zh_dist/python.exe ( - powershell -Command "& {Invoke-WebRequest -Uri !PYTHON_URL! -OutFile python.zip}" - powershell -Command "& {Expand-Archive -Path python.zip -DestinationPath pdf2zh_dist -Force}" - del python.zip - echo import site >> pdf2zh_dist/python312._pth -) -cd pdf2zh_dist - -if not exist Scripts/pip.exe ( - powershell -Command "& {Invoke-WebRequest -Uri !PIP_URL! -OutFile get-pip.py}" - python get-pip.py -) -path Scripts - -pip install --no-warn-script-location --upgrade pdf2zh -i !PIP_MIRROR! -pdf2zh --dry_run - -set "_root=%~dp0" -set "_root=%_root:~0,-1%" -cd "%_root%" -echo "%_root%" - -start "PDFMathTranslate" "%_root%\pdf2zh.exe" +@echo off +setlocal enabledelayedexpansion + +set PYTHON_URL=https://www.python.org/ftp/python/3.12.7/python-3.12.7-embed-amd64.zip +set PIP_URL=https://bootstrap.pypa.io/get-pip.py +set HF_ENDPOINT=https://hf-mirror.com +set PIP_MIRROR=https://mirrors.aliyun.com/pypi/simple + +if not exist pdf2zh_dist/python.exe ( + powershell -Command "& {Invoke-WebRequest -Uri !PYTHON_URL! 
-OutFile python.zip}" + powershell -Command "& {Expand-Archive -Path python.zip -DestinationPath pdf2zh_dist -Force}" + del python.zip + echo import site >> pdf2zh_dist/python312._pth +) +cd pdf2zh_dist + +if not exist Scripts/pip.exe ( + powershell -Command "& {Invoke-WebRequest -Uri !PIP_URL! -OutFile get-pip.py}" + python get-pip.py +) +path Scripts + +pip install --no-warn-script-location --upgrade pdf2zh -i !PIP_MIRROR! +pdf2zh -i + +pause diff --git a/test/test_translator.py b/test/test_translator.py index f50e38a9..874993c5 100644 --- a/test/test_translator.py +++ b/test/test_translator.py @@ -1,148 +1,148 @@ -import unittest -from pdf2zh.translator import BaseTranslator -from pdf2zh.translator import OpenAIlikedTranslator -from pdf2zh import cache -from pdf2zh.config import ConfigManager - - -class AutoIncreaseTranslator(BaseTranslator): - name = "auto_increase" - n = 0 - - def do_translate(self, text): - self.n += 1 - return str(self.n) - - -class TestTranslator(unittest.TestCase): - def setUp(self): - self.test_db = cache.init_test_db() - - def tearDown(self): - cache.clean_test_db(self.test_db) - - def test_cache(self): - translator = AutoIncreaseTranslator("en", "zh", "test") - # First translation should be cached - text = "Hello World" - first_result = translator.translate(text) - - # Second translation should return the same result from cache - second_result = translator.translate(text) - self.assertEqual(first_result, second_result) - - # Different input should give different result - different_text = "Different Text" - different_result = translator.translate(different_text) - self.assertNotEqual(first_result, different_result) - - # Test cache with ignore_cache=True - translator.ignore_cache = True - no_cache_result = translator.translate(text) - self.assertNotEqual(first_result, no_cache_result) - - def test_add_cache_impact_parameters(self): - translator = AutoIncreaseTranslator("en", "zh", "test") - - # Test cache with added parameters - text = 
"Hello World" - first_result = translator.translate(text) - translator.add_cache_impact_parameters("test", "value") - second_result = translator.translate(text) - self.assertNotEqual(first_result, second_result) - - # Test cache with ignore_cache=True - no_cache_result1 = translator.translate(text, ignore_cache=True) - self.assertNotEqual(first_result, no_cache_result1) - - translator.ignore_cache = True - no_cache_result2 = translator.translate(text) - self.assertNotEqual(no_cache_result1, no_cache_result2) - - # Test cache with ignore_cache=False - translator.ignore_cache = False - cache_result = translator.translate(text) - self.assertEqual(no_cache_result2, cache_result) - - # Test cache with another parameter - translator.add_cache_impact_parameters("test2", "value2") - another_result = translator.translate(text) - self.assertNotEqual(second_result, another_result) - - def test_base_translator_throw(self): - translator = BaseTranslator("en", "zh", "test") - with self.assertRaises(NotImplementedError): - translator.translate("Hello World") - - -class TestOpenAIlikedTranslator(unittest.TestCase): - def setUp(self) -> None: - self.default_envs = { - "OPENAILIKED_BASE_URL": "https://api.openailiked.com", - "OPENAILIKED_API_KEY": "test_api_key", - "OPENAILIKED_MODEL": "test_model", - } - - def test_missing_base_url_raises_error(self): - """测试缺失 OPENAILIKED_BASE_URL 时抛出异常""" - ConfigManager.clear() - with self.assertRaises(ValueError) as context: - OpenAIlikedTranslator( - lang_in="en", lang_out="zh", model="test_model", envs={} - ) - self.assertIn("The OPENAILIKED_BASE_URL is missing.", str(context.exception)) - - def test_missing_model_raises_error(self): - """测试缺失 OPENAILIKED_MODEL 时抛出异常""" - envs_without_model = { - "OPENAILIKED_BASE_URL": "https://api.openailiked.com", - "OPENAILIKED_API_KEY": "test_api_key", - } - ConfigManager.clear() - with self.assertRaises(ValueError) as context: - OpenAIlikedTranslator( - lang_in="en", lang_out="zh", model=None, 
envs=envs_without_model - ) - self.assertIn("The OPENAILIKED_MODEL is missing.", str(context.exception)) - - def test_initialization_with_valid_envs(self): - """测试使用有效的环境变量初始化""" - ConfigManager.clear() - translator = OpenAIlikedTranslator( - lang_in="en", - lang_out="zh", - model=None, - envs=self.default_envs, - ) - self.assertEqual( - translator.envs["OPENAILIKED_BASE_URL"], - self.default_envs["OPENAILIKED_BASE_URL"], - ) - self.assertEqual( - translator.envs["OPENAILIKED_API_KEY"], - self.default_envs["OPENAILIKED_API_KEY"], - ) - self.assertEqual(translator.model, self.default_envs["OPENAILIKED_MODEL"]) - - def test_default_api_key_fallback(self): - """测试当 OPENAILIKED_API_KEY 为空时使用默认值""" - envs_without_key = { - "OPENAILIKED_BASE_URL": "https://api.openailiked.com", - "OPENAILIKED_MODEL": "test_model", - } - ConfigManager.clear() - translator = OpenAIlikedTranslator( - lang_in="en", - lang_out="zh", - model=None, - envs=envs_without_key, - ) - self.assertEqual( - translator.envs["OPENAILIKED_BASE_URL"], - self.default_envs["OPENAILIKED_BASE_URL"], - ) - self.assertEqual(translator.envs["OPENAILIKED_API_KEY"], None) - - -if __name__ == "__main__": - unittest.main() +import unittest +from pdf2zh.translator import BaseTranslator +from pdf2zh.translator import OpenAIlikedTranslator +from pdf2zh import cache +from pdf2zh.config import ConfigManager + + +class AutoIncreaseTranslator(BaseTranslator): + name = "auto_increase" + n = 0 + + def do_translate(self, text): + self.n += 1 + return str(self.n) + + +class TestTranslator(unittest.TestCase): + def setUp(self): + self.test_db = cache.init_test_db() + + def tearDown(self): + cache.clean_test_db(self.test_db) + + def test_cache(self): + translator = AutoIncreaseTranslator("en", "zh", "test") + # First translation should be cached + text = "Hello World" + first_result = translator.translate(text) + + # Second translation should return the same result from cache + second_result = translator.translate(text) + 
self.assertEqual(first_result, second_result) + + # Different input should give different result + different_text = "Different Text" + different_result = translator.translate(different_text) + self.assertNotEqual(first_result, different_result) + + # Test cache with ignore_cache=True + translator.ignore_cache = True + no_cache_result = translator.translate(text) + self.assertNotEqual(first_result, no_cache_result) + + def test_add_cache_impact_parameters(self): + translator = AutoIncreaseTranslator("en", "zh", "test") + + # Test cache with added parameters + text = "Hello World" + first_result = translator.translate(text) + translator.add_cache_impact_parameters("test", "value") + second_result = translator.translate(text) + self.assertNotEqual(first_result, second_result) + + # Test cache with ignore_cache=True + no_cache_result1 = translator.translate(text, ignore_cache=True) + self.assertNotEqual(first_result, no_cache_result1) + + translator.ignore_cache = True + no_cache_result2 = translator.translate(text) + self.assertNotEqual(no_cache_result1, no_cache_result2) + + # Test cache with ignore_cache=False + translator.ignore_cache = False + cache_result = translator.translate(text) + self.assertEqual(no_cache_result2, cache_result) + + # Test cache with another parameter + translator.add_cache_impact_parameters("test2", "value2") + another_result = translator.translate(text) + self.assertNotEqual(second_result, another_result) + + def test_base_translator_throw(self): + translator = BaseTranslator("en", "zh", "test") + with self.assertRaises(NotImplementedError): + translator.translate("Hello World") + + +class TestOpenAIlikedTranslator(unittest.TestCase): + def setUp(self) -> None: + self.default_envs = { + "OPENAILIKED_BASE_URL": "https://api.openailiked.com", + "OPENAILIKED_API_KEY": "test_api_key", + "OPENAILIKED_MODEL": "test_model", + } + + def test_missing_base_url_raises_error(self): + """测试缺失 OPENAILIKED_BASE_URL 时抛出异常""" + ConfigManager.clear() + with 
self.assertRaises(ValueError) as context: + OpenAIlikedTranslator( + lang_in="en", lang_out="zh", model="test_model", envs={} + ) + self.assertIn("The OPENAILIKED_BASE_URL is missing.", str(context.exception)) + + def test_missing_model_raises_error(self): + """测试缺失 OPENAILIKED_MODEL 时抛出异常""" + envs_without_model = { + "OPENAILIKED_BASE_URL": "https://api.openailiked.com", + "OPENAILIKED_API_KEY": "test_api_key", + } + ConfigManager.clear() + with self.assertRaises(ValueError) as context: + OpenAIlikedTranslator( + lang_in="en", lang_out="zh", model=None, envs=envs_without_model + ) + self.assertIn("The OPENAILIKED_MODEL is missing.", str(context.exception)) + + def test_initialization_with_valid_envs(self): + """测试使用有效的环境变量初始化""" + ConfigManager.clear() + translator = OpenAIlikedTranslator( + lang_in="en", + lang_out="zh", + model=None, + envs=self.default_envs, + ) + self.assertEqual( + translator.envs["OPENAILIKED_BASE_URL"], + self.default_envs["OPENAILIKED_BASE_URL"], + ) + self.assertEqual( + translator.envs["OPENAILIKED_API_KEY"], + self.default_envs["OPENAILIKED_API_KEY"], + ) + self.assertEqual(translator.model, self.default_envs["OPENAILIKED_MODEL"]) + + def test_default_api_key_fallback(self): + """测试当 OPENAILIKED_API_KEY 为空时使用默认值""" + envs_without_key = { + "OPENAILIKED_BASE_URL": "https://api.openailiked.com", + "OPENAILIKED_MODEL": "test_model", + } + ConfigManager.clear() + translator = OpenAIlikedTranslator( + lang_in="en", + lang_out="zh", + model=None, + envs=envs_without_key, + ) + self.assertEqual( + translator.envs["OPENAILIKED_BASE_URL"], + self.default_envs["OPENAILIKED_BASE_URL"], + ) + self.assertEqual(translator.envs["OPENAILIKED_API_KEY"], None) + + +if __name__ == "__main__": + unittest.main()