diff --git a/.github/workflows/black.format.yml b/.github/workflows/black.format.yml new file mode 100644 index 00000000..661a0719 --- /dev/null +++ b/.github/workflows/black.format.yml @@ -0,0 +1,10 @@ +name: Format Code with Black + +on: [push, pull_request] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: psf/black@stable \ No newline at end of file diff --git a/.github/workflows/python-build.yml b/.github/workflows/python-build.yml index d4dd3c0c..a82fa28c 100644 --- a/.github/workflows/python-build.yml +++ b/.github/workflows/python-build.yml @@ -19,17 +19,17 @@ jobs: - name: Install dependencies run: | - python -m pip install --upgrade pip - pip install build flake8 black + pip3 install -U pip + pip3 install -e .[dev] - name: Test - Code format run: | black --check --diff --color pdf2zh/*.py flake8 --ignore E203,E261,E501,W503,E741 - - - name: Test - Local installation - run: - python -m pip install -e . + + - name: Test - Unit Test + run: | + pytest . - name: Test - Translate a PDF file with plain text only run: diff --git a/README.md b/README.md index 96549c30..492c1a45 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ Note that the computing resources of the demo are limited, so please avoid abusi For different use cases, we provide four distinct methods to use our program: -
+
1. Commandline 1. Python installed (3.8 <= version <= 3.12) @@ -158,9 +158,9 @@ For docker deployment on cloud service: The present program needs an AI model(`wybxc/DocLayout-YOLO-DocStructBench-onnx`) before working and some users are not able to download due to network issues. If you have a problem with downloading this model, we provide a workaround using the following environment variable: - ```shell - USE_MODELSCOPE=1 pdf2zh - ``` +```shell +set HF_ENDPOINT=https://hf-mirror.com +``` If the solution does not work to you / you encountered other issues, please refer to [frequently asked questions](https://github.com/Byaidu/PDFMathTranslate/wiki#-faq--%E5%B8%B8%E8%A7%81%E9%97%AE%E9%A2%98). diff --git a/docs/APIS.md b/docs/APIS.md index 8f9ee170..6ece971d 100644 --- a/docs/APIS.md +++ b/docs/APIS.md @@ -22,13 +22,16 @@ params = { 'lang_out': 'zh', 'service': 'google', 'thread': 4, - } +} +``` +Translate with files: +```python (file_mono, file_dual) = translate(files=['example.pdf'], **params)[0] - +``` +Translate with stream: +```python with open('example.pdf', 'rb') as f: - (stream_mono, stream_dual) = translate_stream(stream=f.read(), - **params) - + (stream_mono, stream_dual) = translate_stream(stream=f.read(), **params) ``` [⬆️ Back to top](#toc) @@ -39,7 +42,7 @@ with open('example.pdf', 'rb') as f: In a more flexible way, you can communicate with the program using HTTP protocols, if: -1. You have the backend installed & running +1. Install and run backend ```bash pip install pdf2zh[backend] @@ -49,7 +52,7 @@ In a more flexible way, you can communicate with the program using HTTP protocol 2. Using HTTP protocols as follows: - - Translate + - Submit translate task ```bash curl http://localhost:11008/v1/translate -F "file=@example.pdf" -F "data={\"lang_in\":\"en\",\"lang_out\":\"zh\",\"service\":\"google\",\"thread\":4}" @@ -70,19 +73,19 @@ In a more flexible way, you can communicate with the program using HTTP protocol {"state":"SUCCESS"} ``` - - Specifying output + - Save monolingual file ```bash curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a/mono --output example-mono.pdf ``` - - Specifying the output as a bilingual file + - Save bilingual file ```bash curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a/dual --output example-dual.pdf ``` - - Or delete it after the whole process + - Interrupt if running and delete the task ```bash curl http://localhost:11008/v1/translate/d9894125-2f4e-45ea-9d93-1a9068d2045a -X DELETE ``` diff --git a/docs/README_ja-JP.md b/docs/README_ja-JP.md index 8e36893b..d88912e3 100644 --- a/docs/README_ja-JP.md +++ b/docs/README_ja-JP.md @@ -77,7 +77,7 @@ pdf2zhの実行には追加モデル(`wybxc/DocLayout-YOLO-DocStructBench-onnx`)が必要です。このモデルはModelScopeでも見つけることができます。起動時にこのモデルのダウンロードに問題がある場合は、以下の環境変数を使用してください: ```shell -USE_MODELSCOPE=1 pdf2zh +set HF_ENDPOINT=https://hf-mirror.com ```

方法1. コマンドライン

diff --git a/docs/README_zh-CN.md b/docs/README_zh-CN.md index 0d7b23b8..42e273b2 100644 --- a/docs/README_zh-CN.md +++ b/docs/README_zh-CN.md @@ -76,7 +76,7 @@ pdf2zh的运行依赖于额外模型(`wybxc/DocLayout-YOLO-DocStructBench-onnx`),该模型在魔搭上也可以找到。如果你在启动时下载该模型遇到问题,请使用如下环境变量: ```shell -USE_MODELSCOPE=1 pdf2zh +set HF_ENDPOINT=https://hf-mirror.com ```

方法一、命令行工具

diff --git a/pdf2zh/high_level.py b/pdf2zh/high_level.py index 0e86e5cb..41d760d4 100644 --- a/pdf2zh/high_level.py +++ b/pdf2zh/high_level.py @@ -188,7 +188,7 @@ def translate_stream( elif lang_out.lower() in noto_list: # noto resfont = "noto" # docker - ttf_path = '/app/GoNotoKurrent-Regular.ttf' + ttf_path = "/app/GoNotoKurrent-Regular.ttf" if not os.path.exists(ttf_path): ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf") if not os.path.exists(ttf_path): @@ -297,7 +297,10 @@ def translate( doc_raw = open(file, "rb") s_raw = doc_raw.read() s_mono, s_dual = translate_stream( - s_raw, envs=kwarg.get("envs", {}), prompt=kwarg.get("prompt", []), **locals() + s_raw, + envs=kwarg.get("envs", {}), + prompt=kwarg.get("prompt", []), + **locals(), ) file_mono = Path(output) / f"{filename}-mono.pdf" file_dual = Path(output) / f"{filename}-dual.pdf" diff --git a/pyproject.toml b/pyproject.toml index c8bdefa5..82e334fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,9 @@ dependencies = [ dev = [ "black", "flake8", - "pre-commit" + "pre-commit", + "pytest", + "build" ] backend = [ "flask", diff --git a/test/test_converter.py b/test/test_converter.py new file mode 100644 index 00000000..dbc57f95 --- /dev/null +++ b/test/test_converter.py @@ -0,0 +1,132 @@ +import unittest +from unittest.mock import Mock, patch, MagicMock +from pdfminer.layout import LTPage, LTChar, LTLine +from pdfminer.pdfinterp import PDFResourceManager +from pdf2zh.converter import PDFConverterEx, TranslateConverter + + +class TestPDFConverterEx(unittest.TestCase): + def setUp(self): + self.rsrcmgr = PDFResourceManager() + self.converter = PDFConverterEx(self.rsrcmgr) + + def test_begin_page(self): + mock_page = Mock() + mock_page.pageno = 1 + mock_page.cropbox = (0, 0, 100, 200) + mock_ctm = [1, 0, 0, 1, 0, 0] + self.converter.begin_page(mock_page, mock_ctm) + self.assertIsNotNone(self.converter.cur_item) + self.assertEqual(self.converter.cur_item.pageid, 1) + + def test_render_char(self): + mock_matrix = (1, 2, 3, 4, 5, 6) + mock_font = Mock() + mock_font.to_unichr.return_value = "A" + mock_font.char_width.return_value = 10 + mock_font.char_disp.return_value = (0, 0) + graphic_state = Mock() + self.converter.cur_item = Mock() + result = self.converter.render_char( + mock_matrix, + mock_font, + fontsize=12, + scaling=1.0, + rise=0, + cid=65, + ncs=None, + graphicstate=graphic_state, + ) + self.assertEqual(result, 120.0) # Expected text width + + +class TestTranslateConverter(unittest.TestCase): + def setUp(self): + self.rsrcmgr = PDFResourceManager() + self.layout = {1: Mock()} + self.translator_class = Mock() + self.converter = TranslateConverter( + self.rsrcmgr, + layout=self.layout, + lang_in="en", + lang_out="zh", + service="google", + ) + + def test_translator_initialization(self): + self.assertIsNotNone(self.converter.translator) + self.assertEqual(self.converter.translator.lang_in, "en") + self.assertEqual(self.converter.translator.lang_out, "zh-CN") + + @patch("pdf2zh.converter.TranslateConverter.receive_layout") + def test_receive_layout(self, mock_receive_layout): + mock_page = LTPage(1, (0, 0, 100, 200)) + mock_font = Mock() + mock_font.fontname.return_value = "mock_font" + mock_page.add( + LTChar( + matrix=(1, 2, 3, 4, 5, 6), + font=mock_font, + fontsize=12, + scaling=1.0, + rise=0, + text="A", + textwidth=10, + textdisp=(1.0, 1.0), + ncs=Mock(), + graphicstate=Mock(), + ) + ) + self.converter.receive_layout(mock_page) + mock_receive_layout.assert_called_once_with(mock_page) + + @patch("concurrent.futures.ThreadPoolExecutor") + @patch("pdf2zh.cache") + def test_translation(self, mock_cache, mock_executor): + mock_executor.return_value.__enter__.return_value.map.return_value = [ + "你好", + "{v1}", + ] + mock_cache.deterministic_hash.return_value = "test_hash" + mock_cache.load_paragraph.return_value = None + mock_cache.write_paragraph.return_value = None + + sstk = ["Hello", "{v1}"] + self.converter.thread = 2 + results = [] + with patch.object(self.converter, "translator") as mock_translator: + mock_translator.translate.side_effect = lambda x: ( + "你好" if x == "Hello" else x + ) + for s in sstk: + results.append(self.converter.translator.translate(s)) + self.assertEqual(results, ["你好", "{v1}"]) + + def test_receive_layout_with_complex_formula(self): + ltpage = LTPage(1, (0, 0, 500, 500)) + ltchar = Mock() + ltchar.fontname.return_value = "mock_font" + ltline = LTLine(0.1, (0, 0), (10, 20)) + ltpage.add(ltchar) + ltpage.add(ltline) + mock_layout = MagicMock() + mock_layout.shape = (100, 100) + mock_layout.__getitem__.return_value = -1 + self.converter.layout = [None, mock_layout] + self.converter.thread = 1 + result = self.converter.receive_layout(ltpage) + self.assertIsNotNone(result) + + def test_invalid_translation_service(self): + with self.assertRaises(ValueError): + TranslateConverter( + self.rsrcmgr, + layout=self.layout, + lang_in="en", + lang_out="zh", + service="InvalidService", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/test_doclayout.py b/test/test_doclayout.py new file mode 100644 index 00000000..3add5618 --- /dev/null +++ b/test/test_doclayout.py @@ -0,0 +1,105 @@ +import unittest +from unittest.mock import patch, MagicMock +import numpy as np +from pdf2zh.doclayout import ( + OnnxModel, + YoloResult, + YoloBox, +) + + +class TestOnnxModel(unittest.TestCase): + @patch("onnx.load") + @patch("onnxruntime.InferenceSession") + def setUp(self, mock_inference_session, mock_onnx_load): + # Mock ONNX model metadata + mock_model = MagicMock() + mock_model.metadata_props = [ + MagicMock(key="stride", value="32"), + MagicMock(key="names", value="['class1', 'class2']"), + ] + mock_onnx_load.return_value = mock_model + + # Initialize OnnxModel with a fake path + self.model_path = "fake_model_path.onnx" + self.model = OnnxModel(self.model_path) + + def test_stride_property(self): + # Test that stride is correctly set from model metadata + self.assertEqual(self.model.stride, 32) + + def test_resize_and_pad_image(self): + # Create a dummy image (100x200) + image = np.ones((100, 200, 3), dtype=np.uint8) + resized_image = self.model.resize_and_pad_image(image, 1024) + + # Validate the output shape + self.assertEqual(resized_image.shape[0], 512) + self.assertEqual(resized_image.shape[1], 1024) + + # Check that padding has been added + padded_height = resized_image.shape[0] - image.shape[0] + padded_width = resized_image.shape[1] - image.shape[1] + self.assertGreater(padded_height, 0) + self.assertGreater(padded_width, 0) + + def test_scale_boxes(self): + img1_shape = (1024, 1024) # Model input shape + img0_shape = (500, 300) # Original image shape + boxes = np.array([[512, 512, 768, 768]]) # Example bounding box + + scaled_boxes = self.model.scale_boxes(img1_shape, boxes, img0_shape) + + # Verify the output is scaled correctly + self.assertEqual(scaled_boxes.shape, boxes.shape) + self.assertTrue(np.all(scaled_boxes <= max(img0_shape))) + + def test_predict(self): + # Mock model inference output + mock_output = np.random.random((1, 300, 6)) + self.model.model.run.return_value = [mock_output] + + # Create a dummy image + image = np.ones((500, 300, 3), dtype=np.uint8) + + results = self.model.predict(image) + + # Validate predictions + self.assertEqual(len(results), 1) + self.assertIsInstance(results[0], YoloResult) + self.assertGreater(len(results[0].boxes), 0) + self.assertIsInstance(results[0].boxes[0], YoloBox) + + +class TestYoloResult(unittest.TestCase): + def test_yolo_result(self): + # Example prediction data + boxes = [ + [100, 200, 300, 400, 0.9, 0], + [50, 100, 150, 200, 0.8, 1], + ] + names = ["class1", "class2"] + + result = YoloResult(boxes, names) + + # Validate the number of boxes and their order by confidence + self.assertEqual(len(result.boxes), 2) + self.assertGreater(result.boxes[0].conf, result.boxes[1].conf) + self.assertEqual(result.names, names) + + +class TestYoloBox(unittest.TestCase): + def test_yolo_box(self): + # Example box data + box_data = [100, 200, 300, 400, 0.9, 0] + + box = YoloBox(box_data) + + # Validate box properties + self.assertEqual(box.xyxy, box_data[:4]) + self.assertEqual(box.conf, box_data[4]) + self.assertEqual(box.cls, box_data[5]) + + +if __name__ == "__main__": + unittest.main()