Merge pull request #1063 from opendatalab/release-0.10.0

Release 0.10.0
opendatalab · Nov 22, 2024 · 158e556 · 158e556
2 parents 038f48d + 30be501
commit 158e556
Show file tree

Hide file tree

Showing 110 changed files with 25,715 additions and 2,268 deletions.
diff --git a/.github/workflows/cli.yml b/.github/workflows/cli.yml
@@ -3,13 +3,6 @@
 
 name: mineru
 on:
-  push:
-    branches:
-      - "master"
-      - "dev"
-    paths-ignore:
-      - "cmds/**"
-      - "**.md"
   pull_request:
     branches:
       - "master"

diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml
@@ -20,6 +20,7 @@ jobs:
         source activate mineru
         conda env list
         pip show coverage
+        git checkout "dev"
         # cd $GITHUB_WORKSPACE && sh tests/retry_env.sh
         cd $GITHUB_WORKSPACE && python tests/clean_coverage.py      
         cd $GITHUB_WORKSPACE && coverage run -m pytest tests/unittest/ --cov=magic_pdf/  --cov-report html --cov-report term-missing

diff --git a/.github/workflows/huigui.yml b/.github/workflows/huigui.yml
@@ -10,7 +10,6 @@ on:
     paths-ignore:
       - "cmds/**"
       - "**.md"
-  workflow_dispatch:
 jobs:
   cli-test:
     if: github.repository == 'opendatalab/MinerU'

diff --git a/README.md b/README.md
@@ -42,6 +42,9 @@
 </div>
 
 # Changelog
+- 2024/11/22 0.10.0 released. This release introduces hybrid OCR text extraction capabilities:
+  - Significantly improved parsing performance in complex text distribution scenarios such as dense formulas, irregular span regions, and text represented by images.
+  - Combines the dual advantages of accurate content extraction and faster speed in text mode, and more precise span/line region recognition in OCR mode.
 - 2024/11/15 0.9.3 released. Integrated [RapidTable](https://github.com/RapidAI/RapidTable) for table recognition, improving single-table parsing speed by more than 10 times, with higher accuracy and lower GPU memory usage.
 - 2024/11/06 0.9.2 released. Integrated the [StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B) model for table recognition functionality.
 - 2024/10/31 0.9.0 released. This is a major new version with extensive code refactoring, addressing numerous issues, improving performance, reducing hardware requirements, and enhancing usability:

diff --git a/README_zh-CN.md b/README_zh-CN.md
@@ -42,6 +42,9 @@
 </div>
 
 # 更新记录
+- 2024/11/22 0.10.0发布，通过引入混合OCR文本提取能力，
+  - 在公式密集、span区域不规范、部分文本使用图像表现等复杂文本分布场景下获得解析效果的显著提升
+  - 同时具备文本模式内容提取准确、速度更快与OCR模式span/line区域识别更准的双重优势
 - 2024/11/15 0.9.3发布，为表格识别功能接入了[RapidTable](https://github.com/RapidAI/RapidTable),单表解析速度提升10倍以上，准确率更高，显存占用更低
 - 2024/11/06 0.9.2发布，为表格识别功能接入了[StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B)模型
 - 2024/10/31 0.9.0发布，这是我们进行了大量代码重构的全新版本，解决了众多问题，提升了性能，降低了硬件需求，并提供了更丰富的易用性：

diff --git a/magic_pdf/config/constants.py b/magic_pdf/config/constants.py
@@ -0,0 +1,53 @@
+"""Custom span-level fields."""
+# span-level field: whether the span was merged across pages
+CROSS_PAGE = 'cross_page'
+
+"""
+block维度自定义字段
+"""
+# block-level field: whether the lines in the block were deleted
+LINES_DELETED = 'lines_deleted'
+
+# table recognition max time default value
+TABLE_MAX_TIME_VALUE = 400
+
+# pp_table_result_max_length
+TABLE_MAX_LEN = 480
+
+# table master structure dict
+TABLE_MASTER_DICT = 'table_master_structure_dict.txt'
+
+# table master dir
+TABLE_MASTER_DIR = 'table_structure_tablemaster_infer/'
+
+# pp detect model dir
+DETECT_MODEL_DIR = 'ch_PP-OCRv4_det_infer'
+
+# pp rec model dir
+REC_MODEL_DIR = 'ch_PP-OCRv4_rec_infer'
+
+# pp rec char dict path
+REC_CHAR_DICT = 'ppocr_keys_v1.txt'
+
+# pp rec copy rec directory
+PP_REC_DIRECTORY = '.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer'
+
+# pp rec copy det directory
+PP_DET_DIRECTORY = '.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer'
+
+
+class MODEL_NAME:  # plain class (not an Enum) used as a namespace of string constants
+    # pp table structure algorithm
+    TABLE_MASTER = 'tablemaster'
+    # struct eqtable
+    STRUCT_EQTABLE = 'struct_eqtable'
+
+    DocLayout_YOLO = 'doclayout_yolo'
+
+    LAYOUTLMv3 = 'layoutlmv3'
+
+    YOLO_V8_MFD = 'yolo_v8_mfd'
+
+    UniMerNet_v2_Small = 'unimernet_small'
+
+    RAPID_TABLE = 'rapid_table'
diff --git a/magic_pdf/config/drop_reason.py b/magic_pdf/config/drop_reason.py
@@ -0,0 +1,35 @@
+class DropReason:
+    TEXT_BLCOK_HOR_OVERLAP = 'text_block_horizontal_overlap'  # text blocks overlap horizontally, so the text order cannot be determined accurately (NOTE(review): "BLCOK" is a typo for "BLOCK"; renaming would break existing references)
+    USEFUL_BLOCK_HOR_OVERLAP = (
+        'useful_block_horizontal_overlap'  # blocks that must be kept overlap horizontally
+    )
+    COMPLICATED_LAYOUT = 'complicated_layout'  # complicated layout, not supported for now
+    TOO_MANY_LAYOUT_COLUMNS = 'too_many_layout_columns'  # layouts with more than 2 columns are not supported yet
+    COLOR_BACKGROUND_TEXT_BOX = 'color_background_text_box'  # the PDF contains colored blocks, which change the reading order; PDFs with background-colored text blocks are not supported yet
+    HIGH_COMPUTATIONAL_lOAD_BY_IMGS = (
+        'high_computational_load_by_imgs'  # contains special images; dropped because the computation is too heavy (NOTE(review): the name has a lowercase "l" in "lOAD")
+    )
+    HIGH_COMPUTATIONAL_lOAD_BY_SVGS = (
+        'high_computational_load_by_svgs'  # special SVG graphics; dropped because the computation is too heavy
+    )
+    HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = 'high_computational_load_by_total_pages'  # computation exceeds capacity; too costly with the current method
+    MISS_DOC_LAYOUT_RESULT = 'missing doc_layout_result'  # layout analysis failed
+    Exception = '_exception'  # an exception occurred during parsing (NOTE(review): this attribute name shadows the builtin Exception inside the class body)
+    ENCRYPTED = 'encrypted'  # the PDF is encrypted
+    EMPTY_PDF = 'total_page=0'  # the PDF has 0 pages in total
+    NOT_IS_TEXT_PDF = 'not_is_text_pdf'  # not a text-based PDF; cannot be parsed directly
+    DENSE_SINGLE_LINE_BLOCK = 'dense_single_line_block'  # cannot be split into paragraphs clearly
+    TITLE_DETECTION_FAILED = 'title_detection_failed'  # title detection failed
+    TITLE_LEVEL_FAILED = (
+        'title_level_failed'  # failed to analyze title levels (e.g. level-1, level-2, level-3 headings)
+    )
+    PARA_SPLIT_FAILED = 'para_split_failed'  # paragraph recognition failed
+    PARA_MERGE_FAILED = 'para_merge_failed'  # paragraph merging failed
+    NOT_ALLOW_LANGUAGE = 'not_allow_language'  # unsupported language
+    SPECIAL_PDF = 'special_pdf'
+    PSEUDO_SINGLE_COLUMN = 'pseudo_single_column'  # cannot determine the text columns precisely
+    CAN_NOT_DETECT_PAGE_LAYOUT = 'can_not_detect_page_layout'  # cannot analyze the page layout
+    NEGATIVE_BBOX_AREA = 'negative_bbox_area'  # scaling caused a negative bbox area
+    OVERLAP_BLOCKS_CAN_NOT_SEPARATION = (
+        'overlap_blocks_can_t_separation'  # cannot separate overlapping blocks
+    )
diff --git a/magic_pdf/config/drop_tag.py b/magic_pdf/config/drop_tag.py
@@ -0,0 +1,19 @@
+
+COLOR_BG_HEADER_TXT_BLOCK = 'color_background_header_txt_block'
+PAGE_NO = 'page-no'  # page number (NOTE(review): DropTag.PAGE_NUMBER in this file uses 'page_no' with an underscore; confirm the two are meant to differ)
+CONTENT_IN_FOOT_OR_HEADER = 'in-foot-header-area'  # text inside the header/footer area
+VERTICAL_TEXT = 'vertical-text'  # vertical text
+ROTATE_TEXT = 'rotate-text'  # rotated text
+EMPTY_SIDE_BLOCK = 'empty-side-block'  # blank block on the page edge with no content at all
+ON_IMAGE_TEXT = 'on-image-text'  # text located on an image
+ON_TABLE_TEXT = 'on-table-text'  # text located on a table
+
+
+class DropTag:  # plain class (not an Enum) used as a namespace of string constants
+    PAGE_NUMBER = 'page_no'
+    HEADER = 'header'
+    FOOTER = 'footer'
+    FOOTNOTE = 'footnote'
+    NOT_IN_LAYOUT = 'not_in_layout'
+    SPAN_OVERLAP = 'span_overlap'
+    BLOCK_OVERLAP = 'block_overlap'
diff --git a/magic_pdf/config/make_content_config.py b/magic_pdf/config/make_content_config.py
@@ -0,0 +1,11 @@
+class MakeMode:  # plain namespace of string constants; presumably the output formats for content generation (confirm against callers)
+    MM_MD = 'mm_markdown'
+    NLP_MD = 'nlp_markdown'
+    STANDARD_FORMAT = 'standard_format'
+
+
+class DropMode:  # plain namespace of string constants; presumably the granularity at which content is dropped (confirm against callers)
+    WHOLE_PDF = 'whole_pdf'
+    SINGLE_PAGE = 'single_page'
+    NONE = 'none'
+    NONE_WITH_REASON = 'none_with_reason'
diff --git a/magic_pdf/libs/ModelBlockTypeEnum.py → magic_pdf/config/model_block_type.py b/magic_pdf/libs/ModelBlockTypeEnum.py → magic_pdf/config/model_block_type.py
@@ -1,9 +1,10 @@
 from enum import Enum
 
+
 class ModelBlockTypeEnum(Enum):
     TITLE = 0
     PLAIN_TEXT = 1
     ABANDON = 2
     ISOLATE_FORMULA = 8
     EMBEDDING = 13
-    ISOLATED = 14
+    ISOLATED = 14
diff --git a/magic_pdf/libs/ocr_content_type.py → magic_pdf/config/ocr_content_type.py b/magic_pdf/libs/ocr_content_type.py → magic_pdf/config/ocr_content_type.py
diff --git a/magic_pdf/data/read_api.py b/magic_pdf/data/read_api.py
@@ -35,7 +35,7 @@ def read_jsonl(
     jsonl_d = [
         json.loads(line) for line in jsonl_bits.decode().split('\n') if line.strip()
     ]
-    for d in jsonl_d[:5]:
+    for d in jsonl_d:
         pdf_path = d.get('file_location', '') or d.get('path', '')
         if len(pdf_path) == 0:
             raise EmptyData('pdf file location is empty')