-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1063 from opendatalab/release-0.10.0
Release 0.10.0
- Loading branch information
Showing
110 changed files
with
25,715 additions
and
2,268 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
"""span维度自定义字段.""" | ||
# Span-level custom field: whether the span was merged across pages.
CROSS_PAGE = 'cross_page'

# Block-level custom field: whether the lines inside the block were deleted.
LINES_DELETED = 'lines_deleted'

# table recognition max time default value
TABLE_MAX_TIME_VALUE = 400

# pp_table_result_max_length
TABLE_MAX_LEN = 480

# table master structure dict
TABLE_MASTER_DICT = 'table_master_structure_dict.txt'

# table master dir
TABLE_MASTER_DIR = 'table_structure_tablemaster_infer/'

# pp detect model dir
DETECT_MODEL_DIR = 'ch_PP-OCRv4_det_infer'

# pp rec model dir
REC_MODEL_DIR = 'ch_PP-OCRv4_rec_infer'

# pp rec char dict path
REC_CHAR_DICT = 'ppocr_keys_v1.txt'

# pp rec copy rec directory
PP_REC_DIRECTORY = '.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer'

# pp rec copy det directory
PP_DET_DIRECTORY = '.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer'
|
||
|
||
class MODEL_NAME:
    """Namespace of model-name string constants."""

    # pp table structure algorithm
    TABLE_MASTER = 'tablemaster'
    # struct eqtable
    STRUCT_EQTABLE = 'struct_eqtable'

    DocLayout_YOLO = 'doclayout_yolo'

    LAYOUTLMv3 = 'layoutlmv3'

    YOLO_V8_MFD = 'yolo_v8_mfd'

    UniMerNet_v2_Small = 'unimernet_small'

    RAPID_TABLE = 'rapid_table'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
class DropReason:
    """Namespace of string constants naming the reasons a document is dropped.

    Several attribute names contain historical misspellings
    (``BLCOK``, ``lOAD``). They are kept unchanged so existing callers keep
    working; correctly spelled aliases are defined at the end of the class.
    """

    # Text blocks overlap horizontally, so the text order cannot be
    # determined accurately.
    TEXT_BLCOK_HOR_OVERLAP = 'text_block_horizontal_overlap'
    # Blocks that need to be kept overlap horizontally.
    USEFUL_BLOCK_HOR_OVERLAP = 'useful_block_horizontal_overlap'
    # Complicated layout; not supported for now.
    COMPLICATED_LAYOUT = 'complicated_layout'
    # Layouts with more than 2 columns are currently not supported.
    TOO_MANY_LAYOUT_COLUMNS = 'too_many_layout_columns'
    # The PDF contains colored blocks, which change the reading order; PDFs
    # with background-colored text blocks are currently not supported.
    COLOR_BACKGROUND_TEXT_BOX = 'color_background_text_box'
    # Contains special images; the computation is too heavy, so it is dropped.
    HIGH_COMPUTATIONAL_lOAD_BY_IMGS = 'high_computational_load_by_imgs'
    # Special SVG graphics; the computation is too heavy, so it is dropped.
    HIGH_COMPUTATIONAL_lOAD_BY_SVGS = 'high_computational_load_by_svgs'
    # Computation exceeds capacity; too expensive with the current method.
    HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = 'high_computational_load_by_total_pages'
    # Layout analysis failed.
    MISS_DOC_LAYOUT_RESULT = 'missing doc_layout_result'
    # An exception occurred during parsing.
    # NOTE: this name shadows the builtin ``Exception`` inside this class
    # body only; it is kept for backward compatibility.
    Exception = '_exception'
    # The PDF is encrypted.
    ENCRYPTED = 'encrypted'
    # The total page count of the PDF is 0.
    EMPTY_PDF = 'total_page=0'
    # Not a text-based PDF; it cannot be parsed directly.
    NOT_IS_TEXT_PDF = 'not_is_text_pdf'
    # Cannot be clearly split into paragraphs.
    DENSE_SINGLE_LINE_BLOCK = 'dense_single_line_block'
    # Title detection failed.
    TITLE_DETECTION_FAILED = 'title_detection_failed'
    # Failed to analyze title levels (e.g. level-1, level-2, level-3 headings).
    TITLE_LEVEL_FAILED = 'title_level_failed'
    # Paragraph recognition failed.
    PARA_SPLIT_FAILED = 'para_split_failed'
    # Paragraph merging failed.
    PARA_MERGE_FAILED = 'para_merge_failed'
    # Unsupported language.
    NOT_ALLOW_LANGUAGE = 'not_allow_language'
    SPECIAL_PDF = 'special_pdf'
    # Text columns cannot be determined accurately.
    PSEUDO_SINGLE_COLUMN = 'pseudo_single_column'
    # The page layout cannot be analyzed.
    CAN_NOT_DETECT_PAGE_LAYOUT = 'can_not_detect_page_layout'
    # Scaling resulted in a negative bbox area.
    NEGATIVE_BBOX_AREA = 'negative_bbox_area'
    # Overlapping blocks cannot be separated.
    OVERLAP_BLOCKS_CAN_NOT_SEPARATION = 'overlap_blocks_can_t_separation'

    # Correctly spelled aliases; the misspelled originals above are kept so
    # existing callers keep working.
    TEXT_BLOCK_HOR_OVERLAP = TEXT_BLCOK_HOR_OVERLAP
    HIGH_COMPUTATIONAL_LOAD_BY_IMGS = HIGH_COMPUTATIONAL_lOAD_BY_IMGS
    HIGH_COMPUTATIONAL_LOAD_BY_SVGS = HIGH_COMPUTATIONAL_lOAD_BY_SVGS
    HIGH_COMPUTATIONAL_LOAD_BY_TOTAL_PAGES = HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
|
||
COLOR_BG_HEADER_TXT_BLOCK = 'color_background_header_txt_block'
PAGE_NO = 'page-no'  # page number
CONTENT_IN_FOOT_OR_HEADER = 'in-foot-header-area'  # text inside the header/footer area
VERTICAL_TEXT = 'vertical-text'  # vertical text
ROTATE_TEXT = 'rotate-text'  # rotated text
EMPTY_SIDE_BLOCK = 'empty-side-block'  # blank block on the edge with no content
ON_IMAGE_TEXT = 'on-image-text'  # text placed on an image
ON_TABLE_TEXT = 'on-table-text'  # text placed on a table
|
||
|
||
class DropTag:
    """Namespace of drop-tag string constants."""

    PAGE_NUMBER = 'page_no'
    HEADER = 'header'
    FOOTER = 'footer'
    FOOTNOTE = 'footnote'
    NOT_IN_LAYOUT = 'not_in_layout'
    SPAN_OVERLAP = 'span_overlap'
    BLOCK_OVERLAP = 'block_overlap'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
class MakeMode:
    """Namespace of make-mode string constants."""

    MM_MD = 'mm_markdown'
    NLP_MD = 'nlp_markdown'
    STANDARD_FORMAT = 'standard_format'
|
||
|
||
class DropMode:
    """Namespace of drop-mode string constants."""

    WHOLE_PDF = 'whole_pdf'
    SINGLE_PAGE = 'single_page'
    NONE = 'none'
    NONE_WITH_REASON = 'none_with_reason'
3 changes: 2 additions & 1 deletion
3
magic_pdf/libs/ModelBlockTypeEnum.py → magic_pdf/config/model_block_type.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,10 @@ | ||
from enum import Enum | ||
|
||
|
||
class ModelBlockTypeEnum(Enum):
    """Integer codes for model block types."""

    TITLE = 0
    PLAIN_TEXT = 1
    ABANDON = 2
    ISOLATE_FORMULA = 8
    EMBEDDING = 13
    # Defined exactly once: repeating a member name inside an Enum body
    # raises TypeError when the class is created.
    ISOLATED = 14
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.