diff --git a/apps/common/handle/impl/pdf_split_handle.py b/apps/common/handle/impl/pdf_split_handle.py index 828196b7ba..b759c6d6a1 100644 --- a/apps/common/handle/impl/pdf_split_handle.py +++ b/apps/common/handle/impl/pdf_split_handle.py @@ -104,6 +104,9 @@ def handle_pdf_content(file, pdf_document): content += page_content + # Null characters are not allowed. + content = content.replace('\0', '') + elapsed_time = time.time() - start_time max_kb.debug( f"File: {file.name}, Page: {page_num + 1}, Time : {elapsed_time: .3f}s, content-length: {len(page_content)}") @@ -156,6 +159,10 @@ def handle_toc(doc, limit): text = text[:idx] chapter_text += text # 提取文本 + + # Null characters are not allowed. + chapter_text = chapter_text.replace('\0', '') + # 限制章节内容长度 if 0 < limit < len(chapter_text): split_text = PdfSplitHandle.split_text(chapter_text, limit) @@ -228,6 +235,9 @@ def handle_links(doc, pattern_list, with_filter, limit): text = text[:idx] chapter_text += text + # Null characters are not allowed. + chapter_text = chapter_text.replace('\0', '') + # 限制章节内容长度 if 0 < limit < len(chapter_text): split_text = PdfSplitHandle.split_text(chapter_text, limit)