Skip to content

Commit

Permalink
fix: 处理 PDF 中出现 \0 字符时报 "Null characters are not allowed" 错误的问题
Browse files Browse the repository at this point in the history
--bug=1048190 --user=刘瑞斌 【知识库】- 上传PDF文档 报错  ,关联issue #1468 https://www.tapd.cn/57709429/s/1611070
  • Loading branch information
liuruibin committed Nov 18, 2024
1 parent 4dd497e commit 636d900
Showing 1 changed file with 10 additions and 0 deletions.
10 changes: 10 additions & 0 deletions apps/common/handle/impl/pdf_split_handle.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,9 @@ def handle_pdf_content(file, pdf_document):

content += page_content

# Strip NUL ('\0') characters from the extracted PDF text; they trigger a "Null characters are not allowed" error.
content = content.replace('\0', '')

elapsed_time = time.time() - start_time
max_kb.debug(
f"File: {file.name}, Page: {page_num + 1}, Time : {elapsed_time: .3f}s, content-length: {len(page_content)}")
Expand Down Expand Up @@ -156,6 +159,10 @@ def handle_toc(doc, limit):
text = text[:idx]

chapter_text += text # 提取文本

# Strip NUL ('\0') characters from the chapter text; they trigger a "Null characters are not allowed" error.
chapter_text = chapter_text.replace('\0', '')

# 限制章节内容长度
if 0 < limit < len(chapter_text):
split_text = PdfSplitHandle.split_text(chapter_text, limit)
Expand Down Expand Up @@ -228,6 +235,9 @@ def handle_links(doc, pattern_list, with_filter, limit):
text = text[:idx]
chapter_text += text

# Strip NUL ('\0') characters from the chapter text; they trigger a "Null characters are not allowed" error.
chapter_text = chapter_text.replace('\0', '')

# 限制章节内容长度
if 0 < limit < len(chapter_text):
split_text = PdfSplitHandle.split_text(chapter_text, limit)
Expand Down

0 comments on commit 636d900

Please sign in to comment.