Skip to content

Commit

Permalink
refactor: 支持解析pdf中的图片
Browse files Browse the repository at this point in the history
  • Loading branch information
liuruibin committed Aug 15, 2024
1 parent 19b07e9 commit 7ea7a0c
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 7 deletions.
50 changes: 43 additions & 7 deletions apps/common/handle/impl/pdf_split_handle.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@
import re
from typing import List

import fitz
import pypdf
import os
import tempfile
from langchain_community.document_loaders import PyPDFLoader

from common.handle.base_split_handle import BaseSplitHandle
from common.util.split_model import SplitModel
Expand All @@ -24,16 +27,45 @@


def number_to_text(pdf_document, page_number):
    """Return the text content of a single page of a loaded PDF.

    Args:
        pdf_document: Indexable sequence of page objects, each exposing a
            ``page_content`` string attribute. In this file it is the
            result of ``loader.load()`` on a ``PyPDFLoader``.
        page_number: Zero-based index of the page to read.

    Returns:
        The ``page_content`` of the requested page.

    Raises:
        IndexError: If ``page_number`` is outside ``pdf_document``.
    """
    # The loaded document is a plain sequence of pages, not a PyMuPDF
    # document, so the page text is read from ``page_content`` rather than
    # via ``load_page(...).get_text()``.
    return pdf_document[page_number].page_content


def check_pdf_is_image(pdf_path):
    """Report whether a PDF appears to contain no extractable text.

    Each page is probed with pypdf's text extraction; the first page that
    yields non-whitespace text settles the answer.

    Args:
        pdf_path: Filesystem path of the PDF file to inspect.

    Returns:
        ``False`` as soon as any page yields non-blank text, ``True`` when
        no page does (the file is probably image-only or a scan), or
        ``None`` when the file could not be opened or parsed. ``None`` is
        falsy, so callers that test truthiness treat a failed probe the
        same as a text PDF.
    """
    # Function-scope import keeps this block self-contained.
    import logging

    try:
        with open(pdf_path, "rb") as f:
            reader = pypdf.PdfReader(f)
            # Iterate the pages directly instead of indexing by position.
            for page in reader.pages:
                text = page.extract_text()
                if text and text.strip():
                    # Real text was found, so this is not a pure-image PDF.
                    return False
            # No page produced any text: likely images or a scanned document.
            return True
    except Exception:
        # Best-effort probe: keep returning None on failure, but record the
        # full traceback in the log instead of printing to stdout.
        logging.getLogger(__name__).exception(
            "Failed to check whether PDF %r contains only images", pdf_path
        )
        return None


class PdfSplitHandle(BaseSplitHandle):
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer,save_image):
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
# 将上传的文件保存到临时文件中
for chunk in file.chunks():
temp_file.write(chunk)
# 获取临时文件的路径
temp_file_path = temp_file.name

try:
buffer = get_buffer(file)
pdf_document = fitz.open(file.name, buffer)
if check_pdf_is_image(temp_file_path):
loader = PyPDFLoader(temp_file_path, extract_images=True)
else:
loader = PyPDFLoader(temp_file_path, extract_images=False)
pdf_document = loader.load()
content = "\n".join([number_to_text(pdf_document, page_number) for page_number in range(len(pdf_document))])
if pattern_list is not None and len(pattern_list) > 0:
split_model = SplitModel(pattern_list, with_filter, limit)
Expand All @@ -42,6 +74,10 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
except BaseException as e:
return {'name': file.name,
'content': []}
finally:
# 处理完后可以删除临时文件
os.remove(temp_file_path)

return {'name': file.name,
'content': split_model.parse(content)
}
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ langchain-openai = "^0.1.8"
django-ipware = "^6.0.4"
django-apscheduler = "^0.6.2"
pymupdf = "1.24.1"
pypdf = "4.3.1"
rapidocr-onnxruntime = "1.3.24"
python-docx = "^1.1.0"
xlwt = "^1.3.0"
dashscope = "^1.17.0"
Expand Down

0 comments on commit 7ea7a0c

Please sign in to comment.