Skip to content

Commit

Permalink
refactor: 支持解析pdf中的图片
Browse files Browse the repository at this point in the history
  • Loading branch information
liuruibin committed Aug 15, 2024
1 parent 87c17de commit a1c57e6
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 6 deletions.
35 changes: 29 additions & 6 deletions apps/common/handle/impl/pdf_split_handle.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
from typing import List

import fitz
import os
import tempfile
from langchain_community.document_loaders import PyPDFLoader

from common.handle.base_split_handle import BaseSplitHandle
from common.util.split_model import SplitModel
Expand All @@ -24,16 +27,32 @@


def number_to_text(pdf_document, page_number):
    """Return the text content of a single page of a loaded PDF.

    Args:
        pdf_document: Indexable sequence of page objects that expose a
            ``page_content`` attribute (presumably the list returned by
            ``PyPDFLoader.load()`` -- confirm against the caller).
        page_number: Index of the page within ``pdf_document``.

    Returns:
        The ``page_content`` value of the selected page.

    Raises:
        IndexError: If ``page_number`` is outside ``pdf_document``.
    """
    # The earlier PyMuPDF path (load_page / get_text) is dropped on purpose:
    # it returned before this line could run and does not apply to a list
    # of loader documents.
    return pdf_document[page_number].page_content

def is_text_based_pdf(filename):
    """Check whether a PDF contains extractable text.

    Args:
        filename: Path of the PDF file, passed directly to ``fitz.open``.

    Returns:
        True as soon as any page yields non-empty text from ``get_text()``;
        False when no page does, including a document with no pages.
    """
    # Open the document as a context manager so it is closed on every path,
    # including the early exit on the first page with text. A document left
    # open keeps a handle on the file, which can block its later deletion.
    with fitz.open(filename) as doc:
        return any(page.get_text() for page in doc)

class PdfSplitHandle(BaseSplitHandle):
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer,save_image):
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
# 将上传的文件保存到临时文件中
for chunk in file.chunks():
temp_file.write(chunk)
# 获取临时文件的路径
temp_file_path = temp_file.name

try:
buffer = get_buffer(file)
pdf_document = fitz.open(file.name, buffer)
if is_text_based_pdf(temp_file_path):
loader = PyPDFLoader(temp_file_path, extract_images=False)
else:
loader = PyPDFLoader(temp_file_path, extract_images=True)
pdf_document = loader.load()
content = "\n".join([number_to_text(pdf_document, page_number) for page_number in range(len(pdf_document))])
if pattern_list is not None and len(pattern_list) > 0:
split_model = SplitModel(pattern_list, with_filter, limit)
Expand All @@ -42,6 +61,10 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
except BaseException as e:
return {'name': file.name,
'content': []}
finally:
# 处理完后可以删除临时文件
os.remove(temp_file_path)

return {'name': file.name,
'content': split_model.parse(content)
}
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ langchain-openai = "^0.1.8"
django-ipware = "^6.0.4"
django-apscheduler = "^0.6.2"
pymupdf = "1.24.1"
pypdf = "4.3.1"
rapidocr-onnxruntime = "1.3.24"
python-docx = "^1.1.0"
xlwt = "^1.3.0"
dashscope = "^1.17.0"
Expand Down

0 comments on commit a1c57e6

Please sign in to comment.