diff --git a/apps/common/handle/impl/pdf_split_handle.py b/apps/common/handle/impl/pdf_split_handle.py
index 4d835ca11a..7d0549c8d9 100644
--- a/apps/common/handle/impl/pdf_split_handle.py
+++ b/apps/common/handle/impl/pdf_split_handle.py
@@ -9,7 +9,11 @@
 import re
+import logging
+import os
+import tempfile
 from typing import List
 
-import fitz
+import pypdf
+from langchain_community.document_loaders import PyPDFLoader
 
 from common.handle.base_split_handle import BaseSplitHandle
 from common.util.split_model import SplitModel
@@ -24,16 +28,62 @@
 
 
 def number_to_text(pdf_document, page_number):
-    page = pdf_document.load_page(page_number)
-    text = page.get_text()
-    return text
+    """Return the text of page ``page_number`` of a loaded PDF.
+
+    ``pdf_document`` is the list produced by ``PyPDFLoader.load()``; each entry
+    exposes its page text as ``page_content``.
+    """
+    return pdf_document[page_number].page_content
+
+
+def check_pdf_is_image(pdf_path):
+    """Tell whether the PDF at ``pdf_path`` has no extractable text at all.
+
+    :param pdf_path: filesystem path of the PDF to inspect
+    :return: ``False`` if any page yields non-blank text, ``True`` if none does
+             (probably an image-only or scanned PDF), ``None`` if the file
+             could not be opened or parsed
+    """
+    try:
+        with open(pdf_path, "rb") as f:
+            reader = pypdf.PdfReader(f)
+            # any() stops at the first page with text, matching the early return it replaces.
+            has_text = any((page.extract_text() or "").strip() for page in reader.pages)
+        return not has_text
+    except Exception:
+        # Best effort: log with traceback instead of printing; the caller treats None as falsy.
+        logging.getLogger(__name__).exception("Failed to inspect PDF %s", pdf_path)
+        return None
 
 
 class PdfSplitHandle(BaseSplitHandle):
-    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer,save_image):
+    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
+        """Extract the text of an uploaded PDF and split it with ``SplitModel``.
+
+        The upload is copied to a temporary file because ``PyPDFLoader`` is given
+        a path; the file is removed again once processing ends.
+
+        :param file: uploaded file providing ``name`` and ``chunks()``
+        :param pattern_list: split patterns handed to ``SplitModel`` when non-empty
+        :param with_filter: forwarded to ``SplitModel``
+        :param limit: forwarded to ``SplitModel``
+        :param get_buffer: appears unused since the upload is read via ``chunks()`` (confirm)
+        :param save_image: not referenced in this part of the method (confirm)
+        :return: dict with ``name`` and ``content``; ``content`` is ``[]`` when
+                 reading or parsing the PDF raises
+        """
+        # delete=False: the loader reopens the file by path, so cleanup is done in ``finally``.
+        temp_file = tempfile.NamedTemporaryFile(delete=False)
+        temp_file_path = temp_file.name
         try:
-            buffer = get_buffer(file)
-            pdf_document = fitz.open(file.name, buffer)
+            # Writing happens inside the try so a failed upload read still reaches the cleanup.
+            with temp_file:
+                for chunk in file.chunks():
+                    temp_file.write(chunk)
+            # Extract embedded images only when no page has extractable text.
+            extract_images = bool(check_pdf_is_image(temp_file_path))
+            loader = PyPDFLoader(temp_file_path, extract_images=extract_images)
+            pdf_document = loader.load()
             content = "\n".join([number_to_text(pdf_document, page_number) for page_number in range(len(pdf_document))])
             if pattern_list is not None and len(pattern_list) > 0:
                 split_model = SplitModel(pattern_list, with_filter, limit)
@@ -42,6 +92,10 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
         except BaseException as e:
             return {'name': file.name,
                     'content': []}
+        finally:
+            # Always remove the temporary file, on success and on failure alike.
+            os.remove(temp_file_path)
+
         return {'name': file.name,
                 'content': split_model.parse(content)
                 }
diff --git a/pyproject.toml b/pyproject.toml
index 135cc35117..7bc097754b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,6 +32,8 @@ langchain-openai = "^0.1.8"
 django-ipware = "^6.0.4"
 django-apscheduler = "^0.6.2"
 pymupdf = "1.24.1"
+pypdf = "4.3.1"
+rapidocr-onnxruntime = "1.3.24"
 python-docx = "^1.1.0"
 xlwt = "^1.3.0"
 dashscope = "^1.17.0"