refactor: 支持解析pdf中的图片

1Panel-dev · Aug 15, 2024 · 7ea7a0c · 7ea7a0c
1 parent 19b07e9
commit 7ea7a0c
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 7 deletions.
diff --git a/apps/common/handle/impl/pdf_split_handle.py b/apps/common/handle/impl/pdf_split_handle.py
@@ -9,7 +9,10 @@
 import re
 from typing import List
 
-import fitz
+import pypdf
+import os
+import tempfile
+from langchain_community.document_loaders import PyPDFLoader
 
 from common.handle.base_split_handle import BaseSplitHandle
 from common.util.split_model import SplitModel
@@ -24,16 +27,45 @@
 
 
 def number_to_text(pdf_document, page_number):
-    page = pdf_document.load_page(page_number)
-    text = page.get_text()
-    return text
+    return pdf_document[page_number].page_content  # pdf_document is the indexable result of loader.load(); each entry exposes its text as .page_content
+
+
+def check_pdf_is_image(pdf_path):  # True if no page has extractable text, False if any page does, None on error
+    try:
+        # Open the PDF in binary mode and scan it page by page
+        with open(pdf_path, "rb") as f:
+            reader = pypdf.PdfReader(f)
+            for page_num in range(len(reader.pages)):
+                page = reader.pages[page_num]
+
+                # Try to extract text from this page
+                text = page.extract_text()
+                if text and text.strip():  # the page contains non-whitespace text
+                    return False  # not a pure-image PDF; stop at the first page with text
+
+        # No page yielded any extractable text
+        return True  # probably images or a scanned document
+
+    except Exception as e:
+        print(f"Error: {e}")
+        return None  # falsy: the `if check_pdf_is_image(...)` call site then takes its else branch
 
 
 class PdfSplitHandle(BaseSplitHandle):
-    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer,save_image):
+    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
+        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+            # Copy the uploaded file into the temp file chunk by chunk (the loader below is given a path)
+            for chunk in file.chunks():
+                temp_file.write(chunk)
+            # Keep the temp file's path. NOTE(review): this write runs before the try, so a failure here skips the finally cleanup
+            temp_file_path = temp_file.name
+
         try:
-            buffer = get_buffer(file)
-            pdf_document = fitz.open(file.name, buffer)
+            if check_pdf_is_image(temp_file_path):  # truthy only when no page yielded text
+                loader = PyPDFLoader(temp_file_path, extract_images=True)
+            else:
+                loader = PyPDFLoader(temp_file_path, extract_images=False)
+            pdf_document = loader.load()
             content = "\n".join([number_to_text(pdf_document, page_number) for page_number in range(len(pdf_document))])
             if pattern_list is not None and len(pattern_list) > 0:
                 split_model = SplitModel(pattern_list, with_filter, limit)
@@ -42,6 +74,10 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
         except BaseException as e:
             return {'name': file.name,
                     'content': []}
+        finally:
+            # Remove the temp file once the try block is left, whether parsing succeeded or failed
+            os.remove(temp_file_path)
+
         return {'name': file.name,
                 'content': split_model.parse(content)
                 }

diff --git a/pyproject.toml b/pyproject.toml
@@ -32,6 +32,8 @@ langchain-openai = "^0.1.8"
 django-ipware = "^6.0.4"
 django-apscheduler = "^0.6.2"
 pymupdf = "1.24.1"
+pypdf = "4.3.1"
+rapidocr-onnxruntime = "1.3.24"
 python-docx = "^1.1.0"
 xlwt = "^1.3.0"
 dashscope = "^1.17.0"