refactor: 支持解析pdf中的图片

1Panel-dev · Aug 15, 2024 · a1c57e6 · a1c57e6
1 parent 87c17de
commit a1c57e6
Show file tree

Hide file tree

Showing 2 changed files with 31 additions and 6 deletions.
diff --git a/apps/common/handle/impl/pdf_split_handle.py b/apps/common/handle/impl/pdf_split_handle.py
@@ -10,6 +10,9 @@
 from typing import List
 
 import fitz
+import os
+import tempfile
+from langchain_community.document_loaders import PyPDFLoader
 
 from common.handle.base_split_handle import BaseSplitHandle
 from common.util.split_model import SplitModel
@@ -24,16 +27,32 @@
 
 
 def number_to_text(pdf_document, page_number):
-    page = pdf_document.load_page(page_number)
-    text = page.get_text()
-    return text
+    return pdf_document[page_number].page_content
 
+def is_text_based_pdf(filename):
+    '''
+    Return True if any page of the PDF at `filename` yields text via get_text().
+    The document is closed on exit, so the caller can delete the file afterwards.
+    '''
+    with fitz.open(filename) as doc:
+        # Stops at the first page with non-empty text; False if no page has any.
+        return any(page.get_text() for page in doc)
 
 class PdfSplitHandle(BaseSplitHandle):
-    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer,save_image):
+    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
+        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+            # 将上传的文件保存到临时文件中
+            for chunk in file.chunks():
+                temp_file.write(chunk)
+            # 获取临时文件的路径
+            temp_file_path = temp_file.name
+
         try:
-            buffer = get_buffer(file)
-            pdf_document = fitz.open(file.name, buffer)
+            if is_text_based_pdf(temp_file_path):
+                loader = PyPDFLoader(temp_file_path, extract_images=False)
+            else:
+                loader = PyPDFLoader(temp_file_path, extract_images=True)
+            pdf_document = loader.load()
             content = "\n".join([number_to_text(pdf_document, page_number) for page_number in range(len(pdf_document))])
             if pattern_list is not None and len(pattern_list) > 0:
                 split_model = SplitModel(pattern_list, with_filter, limit)
@@ -42,6 +61,10 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
         except BaseException as e:
             return {'name': file.name,
                     'content': []}
+        finally:
+            # 处理完后可以删除临时文件
+            os.remove(temp_file_path)
+
         return {'name': file.name,
                 'content': split_model.parse(content)
                 }

diff --git a/pyproject.toml b/pyproject.toml
@@ -32,6 +32,8 @@ langchain-openai = "^0.1.8"
 django-ipware = "^6.0.4"
 django-apscheduler = "^0.6.2"
 pymupdf = "1.24.1"
+pypdf = "4.3.1"
+rapidocr-onnxruntime = "1.3.24"
 python-docx = "^1.1.0"
 xlwt = "^1.3.0"
 dashscope = "^1.17.0"