From 4d9355dec7ebf9cbe78bccacdd00e7bd58422263 Mon Sep 17 00:00:00 2001 From: CaptainB Date: Fri, 16 Aug 2024 12:57:06 +0800 Subject: [PATCH] =?UTF-8?q?refactor:=20=E9=80=90=E9=A1=B5=E5=8A=A0?= =?UTF-8?q?=E8=BD=BDpdf,=20=E5=9B=BE=E7=89=87=E7=B1=BB=E5=9E=8B=E5=8D=95?= =?UTF-8?q?=E7=8B=AC=E4=BF=9D=E5=AD=98=E6=88=90=E6=96=87=E4=BB=B6=E5=8A=A0?= =?UTF-8?q?=E8=BD=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/common/handle/impl/pdf_split_handle.py | 54 +++++++++++---------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/apps/common/handle/impl/pdf_split_handle.py b/apps/common/handle/impl/pdf_split_handle.py index e4bac8d08ca..ecf4880bd59 100644 --- a/apps/common/handle/impl/pdf_split_handle.py +++ b/apps/common/handle/impl/pdf_split_handle.py @@ -9,7 +9,7 @@ import re from typing import List -import pypdf +from pypdf import PdfReader, PdfWriter import os import tempfile from langchain_community.document_loaders import PyPDFLoader @@ -17,6 +17,8 @@ from common.handle.base_split_handle import BaseSplitHandle from common.util.split_model import SplitModel +import time + default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'), re.compile('(?<=\\n)(? 0: split_model = SplitModel(pattern_list, with_filter, limit) else: split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit) + + except BaseException as e: return {'name': file.name, 'content': []}