Skip to content

Commit

Permalink
fix: 表格数据区分xls和xlsx
Browse files Browse the repository at this point in the history
  • Loading branch information
liuruibin committed Sep 12, 2024
1 parent c58635e commit 746f587
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 6 deletions.
47 changes: 47 additions & 0 deletions apps/common/handle/impl/table/xls_parse_table_handle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# coding=utf-8
import logging

import xlrd

from common.handle.base_parse_table_handle import BaseParseTableHandle

max_kb = logging.getLogger("max_kb")


class XlsSplitHandle(BaseParseTableHandle):
    """Table handler that turns a legacy ``.xls`` workbook into paragraphs.

    Each sheet becomes one ``{'name': ..., 'paragraphs': [...]}`` entry. The
    first row of a sheet is used as the column titles and every following row
    becomes one paragraph whose content is ``"title: value; title: value"``.
    """

    def support(self, file, get_buffer):
        """Return True when ``file`` is named ``*.xls`` and xlrd recognises it.

        The extension is checked first so that the buffer of files this
        handler can never accept is not read at all.

        :param file: uploaded file object exposing a ``name`` attribute.
        :param get_buffer: callable returning the raw bytes of ``file``.
        :return: ``True`` if this handler should parse the file.
        """
        file_name: str = file.name.lower()
        if not file_name.endswith(".xls"):
            return False
        # inspect_format yields a falsy value when the content's format
        # cannot be determined.
        return bool(xlrd.inspect_format(content=get_buffer(file)))

    @staticmethod
    def _format_row(titles, row):
        """Join the non-empty cells of ``row`` as ``"title: value"`` pairs."""
        parts = []
        for index, cell in enumerate(row):
            # NOTE(review): this skips every falsy cell, which also drops a
            # numeric 0 — kept as-is to match the sibling xlsx handler.
            if not cell:
                continue
            title = str(titles[index]) if index < len(titles) else ""
            parts.append(title + (": " if title else "") + str(cell))
        return "; ".join(parts)

    def handle(self, file, get_buffer, save_image):
        """Parse every sheet of the workbook into a list of paragraphs.

        :param file: uploaded file object exposing a ``name`` attribute.
        :param get_buffer: callable returning the raw bytes of ``file``.
        :param save_image: unused by this handler; accepted to keep the
            handler signature uniform.
        :return: list of ``{'name': sheet_name, 'paragraphs': [...]}``; on a
            parsing error a single entry named after the file with no
            paragraphs.
        """
        buffer = get_buffer(file)
        try:
            wb = xlrd.open_workbook(file_contents=buffer)
            result = []
            for sheet in wb.sheets():
                # An empty sheet has no title row. The previous guard tested
                # the truthiness of an iterator (always True), so next()
                # raised StopIteration and the whole workbook was discarded.
                if sheet.nrows == 0:
                    continue
                titles = sheet.row_values(0)
                # Sheets with a custom name get that name appended to every
                # paragraph; default "SheetN"-style names are left out.
                suffix = ""
                if "sheet" not in sheet.name.lower():
                    suffix = " ——" + sheet.name
                paragraphs = []
                for row_index in range(1, sheet.nrows):
                    content = self._format_row(titles, sheet.row_values(row_index))
                    paragraphs.append({'title': '', 'content': content + suffix})
                result.append({'name': sheet.name, 'paragraphs': paragraphs})
        except Exception as e:
            # Best effort: a workbook that cannot be parsed yields an empty
            # document instead of failing the upload. Exception (rather than
            # BaseException) lets KeyboardInterrupt/SystemExit propagate.
            max_kb.error(f'excel split handle error: {e}')
            return [{'name': file.name, 'paragraphs': []}]
        return result
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@
max_kb = logging.getLogger("max_kb")


class ExcelSplitHandle(BaseParseTableHandle):
class XlsxSplitHandle(BaseParseTableHandle):
def support(self, file, get_buffer):
file_name: str = file.name.lower()
if file_name.endswith('.xls') or file_name.endswith('.xlsx'):
if file_name.endswith('.xlsx'):
return True
return False

Expand All @@ -34,13 +34,11 @@ def handle(self, file, get_buffer, save_image):
if not rows: continue
ti = list(rows[0])
for r in list(rows[1:]):
title = []
l = []
for i, c in enumerate(r):
if not c.value:
continue
t = str(ti[i].value) if i < len(ti) else ""
title.append(t)
content = str(c.value)
image = image_dict.get(content, None)
if image is not None:
Expand Down
5 changes: 3 additions & 2 deletions apps/dataset/serializers/document_serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@
from common.handle.impl.qa.xls_parse_qa_handle import XlsParseQAHandle
from common.handle.impl.qa.xlsx_parse_qa_handle import XlsxParseQAHandle
from common.handle.impl.table.csv_parse_table_handle import CsvSplitHandle
from common.handle.impl.table.excel_parse_table_handle import ExcelSplitHandle
from common.handle.impl.table.xlsx_parse_table_handle import XlsxSplitHandle
from common.handle.impl.table.xls_parse_table_handle import XlsSplitHandle
from common.handle.impl.text_split_handle import TextSplitHandle
from common.mixins.api_mixin import ApiMixin
from common.util.common import post, flat_map
Expand All @@ -53,7 +54,7 @@
from smartdoc.conf import PROJECT_DIR

parse_qa_handle_list = [XlsParseQAHandle(), CsvParseQAHandle(), XlsxParseQAHandle()]
parse_table_handle_list = [CsvSplitHandle(), ExcelSplitHandle()]
parse_table_handle_list = [CsvSplitHandle(), XlsSplitHandle(), XlsxSplitHandle()]


class FileBufferHandle:
Expand Down

0 comments on commit 746f587

Please sign in to comment.