Skip to content

Commit

Permalink
fix: 修复导入word文档,有的图片导入不进去
Browse files Browse the repository at this point in the history
  • Loading branch information
shaohuzhang1 committed Oct 28, 2024
1 parent e01bce1 commit e539038
Showing 1 changed file with 35 additions and 10 deletions.
45 changes: 35 additions & 10 deletions apps/common/handle/impl/doc_split_handle.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import re
import traceback
import uuid
from functools import reduce
from typing import List

from docx import Document, ImagePart
Expand All @@ -31,6 +32,7 @@
# Prefix map for VML markup; 'v' is the prefix used when querying legacy-style pictures.
old_docx_nsmap = {'v': 'urn:schemas-microsoft-com:vml'}
# ns.nsmap entries plus the VML prefix above (the VML entry wins if a prefix is defined in both).
combine_nsmap = {**ns.nsmap, **old_docx_nsmap}


def image_to_mode(image, doc: Document, images_list, get_image_id, is_new_docx=True):
if is_new_docx:
image_ids = image.xpath('.//a:blip/@r:embed')
Expand All @@ -46,18 +48,31 @@ def image_to_mode(image, doc: Document, images_list, get_image_id, is_new_docx=T
return f'![](/api/image/{image_uuid})'


def get_paragraph_element_images(paragraph_element, doc: Document, images_list, get_image_id):
    """Collect the picture nodes found beneath ``paragraph_element``.

    Two XPath queries are run against the element, ``.//pic:pic`` first and
    ``.//w:pict`` second, and every match is returned in a single list in that
    order. The lookup is best effort: if a query raises, that query simply
    contributes nothing and the next one is still tried.

    :param paragraph_element: object exposing an ``xpath(expression)`` method
    :param doc: accepted for signature parity with sibling helpers; unused here
    :param images_list: unused here
    :param get_image_id: unused here
    :return: list of matched nodes (empty when nothing matched)
    """
    found = []
    for query in (".//pic:pic", ".//w:pict"):
        try:
            matches = paragraph_element.xpath(query)
            if matches is None or len(matches) == 0:
                continue
            found.extend(matches)
        except Exception:
            # Deliberately tolerant: a failing query must not abort the scan.
            continue
    return found


def images_to_string(images, doc: Document, images_list, get_image_id):
    """Render picture nodes to one concatenated markdown string.

    Each node is passed to ``image_to_mode``; results that are ``None`` are
    dropped and the remaining strings are joined without a separator.

    ``image_to_mode`` takes an ``is_new_docx`` flag that defaults to ``True``,
    and in that mode it looks up ``a:blip`` embed ids. Callers may hand this
    function legacy ``w:pict`` nodes as well as ``pic:pic`` nodes, so the flag
    is chosen per node instead of always using the default; otherwise legacy
    pictures would be looked up with the wrong query and yield nothing.

    :param images: iterable of picture nodes (``pic:pic`` and/or ``w:pict``)
    :param doc: document the pictures belong to, forwarded to ``image_to_mode``
    :param images_list: forwarded to ``image_to_mode``
    :param get_image_id: forwarded to ``image_to_mode``
    :return: concatenated markdown, ``""`` when nothing could be rendered
    """
    rendered = []
    for image in images:
        # A namespaced element tag looks like "{namespace-uri}pict"; only the
        # local name matters for telling legacy pictures apart.
        # NOTE(review): assumes image_to_mode's is_new_docx=False branch handles
        # w:pict nodes (that branch is outside this view) — confirm.
        is_legacy_picture = str(getattr(image, "tag", "")).endswith("}pict")
        markdown = image_to_mode(image, doc, images_list, get_image_id,
                                 is_new_docx=not is_legacy_picture)
        if markdown is not None:
            rendered.append(markdown)
    return "".join(rendered)


def get_paragraph_element_txt(paragraph_element, doc: Document, images_list, get_image_id):
try:
images = paragraph_element.xpath(".//pic:pic")
old_docx_images = paragraph_element.xpath(".//w:pict")
images = get_paragraph_element_images(paragraph_element, doc, images_list, get_image_id)
if len(images) > 0:
return "".join(
[item for item in [image_to_mode(image, doc, images_list, get_image_id) for image in images] if
item is not None])
elif len(old_docx_images) > 0:
return "".join(
[item for item in [image_to_mode(image, doc, images_list, get_image_id, is_new_docx=False) for image in old_docx_images] if
item is not None])
return images_to_string(images, doc, images_list, get_image_id)
elif paragraph_element.text is not None:
return paragraph_element.text
return ""
Expand Down Expand Up @@ -101,8 +116,18 @@ def paragraph_to_md(paragraph: Paragraph, doc: Document, images_list, get_image_
try:
psn = paragraph.style.name
if psn.startswith('Heading'):
return "".join(["#" for i in range(int(psn.replace("Heading ", '')))]) + " " + paragraph.text
title = "".join(["#" for i in range(int(psn.replace("Heading ", '')))]) + " " + paragraph.text
images = reduce(lambda x, y: [*x, *y],
[get_paragraph_element_images(e, doc, images_list, get_image_id) for e in
paragraph._element],
[])

if len(images) > 0:
return title + '\n' + images_to_string(images, doc, images_list, get_image_id) if len(
paragraph.text) > 0 else images_to_string(images, doc, images_list, get_image_id)
return title
except Exception as e:
traceback.print_exc()
return paragraph.text
return get_paragraph_txt(paragraph, doc, images_list, get_image_id)

Expand Down

0 comments on commit e539038

Please sign in to comment.