From 7a3e9223caf5ff221798bcc4b6d61f8b3ad926bf Mon Sep 17 00:00:00 2001
From: masquare <masquare@users.noreply.github.com>
Date: Mon, 27 Jan 2025 13:18:40 +0100
Subject: [PATCH] feat(pptx): support image description with LLM for pptx files

---
 src/markitdown/_markitdown.py | 62 +++++++++++++++++++++++++++++++----
 1 file changed, 56 insertions(+), 6 deletions(-)

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index ae6a7b4..d130ffe 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -784,6 +784,35 @@ class PptxConverter(HtmlConverter):
     Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
     """
 
+    def _get_llm_description(
+        self, llm_client, llm_model, image_blob, content_type, prompt=None
+    ):
+        if prompt is None or prompt.strip() == "":
+            prompt = "Write a detailed alt text for this image with less than 50 words."
+
+        image_base64 = base64.b64encode(image_blob).decode("utf-8")
+        data_uri = f"data:{content_type};base64,{image_base64}"
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": data_uri,
+                        },
+                    },
+                    {"type": "text", "text": prompt},
+                ],
+            }
+        ]
+
+        response = llm_client.chat.completions.create(
+            model=llm_model, messages=messages
+        )
+        return response.choices[0].message.content
+
     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         # Bail if not a PPTX
         extension = kwargs.get("file_extension", "")
@@ -804,17 +833,38 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
                 # Pictures
                 if self._is_picture(shape):
                     # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
-                    alt_text = ""
-                    try:
-                        alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
-                    except Exception:
-                        pass
+
+                    llm_description = None
+                    alt_text = None
+
+                    llm_client = kwargs.get("llm_client")
+                    llm_model = kwargs.get("llm_model")
+                    if llm_client is not None and llm_model is not None:
+                        try:
+                            llm_description = self._get_llm_description(
+                                llm_client,
+                                llm_model,
+                                shape.image.blob,
+                                shape.image.content_type,
+                            )
+                        except Exception:
+                            # Unable to describe with LLM
+                            pass
+
+                    if not llm_description:
+                        try:
+                            alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
+                                "descr", ""
+                            )
+                        except Exception:
+                            # Unable to get alt text
+                            pass
 
                     # A placeholder name
                     filename = re.sub(r"\W", "", shape.name) + ".jpg"
                     md_content += (
                         "\n!["
-                        + (alt_text if alt_text else shape.name)
+                        + (llm_description or alt_text or shape.name)
                         + "]("
                         + filename
                         + ")\n"