When rendering HTML to text, include image links from <img> tags if present

15532th · Apr 18, 2024 · 49d78aa · 49d78aa
1 parent 9d88a77
commit 49d78aa
Showing 1 changed file with 10 additions and 1 deletion.
diff --git a/avtdl/core/utils.py b/avtdl/core/utils.py
@@ -434,6 +434,15 @@ def html_to_text(html: str) -> str:
     """take html fragment, try to parse it and extract text values using lxml"""
     try:
         root = lxml.html.fromstring(html)
+
+        # text_content() skips <img> elements altogether, so walk the
+        # tree manually and, for every image that has a src attribute,
+        # store that link as the element's text so it shows up in the output
+        for elem in root.iter():
+            if elem.tag == 'img':
+                image_link = elem.get('src')
+                if image_link is not None:
+                    elem.text = f'\n{image_link}\n'
         text = root.text_content()
         return text
     except Exception as e:
@@ -469,4 +478,4 @@ def get_cookie_value(jar: aiohttp.CookieJar, key: str) -> Optional[str]:
     for morsel in jar:
         if morsel.key == key:
             return morsel.value
-    return None
+    return None