Skip to content

Commit

Permalink
When rendering html to text, pick links from <img> tags if present
Browse files Browse the repository at this point in the history
  • Loading branch information
user committed Apr 18, 2024
1 parent 9d88a77 commit 49d78aa
Showing 1 changed file with 10 additions and 1 deletion.
11 changes: 10 additions & 1 deletion avtdl/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,15 @@ def html_to_text(html: str) -> str:
"""take html fragment, try to parse it and extract text values using lxml"""
try:
root = lxml.html.fromstring(html)

# text_content() skips <img> content altogether, so walk the tree
# manually and, for every image that has a src link, inject that
# link as the element's text so it shows up in the text representation
for elem in root.iter():
if elem.tag == 'img':
image_link = elem.get('src')
if image_link is not None:
elem.text = f'\n{image_link}\n'
text = root.text_content()
return text
except Exception as e:
Expand Down Expand Up @@ -469,4 +478,4 @@ def get_cookie_value(jar: aiohttp.CookieJar, key: str) -> Optional[str]:
for morsel in jar:
if morsel.key == key:
return morsel.value
return None
return None

0 comments on commit 49d78aa

Please sign in to comment.