Skip to content

Commit

Permalink
fix: update html_parser for <ul> and <b>/<strong> for rtl dir (#4093)
Browse files Browse the repository at this point in the history
  • Loading branch information
AfaqShuaib09 authored Sep 20, 2023
1 parent 637416d commit 8a19585
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 4 deletions.
7 changes: 7 additions & 0 deletions course_discovery/apps/course_metadata/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -654,6 +654,13 @@ class UtilsTests(TestCase):
# Make sure to add dir attribute to p tags if they are in attribute list
('<p dir="rtl" class="float">Directed paragraph</p>', '<p dir="rtl">Directed paragraph</p>'),
# Check for ul and ol tags with dir attribute
('<ul dir="rtl"><li>Directed list item</li></ul>', '<ul dir="rtl">\n<li>Directed list item</li>\n</ul>'),
('<ol dir="rtl"><li>Directed list item</li></ol>', '<ol dir="rtl">\n<li>Directed list item</li>\n</ol>'),
# Make sure text remains bold if p tag has rtl direction
('<p dir="rtl"><strong>Directed paragraph</strong></p>', '<p dir="rtl"><strong>Directed paragraph</strong></p>'),
# Make sure that only spans with lang tags are preserved in the saved string
('<p><span lang="en">with lang</span></p>', '<p><span lang="en">with lang</span></p>'),
('<p><span class="body" lang="en">lang and class</span></p>', '<p><span lang="en">lang and class</span></p>'),
Expand Down
44 changes: 40 additions & 4 deletions course_discovery/apps/course_metadata/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -693,7 +693,25 @@ def whitelist_html_tags_attribute(self, tag, dict_attrs, start):
self.outtextf(f'</{tag}>')

def handle_tag(self, tag, attrs, start):
super().handle_tag(tag, attrs, start)
"""
This method overrides the default html2text behavior for <span> tags (keeping the 'lang' attribute)
and <p> tags (keeping the 'dir' attribute). Additionally, within <p dir="rtl"> tags, it re-emits inner tags
as raw HTML to ensure consistent handling, addressing an issue where inner HTML content was not being
converted back to HTML.
"""
if not self.is_p_tag_with_dir:
super().handle_tag(tag, attrs, start)

elif tag not in HTML_TAGS_ATTRIBUTE_WHITELIST:
if start:
self.outtextf(f'<{tag}')
if attrs:
self.outtextf(' ')
self.outtextf(' '.join(f'{attr}="{value}"' for attr, value in attrs))
self.outtextf('>')
else:
self.outtextf(f'</{tag}>')

if tag == 'span':
if attrs and start and 'lang' in dict(attrs):
self.outtextf(f'<span lang="{dict(attrs)["lang"]}">')
Expand All @@ -705,13 +723,29 @@ def handle_tag(self, tag, attrs, start):


def clean_html(content):
"""Cleans HTML from a string.
"""
Cleans HTML from a string.
This method converts the HTML to a Markdown string (to remove styles, classes, and other unsupported
attributes), and converts the Markdown back to HTML.
attributes), and converts the Markdown back to HTML. This is done to ensure that the HTML is as clean as
possible, and that it is consistent with the HTML that is generated by the tinymce editor.
Additionally, if any list tag (<ul>, <ol>) carries the 'dir' attribute with the value 'rtl'
(indicating right-to-left direction), the attribute is stripped before conversion and then re-applied
to every <ul>/<ol> tag in the output, to maintain consistency with the original content.
"""
LIST_TAGS = ['ul', 'ol']
is_list_with_dir_attr_present = False

cleaned = content.replace('&nbsp;', '') # Keeping the removal of nbsps for historical consistency
cleaned = str(BeautifulSoup(cleaned, 'lxml'))
# Parse the HTML using BeautifulSoup
soup = BeautifulSoup(cleaned, 'lxml')

for tag in soup.find_all(LIST_TAGS, dir="rtl"):
tag.attrs.pop('dir')
is_list_with_dir_attr_present = True

cleaned = str(soup)
# Need to re-replace the · middot with the entity so that html2text can transform it to * for <ul> in markdown
cleaned = cleaned.replace('·', '&middot;')
# Need to clean empty <b> and <p> tags which are converted to <hr/> by html2text
Expand All @@ -720,6 +754,8 @@ def clean_html(content):
html_converter.wrap_links = False
cleaned = html_converter.handle(cleaned).strip()
cleaned = markdown.markdown(cleaned)
for tag in LIST_TAGS:
cleaned = cleaned.replace(f'<{tag}>', f'<{tag} dir="rtl">') if is_list_with_dir_attr_present else cleaned

return cleaned

Expand Down

0 comments on commit 8a19585

Please sign in to comment.