fix: update html_parser for <ul> and <b>/<strong> for rtl dir (#4093)

openedx · Sep 20, 2023 · 8a19585 · 8a19585
1 parent 637416d
commit 8a19585
Show file tree

Hide file tree

Showing 2 changed files with 47 additions and 4 deletions.
diff --git a/course_discovery/apps/course_metadata/tests/test_utils.py b/course_discovery/apps/course_metadata/tests/test_utils.py
@@ -654,6 +654,13 @@ class UtilsTests(TestCase):
         # Make sure to add dir attribute to p tags if they are in attribute list
         ('<p dir="rtl" class="float">Directed paragraph</p>', '<p dir="rtl">Directed paragraph</p>'),
 
+        # Check for ul and ol tags with dir attribute
+        ('<ul dir="rtl"><li>Directed list item</li></ul>', '<ul dir="rtl">\n<li>Directed list item</li>\n</ul>'),
+        ('<ol dir="rtl"><li>Directed list item</li></ol>', '<ol dir="rtl">\n<li>Directed list item</li>\n</ol>'),
+
+        # Make sure text remains bold if p tag has rtl direction
+        ('<p dir="rtl"><strong>Directed paragraph</strong></p>', '<p dir="rtl"><strong>Directed paragraph</strong></p>'),
+
         # Make sure that only spans with lang tags are preserved in the saved string
         ('<p><span lang="en">with lang</span></p>', '<p><span lang="en">with lang</span></p>'),
         ('<p><span class="body" lang="en">lang and class</span></p>', '<p><span lang="en">lang and class</span></p>'),

diff --git a/course_discovery/apps/course_metadata/utils.py b/course_discovery/apps/course_metadata/utils.py
@@ -693,7 +693,25 @@ def whitelist_html_tags_attribute(self, tag, dict_attrs, start):
                 self.outtextf(f'</{tag}>')
 
     def handle_tag(self, tag, attrs, start):
-        super().handle_tag(tag, attrs, start)
+        """
+        This method overrides the default html2text behavior for <span> tags (adding 'lang' attribute)
+        and <p> tags (adding 'dir' attribute). Additionally, within <p dir="rtl"> tags, it retains the original text
+        format to ensure consistent handling, addressing an issue where inner HTML content was not converted back to
+        HTML.
+        """
+        if not self.is_p_tag_with_dir:
+            super().handle_tag(tag, attrs, start)
+
+        elif tag not in HTML_TAGS_ATTRIBUTE_WHITELIST:
+            if start:
+                self.outtextf(f'<{tag}')
+                if attrs:
+                    self.outtextf(' ')
+                    self.outtextf(' '.join(f'{attr}="{value}"' for attr, value in attrs))
+                self.outtextf('>')
+            else:
+                self.outtextf(f'</{tag}>')
+
         if tag == 'span':
             if attrs and start and 'lang' in dict(attrs):
                 self.outtextf(f'<span lang="{dict(attrs)["lang"]}">')
@@ -705,13 +723,29 @@ def handle_tag(self, tag, attrs, start):
 
 
 def clean_html(content):
-    """Cleans HTML from a string.
+    """
+    Cleans HTML from a string.
 
     This method converts the HTML to a Markdown string (to remove styles, classes, and other unsupported
-    attributes), and converts the Markdown back to HTML.
+    attributes), and converts the Markdown back to HTML. This is done to ensure that the HTML is as clean as
+    possible, and that it is consistent with the HTML that is generated by the tinymce editor.
+
+    Additionally, if any <ul> or <ol> tag carries the 'dir' attribute with the value 'rtl' (indicating
+    right-to-left direction), the attribute is removed before conversion and dir="rtl" is re-added afterwards.
+    Note that it is re-added to every bare <ul> and <ol> tag in the output, not only the originally directed ones.
     """
+    LIST_TAGS = ['ul', 'ol']
+    is_list_with_dir_attr_present = False
+
     cleaned = content.replace('&nbsp;', '')  # Keeping the removal of nbsps for historical consistency
-    cleaned = str(BeautifulSoup(cleaned, 'lxml'))
+    # Parse the HTML using BeautifulSoup
+    soup = BeautifulSoup(cleaned, 'lxml')
+
+    for tag in soup.find_all(LIST_TAGS, dir="rtl"):
+        tag.attrs.pop('dir')
+        is_list_with_dir_attr_present = True
+
+    cleaned = str(soup)
     # Need to re-replace the · middot with the entity so that html2text can transform it to * for <ul> in markdown
     cleaned = cleaned.replace('·', '&middot;')
     # Need to clean empty <b> and <p> tags which are converted to <hr/> by html2text
@@ -720,6 +754,8 @@ def clean_html(content):
     html_converter.wrap_links = False
     cleaned = html_converter.handle(cleaned).strip()
     cleaned = markdown.markdown(cleaned)
+    for tag in LIST_TAGS:
+        cleaned = cleaned.replace(f'<{tag}>', f'<{tag} dir="rtl">') if is_list_with_dir_attr_present else cleaned
 
     return cleaned