improve nimbus note conversion (by adding html preprocessors)

marph91 · Oct 18, 2024 · d28bdc9
1 parent ae77229
commit d28bdc9
Show file tree

Hide file tree

Showing 2 changed files with 31 additions and 3 deletions.
diff --git a/src/formats/nimbus_note.py b/src/formats/nimbus_note.py
@@ -3,12 +3,33 @@
 import base64
 from pathlib import Path
 
+from bs4 import BeautifulSoup
+
 import common
 import converter
 import intermediate_format as imf
 import markdown_lib.common
 
 
+def clean_tables(soup):
+    """Strip div/span wrappers inside tables while keeping their contents."""
+    for table in soup.find_all("table"):
+        # find_all() accepts a list of tag names, so one pass covers both.
+        for wrapper in table.find_all(["div", "span"]):
+            wrapper.unwrap()
+
+
+def assign_lists(_soup):
+    """Placeholder for list preprocessing; currently leaves the soup untouched.
+
+    TODO:
+    - All lists are unnumbered lists (ul).
+      - The type is in the class attr (list-item-number, -bullet, -checkbox).
+    - The indentation is in the class attr (indent-0).
+    - Possible start: soup.find_all("ul", class_="checklist") for task lists.
+    """
+
+
 class Converter(converter.BaseConverter):
     accept_folder = True
 
@@ -55,8 +76,15 @@ def convert(self, file_or_folder: Path):
             common.extract_zip(file_, temp_folder=temp_folder_note)
 
             # HTML note seems to have the name "note.html" always
-            note_body_html = (temp_folder_note / "note.html").read_text("utf-8")
-            note_body_markdown = markdown_lib.common.markup_to_markdown(note_body_html)
+            note_body_html = (temp_folder_note / "note.html").read_text(
+                encoding="utf-8"
+            )
+
+            soup = BeautifulSoup(note_body_html, "html.parser")
+            clean_tables(soup)
+            assign_lists(soup)
+
+            note_body_markdown = markdown_lib.common.markup_to_markdown(str(soup))
             resources = self.handle_markdown_links(note_body_markdown, temp_folder_note)
             note_imf = imf.Note(
                 title,

diff --git a/test/data b/test/data
+ −		reference_data/nimbus_note/test_2/7yYuquWJELF0nHg2.png
+ −		reference_data/nimbus_note/test_2/8fbeADk4RfQEvPIw.png
+ −		reference_data/nimbus_note/test_2/CtHwoda57FsWnKMk.png
+87 −0		reference_data/nimbus_note/test_2/New_page_in_new_folder.md
+ −		reference_data/nimbus_note/test_2/Riblw4qWTJHm3OHJ.png
+139 −0		reference_data/nimbus_note/test_2/Start_with_FuseBase_Here_Skyrocket_Your_Workflow.md
+3 −0		reference_data/nimbus_note/test_2/XfOyFr3jQ92lni7u.txt
+ −		reference_data/nimbus_note/test_2/dEHnSD9DuIf1tZ97.png
+ −		reference_data/nimbus_note/test_2/e1GHfrAAh4S96o7k.png
+ −		reference_data/nimbus_note/test_2/fhV0eOXTt9oJe7Zt.png
+ −		reference_data/nimbus_note/test_2/i6eQ1mLFg6Inx7md.png
+1 −0		reference_data/nimbus_note/test_2/unnamed_17fc695a07a04a6e8822e8f36c031199.svg
+1 −0		reference_data/nimbus_note/test_2/unnamed_23b8c1e9392446debeb13b9046685257.svg
+1 −0		reference_data/nimbus_note/test_2/unnamed_972a846916414f828b9d2434e465e150.svg
+1 −0		reference_data/nimbus_note/test_2/unnamed_bd9c66b3ad3c4d6d9a3d1fa7bc8960a9.svg
+1 −0		reference_data/nimbus_note/test_2/unnamed_bdd640fb06674ad19c80317fa3b1799d.svg
+ −		test_data/nimbus_note/test_2/Demo Workspace/New_page_in_new_folder.zip
+ −		test_data/nimbus_note/test_2/Demo Workspace/Start_with_FuseBase_Here_Skyrocket_Your_Workflow.zip