refactor: CC XML parsing moved to a separate module

raccoongang · Jan 10, 2025 · commit 0a321da
1 parent 1467d38


Showing 14 changed files with 634 additions and 227 deletions.
diff --git a/src/cc2olx/constants.py b/src/cc2olx/constants.py
@@ -3,9 +3,6 @@
 WEB_RESOURCES_DIR_NAME = "web_resources"
 
 LINK_HTML = "<a href='{url}'>{text}</a>"
-WEB_LINK_NAMESPACE = (
-    "http://www.imsglobal.org/xsd/imsccv{major_version}p{minor_version}/imswl_v{major_version}p{minor_version}"
-)
 YOUTUBE_LINK_PATTERN = r"youtube.com/watch\?v=(?P<video_id>[-\w]+)"
 CDATA_PATTERN = r"<!\[CDATA\[(?P<content>.*?)\]\]>"
 

diff --git a/src/cc2olx/content_parsers/discussion.py b/src/cc2olx/content_parsers/discussion.py
@@ -12,18 +12,11 @@ class DiscussionContentParser(AbstractContentParser):
     Discussion resource content parser.
     """
 
-    NAMESPACES = {
-        "imsdt_xmlv1p1": "http://www.imsglobal.org/xsd/imsccv1p1/imsdt_v1p1",
-        "imsdt_xmlv1p2": "http://www.imsglobal.org/xsd/imsccv1p2/imsdt_v1p2",
-        "imsdt_xmlv1p3": "http://www.imsglobal.org/xsd/imsccv1p3/imsdt_v1p3",
-    }
-
     def _parse_content(self, idref: Optional[str]) -> Optional[Dict[str, str]]:
         if idref:
             if resource := self._cartridge.define_resource(idref):
                 if re.match(CommonCartridgeResourceType.DISCUSSION_TOPIC, resource["type"]):
-                    data = self._parse_discussion(resource)
-                    return data
+                    return self._parse_discussion(resource)
         return None
 
     def _parse_discussion(self, resource: dict) -> Dict[str, str]:
@@ -42,9 +35,10 @@ def _parse_resource_file_data(self, resource_file: ResourceFile, resource_type:
         """
         Parse the discussion resource file.
         """
-        tree = filesystem.get_xml_tree(self._cartridge.build_res_file_path(resource_file.href))
+        tree = filesystem.get_xml_tree(self._cartridge.build_resource_file_path(resource_file.href))
         root = tree.getroot()
-        ns = {"dt": self.NAMESPACES[resource_type]}
-        title = root.find("dt:title", ns).text
-        text = root.find("dt:text", ns).text
-        return {"title": title, "text": text}
+
+        return {
+            "title": root.get_title(resource_type).text,
+            "text": root.get_text(resource_type).text,
+        }
diff --git a/src/cc2olx/content_parsers/html.py b/src/cc2olx/content_parsers/html.py
@@ -43,41 +43,41 @@ def _parse_webcontent(self, idref: str, resource: dict) -> Dict[str, str]:
         """
         Parse the resource with "webcontent" type.
         """
-        res_file = resource["children"][0]
-        res_relative_link = res_file.href
-        res_file_path = self._cartridge.build_res_file_path(res_relative_link)
-
-        if res_file_path.suffix == HTML_FILENAME_SUFFIX:
-            content = self._parse_webcontent_html_file(idref, res_file_path)
-        elif WEB_RESOURCES_DIR_NAME in str(res_file_path) and imghdr.what(str(res_file_path)):
-            content = self._parse_image_webcontent_from_web_resources_dir(res_file_path)
-        elif WEB_RESOURCES_DIR_NAME not in str(res_file_path):
-            content = self._parse_webcontent_outside_web_resources_dir(res_relative_link)
+        resource_file = resource["children"][0]
+        resource_relative_link = resource_file.href
+        resource_file_path = self._cartridge.build_resource_file_path(resource_relative_link)
+
+        if resource_file_path.suffix == HTML_FILENAME_SUFFIX:
+            content = self._parse_webcontent_html_file(idref, resource_file_path)
+        elif WEB_RESOURCES_DIR_NAME in str(resource_file_path) and imghdr.what(str(resource_file_path)):
+            content = self._parse_image_webcontent_from_web_resources_dir(resource_file_path)
+        elif WEB_RESOURCES_DIR_NAME not in str(resource_file_path):
+            content = self._parse_webcontent_outside_web_resources_dir(resource_relative_link)
         else:
-            logger.info("Skipping webcontent: %s", res_file_path)
+            logger.info("Skipping webcontent: %s", resource_file_path)
             content = self.DEFAULT_CONTENT
 
         return content
 
     @staticmethod
-    def _parse_webcontent_html_file(idref: str, res_file_path: Path) -> Dict[str, str]:
+    def _parse_webcontent_html_file(idref: str, resource_file_path: Path) -> Dict[str, str]:
         """
         Parse webcontent HTML file.
         """
         try:
-            with open(res_file_path, encoding="utf-8") as res_file:
-                html = res_file.read()
+            with open(resource_file_path, encoding="utf-8") as resource_file:
+                html = resource_file.read()
         except:  # noqa: E722
-            logger.error("Failure reading %s from id %s", res_file_path, idref)  # noqa: E722
+            logger.error("Failure reading %s from id %s", resource_file_path, idref)  # noqa: E722
             raise
         return {"html": html}
 
     @staticmethod
-    def _parse_image_webcontent_from_web_resources_dir(res_file_path: Path) -> Dict[str, str]:
+    def _parse_image_webcontent_from_web_resources_dir(resource_file_path: Path) -> Dict[str, str]:
         """
         Parse webcontent image from "web_resources" directory.
         """
-        static_filename = str(res_file_path).split(f"{WEB_RESOURCES_DIR_NAME}/")[1]
+        static_filename = str(resource_file_path).split(f"{WEB_RESOURCES_DIR_NAME}/")[1]
         olx_static_path = OLX_STATIC_PATH_TEMPLATE.format(static_filename=static_filename)
         image_webcontent_tpl_path = settings.TEMPLATES_DIR / "image_webcontent.html"
 
@@ -87,19 +87,19 @@ def _parse_image_webcontent_from_web_resources_dir(res_file_path: Path) -> Dict[
 
         return {"html": html}
 
-    def _parse_webcontent_outside_web_resources_dir(self, res_relative_path: str) -> Dict[str, str]:
+    def _parse_webcontent_outside_web_resources_dir(self, resource_relative_path: str) -> Dict[str, str]:
         """
         Parse webcontent located outside "web_resources" directory.
         """
         # This webcontent is outside ``web_resources`` directory
         # So we need to manually copy it to OLX_STATIC_DIR
-        self._cartridge.add_extra_static_file(res_relative_path)
-        olx_static_path = OLX_STATIC_PATH_TEMPLATE.format(static_filename=res_relative_path)
+        self._cartridge.add_extra_static_file(resource_relative_path)
+        olx_static_path = OLX_STATIC_PATH_TEMPLATE.format(static_filename=resource_relative_path)
         external_webcontent_tpl_path = settings.TEMPLATES_DIR / "external_webcontent.html"
 
         with open(external_webcontent_tpl_path, encoding="utf-8") as external_webcontent_tpl:
             tpl_content = external_webcontent_tpl.read()
-            html = tpl_content.format(olx_static_path=olx_static_path, res_relative_path=res_relative_path)
+            html = tpl_content.format(olx_static_path=olx_static_path, resource_relative_path=resource_relative_path)
 
         return {"html": html}
 

diff --git a/src/cc2olx/content_parsers/lti.py b/src/cc2olx/content_parsers/lti.py
@@ -1,24 +1,18 @@
 import re
 from typing import Dict, Optional
 
-from lxml import etree
-
 from cc2olx import filesystem
 from cc2olx.content_parsers import AbstractContentParser
 from cc2olx.enums import CommonCartridgeResourceType
 from cc2olx.utils import simple_slug
+from cc2olx.xml import cc_xml
 
 
 class LtiContentParser(AbstractContentParser):
     """
     LTI resource content parser.
     """
 
-    NAMESPACES = {
-        "blti": "http://www.imsglobal.org/xsd/imsbasiclti_v1p0",
-        "lticp": "http://www.imsglobal.org/xsd/imslticp_v1p0",
-        "lticm": "http://www.imsglobal.org/xsd/imslticm_v1p0",
-    }
     DEFAULT_WIDTH = "500"
     DEFAULT_HEIGHT = "500"
 
@@ -38,57 +32,54 @@ def _parse_lti(self, resource: dict) -> dict:
         """
         Parse LTI resource.
         """
-        res_file = resource["children"][0]
-        res_file_path = self._cartridge.build_res_file_path(res_file.href)
-        tree = filesystem.get_xml_tree(res_file_path)
+        resource_file = resource["children"][0]
+        resource_file_path = self._cartridge.build_resource_file_path(resource_file.href)
+        tree = filesystem.get_xml_tree(resource_file_path)
         root = tree.getroot()
-        title = root.find("blti:title", self.NAMESPACES).text
-        description = root.find("blti:description", self.NAMESPACES).text
-        data = {
+        title = root.title.text
+
+        return {
             "title": title,
-            "description": description,
+            "description": root.description.text,
             "launch_url": self._parse_launch_url(root),
             "height": self._parse_height(root),
             "width": self._parse_width(root),
             "custom_parameters": self._parse_custom_parameters(root),
             "lti_id": self._parse_lti_id(root, title),
         }
-        return data
 
-    def _parse_launch_url(self, resource_root: etree._Element) -> str:
+    def _parse_launch_url(self, resource_root: cc_xml.BasicLtiLink) -> str:
         """
         Parse URL to launch LTI.
         """
-        if (launch_url := resource_root.find("blti:secure_launch_url", self.NAMESPACES)) is None:
-            launch_url = resource_root.find("blti:launch_url", self.NAMESPACES)
+        if (launch_url := resource_root.secure_launch_url) is None:
+            launch_url = resource_root.launch_url
         return getattr(launch_url, "text", "")
 
-    def _parse_width(self, resource_root: etree._Element) -> str:
+    def _parse_width(self, resource_root: cc_xml.BasicLtiLink) -> str:
         """
         Parse width.
         """
-        width = resource_root.find("blti:extensions/lticm:property[@name='selection_width']", self.NAMESPACES)
-        return getattr(width, "text", self.DEFAULT_WIDTH)
+        return getattr(resource_root.width, "text", self.DEFAULT_WIDTH)
 
-    def _parse_height(self, resource_root: etree._Element) -> str:
+    def _parse_height(self, resource_root: cc_xml.BasicLtiLink) -> str:
         """
         Parse height.
         """
-        height = resource_root.find("blti:extensions/lticm:property[@name='selection_height']", self.NAMESPACES)
-        return getattr(height, "text", self.DEFAULT_HEIGHT)
+        return getattr(resource_root.height, "text", self.DEFAULT_HEIGHT)
 
-    def _parse_custom_parameters(self, resource_root: etree._Element) -> Dict[str, str]:
+    def _parse_custom_parameters(self, resource_root: cc_xml.BasicLtiLink) -> Dict[str, str]:
         """
         Parse custom parameters.
         """
-        custom = resource_root.find("blti:custom", self.NAMESPACES)
+        custom = resource_root.custom
         return {} if custom is None else {option.get("name"): option.text for option in custom}
 
-    def _parse_lti_id(self, resource_root: etree._Element, title: str) -> str:
+    def _parse_lti_id(self, resource_root: cc_xml.BasicLtiLink, title: str) -> str:
         """
         Parse LTI identifier.
 
         For Canvas flavored CC, tool_id is used as lti_id if present.
         """
-        tool_id = resource_root.find("blti:extensions/lticm:property[@name='tool_id']", self.NAMESPACES)
+        tool_id = resource_root.canvas_tool_id
         return simple_slug(title) if tool_id is None else tool_id.text
diff --git a/src/cc2olx/content_parsers/mixins.py b/src/cc2olx/content_parsers/mixins.py
@@ -2,7 +2,6 @@
 from typing import Dict, Optional
 
 from cc2olx import filesystem
-from cc2olx.constants import WEB_LINK_NAMESPACE
 from cc2olx.enums import CommonCartridgeResourceType
 from cc2olx.models import Cartridge
 
@@ -18,24 +17,14 @@ def _parse_web_link_content(self, resource: dict) -> Optional[Dict[str, str]]:
         """
         Provide Web Link resource data.
         """
-        if web_link_match := re.match(CommonCartridgeResourceType.WEB_LINK, resource["type"]):
-            res_file = resource["children"][0]
-            res_file_path = self._cartridge.build_res_file_path(res_file.href)
-            tree = filesystem.get_xml_tree(res_file_path)
+        resource_type = resource["type"]
+        if re.match(CommonCartridgeResourceType.WEB_LINK, resource_type):
+            resource_file = resource["children"][0]
+            resource_file_path = self._cartridge.build_resource_file_path(resource_file.href)
+            tree = filesystem.get_xml_tree(resource_file_path)
             root = tree.getroot()
-            ns = self._build_web_link_namespace(web_link_match)
-            title = root.find("wl:title", ns).text
-            url = root.find("wl:url", ns).get("href")
-            return {"href": url, "text": title}
+            return {
+                "href": root.get_url(resource_type).get("href"),
+                "text": root.get_title(resource_type).text,
+            }
         return None
-
-    @staticmethod
-    def _build_web_link_namespace(web_link_match: re.Match) -> Dict[str, str]:
-        """
-        Build Web Link namespace.
-        """
-        web_link = WEB_LINK_NAMESPACE.format(
-            major_version=web_link_match.group("major_version"),
-            minor_version=web_link_match.group("minor_version"),
-        )
-        return {"wl": web_link}