refactor: [FC-0063] Block type processing is refactored

- `attrs` dependency is added
- block type processors are implemented
- block type processors are integrated into the script workflow
raccoongang · Jan 14, 2025 · 1650432 · 1650432
1 parent 1bfea42
commit 1650432
Show file tree

Hide file tree

Showing 59 changed files with 2,714 additions and 1,505 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,6 +1,7 @@
 include LICENSE
 include README.rst
 
+recursive-include src/cc2olx/templates *
 recursive-include requirements *
 recursive-include tests *
 recursive-exclude * __pycache__

diff --git a/pytest.ini b/pytest.ini
@@ -1,3 +1,3 @@
 [pytest]
 usefixtures = chdir_to_workspace
-DJANGO_SETTINGS_MODULE = cc2olx.django_settings
+DJANGO_SETTINGS_MODULE = cc2olx.settings
diff --git a/requirements/base.in b/requirements/base.in
@@ -1,6 +1,7 @@
 # Core requirements for this package
 
 Django
+attrs
 lxml
 requests
 youtube-dl
diff --git a/requirements/base.txt b/requirements/base.txt
@@ -6,6 +6,8 @@
 #
 asgiref==3.8.1
     # via django
+attrs==24.3.0
+    # via -r requirements/base.in
 backports-zoneinfo==0.2.1
     # via django
 certifi==2024.12.14

diff --git a/requirements/ci.txt b/requirements/ci.txt
@@ -8,6 +8,10 @@ asgiref==3.8.1
     # via
     #   -r /home/misha/work/cc2olx/requirements/quality.txt
     #   django
+attrs==24.3.0
+    # via
+    #   -c /home/misha/work/cc2olx/requirements/constraints.txt
+    #   -r /home/misha/work/cc2olx/requirements/quality.txt
 backports-zoneinfo==0.2.1
     # via
     #   -r /home/misha/work/cc2olx/requirements/quality.txt

diff --git a/requirements/constraints.txt b/requirements/constraints.txt
@@ -7,3 +7,5 @@
 # link to other information that will help people in the future to remove the
 # pin when possible.  Writing an issue against the offending project and
 # linking to it here is good.
+
+attrs==24.3.0
diff --git a/requirements/dev.txt b/requirements/dev.txt
@@ -9,6 +9,11 @@ asgiref==3.8.1
     #   -r /home/misha/work/cc2olx/requirements/ci.txt
     #   -r /home/misha/work/cc2olx/requirements/quality.txt
     #   django
+attrs==24.3.0
+    # via
+    #   -c /home/misha/work/cc2olx/requirements/constraints.txt
+    #   -r /home/misha/work/cc2olx/requirements/ci.txt
+    #   -r /home/misha/work/cc2olx/requirements/quality.txt
 backports-tarfile==1.2.0
     # via jaraco-context
 backports-zoneinfo==0.2.1

diff --git a/requirements/quality.txt b/requirements/quality.txt
@@ -8,6 +8,10 @@ asgiref==3.8.1
     # via
     #   -r /home/misha/work/cc2olx/requirements/test.txt
     #   django
+attrs==24.3.0
+    # via
+    #   -c /home/misha/work/cc2olx/requirements/constraints.txt
+    #   -r /home/misha/work/cc2olx/requirements/test.txt
 backports-zoneinfo==0.2.1
     # via
     #   -r /home/misha/work/cc2olx/requirements/test.txt

diff --git a/requirements/test.txt b/requirements/test.txt
@@ -8,6 +8,10 @@ asgiref==3.8.1
     # via
     #   -r /home/misha/work/cc2olx/requirements/base.txt
     #   django
+attrs==24.3.0
+    # via
+    #   -c /home/misha/work/cc2olx/requirements/constraints.txt
+    #   -r /home/misha/work/cc2olx/requirements/base.txt
 backports-zoneinfo==0.2.1
     # via
     #   -r /home/misha/work/cc2olx/requirements/base.txt

diff --git a/setup.py b/setup.py
@@ -25,7 +25,7 @@
         "Programming Language :: Python :: 3.8",
         "Topic :: Utilities",
     ],
-    description=("Command line tool, that converts Common Cartridge " "courses to Open edX Studio imports."),
+    description="Command line tool, that converts Common Cartridge courses to Open edX Studio imports.",
     entry_points={"console_scripts": ["cc2olx=cc2olx.main:main"]},
     install_requires=load_requirements("requirements/base.in"),
     license="GNU Affero General Public License",

diff --git a/src/cc2olx/constants.py b/src/cc2olx/constants.py
@@ -1,3 +1,9 @@
-CDATA_PATTERN = r"<!\[CDATA\[(?P<content>.*?)\]\]>"
 OLX_STATIC_DIR = "static"
 OLX_STATIC_PATH_TEMPLATE = f"/{OLX_STATIC_DIR}/{{static_filename}}"
+# Name of the cartridge directory that holds web resources.
+WEB_RESOURCES_DIR_NAME = "web_resources"
+
+# HTML link template; ``url`` and ``text`` are substituted as is (no escaping).
+LINK_HTML = "<a href='{url}'>{text}</a>"
+# The dot is escaped so that only the literal "youtube.com" host is matched
+# (an unescaped dot would also accept e.g. "youtubeXcom").
+YOUTUBE_LINK_PATTERN = r"youtube\.com/watch\?v=(?P<video_id>[-\w]+)"
+# Non-greedy ``content`` group: every CDATA section is matched separately.
+CDATA_PATTERN = r"<!\[CDATA\[(?P<content>.*?)\]\]>"
+
+QTI_RESPROCESSING_TYPES = ["general_fb", "correct_fb", "general_incorrect_fb"]
diff --git a/src/cc2olx/content_parsers/__init__.py b/src/cc2olx/content_parsers/__init__.py
@@ -0,0 +1,15 @@
+from cc2olx.content_parsers.abc import AbstractContentParser
+from cc2olx.content_parsers.discussion import DiscussionContentParser
+from cc2olx.content_parsers.html import HtmlContentParser
+from cc2olx.content_parsers.lti import LtiContentParser
+from cc2olx.content_parsers.qti import QtiContentParser
+from cc2olx.content_parsers.video import VideoContentParser
+
+__all__ = [
+    "AbstractContentParser",
+    "DiscussionContentParser",
+    "HtmlContentParser",
+    "LtiContentParser",
+    "QtiContentParser",
+    "VideoContentParser",
+]
diff --git a/src/cc2olx/content_parsers/abc.py b/src/cc2olx/content_parsers/abc.py
@@ -0,0 +1,31 @@
+from abc import ABC, abstractmethod
+from typing import Optional, Union
+
+from cc2olx.content_parsers.utils import StaticLinkProcessor
+from cc2olx.dataclasses import ContentParserContext
+from cc2olx.models import Cartridge
+
+
+class AbstractContentParser(ABC):
+    """
+    Base class of the Common Cartridge content parsers.
+
+    Subclasses implement ``_parse_content``; ``parse`` is the entry point that
+    additionally passes truthy parsed content through ``StaticLinkProcessor``.
+    """
+
+    def __init__(self, cartridge: Cartridge, context: ContentParserContext) -> None:
+        """
+        Keep the cartridge and the parsing context for the later parsing.
+        """
+        self._context = context
+        self._cartridge = cartridge
+
+    def parse(self, idref: Optional[str]) -> Optional[Union[list, dict]]:
+        """
+        Parse the resource with the specified identifier.
+
+        A falsy parsing result (e.g. ``None`` or an empty container) is
+        returned untouched; any other result is returned after its static
+        links are processed.
+        """
+        parsed_content = self._parse_content(idref)
+        if not parsed_content:
+            return parsed_content
+
+        static_link_processor = StaticLinkProcessor(self._cartridge, self._context.relative_links_source)
+        return static_link_processor.process_content_static_links(parsed_content)
+
+    @abstractmethod
+    def _parse_content(self, idref: Optional[str]) -> Optional[Union[list, dict]]:
+        """
+        Parse content of the resource with the specified identifier.
+
+        Must be provided by every concrete parser.
+        """
diff --git a/src/cc2olx/content_parsers/discussion.py b/src/cc2olx/content_parsers/discussion.py
@@ -0,0 +1,44 @@
+import re
+from typing import Dict, Optional
+
+from cc2olx import filesystem
+from cc2olx.content_parsers import AbstractContentParser
+from cc2olx.enums import CommonCartridgeResourceType
+from cc2olx.models import ResourceFile
+
+
+class DiscussionContentParser(AbstractContentParser):
+    """
+    Parser of the Common Cartridge discussion topic resources.
+    """
+
+    def _parse_content(self, idref: Optional[str]) -> Optional[Dict[str, str]]:
+        """
+        Parse the resource as a discussion.
+
+        ``None`` is returned when the identifier is empty, the cartridge does
+        not provide the resource or the resource type is not a discussion topic.
+        """
+        if not idref:
+            return None
+
+        resource = self._cartridge.define_resource(idref)
+        if not resource:
+            return None
+
+        if not re.match(CommonCartridgeResourceType.DISCUSSION_TOPIC, resource["type"]):
+            return None
+
+        return self._parse_discussion(resource)
+
+    def _parse_discussion(self, resource: dict) -> Dict[str, str]:
+        """
+        Merge the data parsed from every ``ResourceFile`` child of the resource.
+
+        Children are handled in their original order, so the values parsed
+        from a later file override the ones parsed from an earlier file.
+        """
+        discussion_data = {}
+        resource_files = (child for child in resource["children"] if isinstance(child, ResourceFile))
+
+        for resource_file in resource_files:
+            file_data = self._parse_resource_file_data(resource_file, resource["type"])
+            discussion_data.update(file_data)
+
+        return discussion_data
+
+    def _parse_resource_file_data(self, resource_file: ResourceFile, resource_type: str) -> Dict[str, str]:
+        """
+        Extract the discussion title and text from the resource file.
+        """
+        resource_file_path = self._cartridge.build_resource_file_path(resource_file.href)
+        root = filesystem.get_xml_tree(resource_file_path).getroot()
+
+        # ``get_title``/``get_text`` are provided by the element returned from
+        # ``filesystem.get_xml_tree``; the title is read before the text.
+        title = root.get_title(resource_type).text
+        text = root.get_text(resource_type).text
+
+        return {"title": title, "text": text}
diff --git a/src/cc2olx/content_parsers/html.py b/src/cc2olx/content_parsers/html.py
@@ -0,0 +1,140 @@
+import imghdr
+import logging
+import re
+from pathlib import Path
+from typing import Dict, Optional
+
+from django.conf import settings
+
+from cc2olx.constants import LINK_HTML, OLX_STATIC_PATH_TEMPLATE, WEB_RESOURCES_DIR_NAME
+from cc2olx.content_parsers import AbstractContentParser
+from cc2olx.content_parsers.mixins import WebLinkParserMixin
+from cc2olx.enums import CommonCartridgeResourceType
+
+logger = logging.getLogger()
+
+HTML_FILENAME_SUFFIX = ".html"
+
+
+class HtmlContentParser(WebLinkParserMixin, AbstractContentParser):
+    """
+    HTML resource content parser.
+
+    Converts a Common Cartridge resource into a ``{"html": ...}`` dictionary.
+    ``DEFAULT_CONTENT`` is the placeholder returned when there is nothing to
+    import.
+    """
+
+    # NOTE(review): the very same dictionary object is returned by every call
+    # falling back to the placeholder; presumably callers do not mutate it in
+    # place - confirm.
+    DEFAULT_CONTENT = {"html": "<p>MISSING CONTENT</p>"}
+
+    def _parse_content(self, idref: Optional[str]) -> Dict[str, str]:
+        """
+        Parse the resource with the specified identifier into HTML content.
+
+        The placeholder content is returned when the identifier is empty, the
+        resource is missing in the cartridge or the resource has a known type
+        that is left unprocessed by this parser.
+        """
+        if not idref:
+            return self.DEFAULT_CONTENT
+
+        resource = self._cartridge.define_resource(idref)
+        if resource is None:
+            logger.info("Missing resource: %s", idref)
+            return self.DEFAULT_CONTENT
+
+        if resource["type"] == CommonCartridgeResourceType.WEB_CONTENT:
+            return self._parse_webcontent(idref, resource)
+
+        # ``_parse_web_link_content`` comes from ``WebLinkParserMixin``; a
+        # falsy result means the resource is not treated as a web link.
+        if web_link_content := self._parse_web_link_content(resource):
+            return self._transform_web_link_content_to_html(web_link_content)
+
+        if self.is_known_unprocessed_resource_type(resource["type"]):
+            return self.DEFAULT_CONTENT
+
+        return self._parse_not_imported_content(resource)
+
+    def _parse_webcontent(self, idref: str, resource: dict) -> Dict[str, str]:
+        """
+        Parse the resource with "webcontent" type.
+
+        Only the first child of the resource is considered. HTML files are
+        read as is, images from the "web_resources" directory and any files
+        outside of it are rendered through templates, everything else is
+        skipped and replaced with the placeholder content.
+        """
+        resource_file = resource["children"][0]
+        resource_relative_link = resource_file.href
+        resource_file_path = self._cartridge.build_resource_file_path(resource_relative_link)
+
+        if resource_file_path.suffix == HTML_FILENAME_SUFFIX:
+            return self._parse_webcontent_html_file(idref, resource_file_path)
+
+        if WEB_RESOURCES_DIR_NAME not in str(resource_file_path):
+            return self._parse_webcontent_outside_web_resources_dir(resource_relative_link)
+
+        # The file type is detected only for files inside "web_resources".
+        if imghdr.what(str(resource_file_path)):
+            return self._parse_image_webcontent_from_web_resources_dir(resource_file_path)
+
+        logger.info("Skipping webcontent: %s", resource_file_path)
+        return self.DEFAULT_CONTENT
+
+    @staticmethod
+    def _parse_webcontent_html_file(idref: str, resource_file_path: Path) -> Dict[str, str]:
+        """
+        Parse webcontent HTML file.
+
+        Any error raised while reading the file is logged and re-raised.
+        """
+        try:
+            with open(resource_file_path, encoding="utf-8") as resource_file:
+                html = resource_file.read()
+        except Exception:
+            # Log which file/resource failed, then let the error propagate.
+            logger.error("Failure reading %s from id %s", resource_file_path, idref)
+            raise
+        return {"html": html}
+
+    def _parse_image_webcontent_from_web_resources_dir(self, resource_file_path: Path) -> Dict[str, str]:
+        """
+        Parse webcontent image from "web_resources" directory.
+
+        The OLX static path of the image is registered in the cartridge
+        together with the original file path, and the HTML is rendered from
+        the "image_webcontent.html" template.
+        """
+        # Everything after the first "web_resources/" is the static filename.
+        # ``maxsplit=1`` keeps the whole tail even if the marker occurs in the
+        # path again, and ``as_posix`` makes the separator platform independent.
+        static_filename = resource_file_path.as_posix().split(f"{WEB_RESOURCES_DIR_NAME}/", 1)[1]
+        olx_static_path = OLX_STATIC_PATH_TEMPLATE.format(static_filename=static_filename)
+        self._cartridge.olx_to_original_static_file_paths.add_web_resource_path(olx_static_path, resource_file_path)
+        image_webcontent_tpl_path = settings.TEMPLATES_DIR / "image_webcontent.html"
+
+        with open(image_webcontent_tpl_path, encoding="utf-8") as image_webcontent_tpl:
+            tpl_content = image_webcontent_tpl.read()
+
+        html = tpl_content.format(olx_static_path=olx_static_path, static_filename=static_filename)
+        return {"html": html}
+
+    def _parse_webcontent_outside_web_resources_dir(self, resource_relative_path: str) -> Dict[str, str]:
+        """
+        Parse webcontent located outside "web_resources" directory.
+
+        The HTML is rendered from the "external_webcontent.html" template.
+        """
+        # This webcontent is outside ``web_resources`` directory
+        # So we need to manually copy it to OLX_STATIC_DIR
+        olx_static_path = OLX_STATIC_PATH_TEMPLATE.format(static_filename=resource_relative_path)
+        self._cartridge.olx_to_original_static_file_paths.add_extra_path(olx_static_path, resource_relative_path)
+        external_webcontent_tpl_path = settings.TEMPLATES_DIR / "external_webcontent.html"
+
+        with open(external_webcontent_tpl_path, encoding="utf-8") as external_webcontent_tpl:
+            tpl_content = external_webcontent_tpl.read()
+
+        html = tpl_content.format(olx_static_path=olx_static_path, resource_relative_path=resource_relative_path)
+        return {"html": html}
+
+    @staticmethod
+    def _transform_web_link_content_to_html(web_link_content: Dict[str, str]) -> Dict[str, str]:
+        """
+        Generate HTML for weblink.
+
+        The "href" key is required, a missing "text" key results in an empty
+        link text.
+        """
+        # NOTE(review): the values are interpolated without HTML escaping, so
+        # e.g. a quote inside the URL would break the attribute - confirm the
+        # values are trusted before relying on this markup.
+        link_html = LINK_HTML.format(url=web_link_content["href"], text=web_link_content.get("text", ""))
+        return {"html": link_html}
+
+    @staticmethod
+    def is_known_unprocessed_resource_type(resource_type: str) -> bool:
+        """
+        Decides whether the resource type is a known CC type to be unprocessed.
+
+        The LTI link, QTI assessment and discussion topic type patterns are
+        matched against the beginning of the resource type.
+        """
+        known_type_patterns = (
+            CommonCartridgeResourceType.LTI_LINK,
+            CommonCartridgeResourceType.QTI_ASSESSMENT,
+            CommonCartridgeResourceType.DISCUSSION_TOPIC,
+        )
+        return any(re.match(type_pattern, resource_type) for type_pattern in known_type_patterns)
+
+    @staticmethod
+    def _parse_not_imported_content(resource: dict) -> Dict[str, str]:
+        """
+        Parse the resource which content type cannot be processed.
+
+        The resulting text names the resource type and, when present, the
+        resource "href"; the same text is written to the log.
+        """
+        text = f"Not imported content: type = {resource['type']!r}"
+        if "href" in resource:
+            text += f", href = {resource['href']!r}"
+
+        logger.info("%s", text)
+        return {"html": text}