Skip to content

Commit

Permalink
refactor: [FC-0063] Block type processing is refactored
Browse files Browse the repository at this point in the history
- `attrs` dependency is added
- block type processors are implemented
- block type processors are integrated into the script workflow
  • Loading branch information
myhailo-chernyshov-rg committed Jan 14, 2025
1 parent 1bfea42 commit 1650432
Show file tree
Hide file tree
Showing 59 changed files with 2,714 additions and 1,505 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
include LICENSE
include README.rst

recursive-include src/cc2olx/templates *
recursive-include requirements *
recursive-include tests *
recursive-exclude * __pycache__
Expand Down
2 changes: 1 addition & 1 deletion pytest.ini
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[pytest]
usefixtures = chdir_to_workspace
DJANGO_SETTINGS_MODULE = cc2olx.django_settings
DJANGO_SETTINGS_MODULE = cc2olx.settings
1 change: 1 addition & 0 deletions requirements/base.in
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Core requirements for this package

Django
attrs
lxml
requests
youtube-dl
2 changes: 2 additions & 0 deletions requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#
asgiref==3.8.1
# via django
attrs==24.3.0
# via -r requirements/base.in
backports-zoneinfo==0.2.1
# via django
certifi==2024.12.14
Expand Down
4 changes: 4 additions & 0 deletions requirements/ci.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ asgiref==3.8.1
# via
# -r /home/misha/work/cc2olx/requirements/quality.txt
# django
attrs==24.3.0
# via
# -c /home/misha/work/cc2olx/requirements/constraints.txt
# -r /home/misha/work/cc2olx/requirements/quality.txt
backports-zoneinfo==0.2.1
# via
# -r /home/misha/work/cc2olx/requirements/quality.txt
Expand Down
2 changes: 2 additions & 0 deletions requirements/constraints.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@
# link to other information that will help people in the future to remove the
# pin when possible. Writing an issue against the offending project and
# linking to it here is good.

attrs==24.3.0
5 changes: 5 additions & 0 deletions requirements/dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ asgiref==3.8.1
# -r /home/misha/work/cc2olx/requirements/ci.txt
# -r /home/misha/work/cc2olx/requirements/quality.txt
# django
attrs==24.3.0
# via
# -c /home/misha/work/cc2olx/requirements/constraints.txt
# -r /home/misha/work/cc2olx/requirements/ci.txt
# -r /home/misha/work/cc2olx/requirements/quality.txt
backports-tarfile==1.2.0
# via jaraco-context
backports-zoneinfo==0.2.1
Expand Down
4 changes: 4 additions & 0 deletions requirements/quality.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ asgiref==3.8.1
# via
# -r /home/misha/work/cc2olx/requirements/test.txt
# django
attrs==24.3.0
# via
# -c /home/misha/work/cc2olx/requirements/constraints.txt
# -r /home/misha/work/cc2olx/requirements/test.txt
backports-zoneinfo==0.2.1
# via
# -r /home/misha/work/cc2olx/requirements/test.txt
Expand Down
4 changes: 4 additions & 0 deletions requirements/test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ asgiref==3.8.1
# via
# -r /home/misha/work/cc2olx/requirements/base.txt
# django
attrs==24.3.0
# via
# -c /home/misha/work/cc2olx/requirements/constraints.txt
# -r /home/misha/work/cc2olx/requirements/base.txt
backports-zoneinfo==0.2.1
# via
# -r /home/misha/work/cc2olx/requirements/base.txt
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
"Programming Language :: Python :: 3.8",
"Topic :: Utilities",
],
description=("Command line tool, that converts Common Cartridge " "courses to Open edX Studio imports."),
description="Command line tool, that converts Common Cartridge courses to Open edX Studio imports.",
entry_points={"console_scripts": ["cc2olx=cc2olx.main:main"]},
install_requires=load_requirements("requirements/base.in"),
license="GNU Affero General Public License",
Expand Down
8 changes: 7 additions & 1 deletion src/cc2olx/constants.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# Regex matching a single CDATA section; the payload is matched lazily and
# exposed through the ``content`` named group.
CDATA_PATTERN = r"<!\[CDATA\[(?P<content>.*?)\]\]>"
# Name of the directory that OLX static file paths are rooted at.
OLX_STATIC_DIR = "static"
# ``str.format`` template producing ``/static/<static_filename>``; the doubled
# braces keep ``{static_filename}`` as a placeholder after f-string evaluation.
OLX_STATIC_PATH_TEMPLATE = f"/{OLX_STATIC_DIR}/{{static_filename}}"
# Directory name used to recognise web resource files inside a cartridge path.
WEB_RESOURCES_DIR_NAME = "web_resources"

# ``str.format`` template for an HTML anchor; expects ``url`` and ``text``.
LINK_HTML = "<a href='{url}'>{text}</a>"
# Regex extracting the ``video_id`` from a YouTube watch URL. The dot is escaped
# so that only a literal ``youtube.com`` matches (an unescaped ``.`` would accept
# any character in that position, e.g. ``youtubeXcom``).
YOUTUBE_LINK_PATTERN = r"youtube\.com/watch\?v=(?P<video_id>[-\w]+)"

# NOTE(review): presumably the feedback kinds recognised while handling QTI
# response processing -- confirm against the QTI content parser.
QTI_RESPROCESSING_TYPES = ["general_fb", "correct_fb", "general_incorrect_fb"]
15 changes: 15 additions & 0 deletions src/cc2olx/content_parsers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# NOTE: the ``abc`` import must stay first. The sibling parser modules import
# ``AbstractContentParser`` from this package (``from cc2olx.content_parsers
# import AbstractContentParser``), so the name has to be bound here before
# those modules are loaded.
from cc2olx.content_parsers.abc import AbstractContentParser
from cc2olx.content_parsers.discussion import DiscussionContentParser
from cc2olx.content_parsers.html import HtmlContentParser
from cc2olx.content_parsers.lti import LtiContentParser
from cc2olx.content_parsers.qti import QtiContentParser
from cc2olx.content_parsers.video import VideoContentParser

# Names re-exported as the public API of the ``content_parsers`` package.
__all__ = [
"AbstractContentParser",
"DiscussionContentParser",
"HtmlContentParser",
"LtiContentParser",
"QtiContentParser",
"VideoContentParser",
]
31 changes: 31 additions & 0 deletions src/cc2olx/content_parsers/abc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from abc import ABC, abstractmethod
from typing import Optional, Union

from cc2olx.content_parsers.utils import StaticLinkProcessor
from cc2olx.dataclasses import ContentParserContext
from cc2olx.models import Cartridge


class AbstractContentParser(ABC):
    """
    Base class for Common Cartridge content parsers.

    Subclasses provide ``_parse_content``; ``parse`` wraps it and passes any
    truthy result through static link processing.
    """

    def __init__(self, cartridge: Cartridge, context: ContentParserContext) -> None:
        self._cartridge = cartridge
        self._context = context

    def parse(self, idref: Optional[str]) -> Optional[Union[list, dict]]:
        """
        Parse the resource with the specified identifier.

        A falsy parsing result (``None`` or an empty container) is returned
        untouched; otherwise the result of static link processing is returned.
        """
        parsed = self._parse_content(idref)
        if not parsed:
            return parsed

        static_link_processor = StaticLinkProcessor(self._cartridge, self._context.relative_links_source)
        return static_link_processor.process_content_static_links(parsed)

    @abstractmethod
    def _parse_content(self, idref: Optional[str]) -> Optional[Union[list, dict]]:
        """
        Parse content of the resource with the specified identifier.
        """
44 changes: 44 additions & 0 deletions src/cc2olx/content_parsers/discussion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import re
from typing import Dict, Optional

from cc2olx import filesystem
from cc2olx.content_parsers import AbstractContentParser
from cc2olx.enums import CommonCartridgeResourceType
from cc2olx.models import ResourceFile


class DiscussionContentParser(AbstractContentParser):
    """
    Content parser for discussion topic resources.
    """

    def _parse_content(self, idref: Optional[str]) -> Optional[Dict[str, str]]:
        # Nothing to parse without an identifier.
        if not idref:
            return None

        resource = self._cartridge.define_resource(idref)
        if not resource:
            return None

        # Only resources whose type matches the discussion topic pattern are handled here.
        if not re.match(CommonCartridgeResourceType.DISCUSSION_TOPIC, resource["type"]):
            return None

        return self._parse_discussion(resource)

    def _parse_discussion(self, resource: dict) -> Dict[str, str]:
        """
        Collect the discussion data from the resource's file children.

        Children that are not ``ResourceFile`` instances are ignored; data of
        later files overrides the same keys of earlier ones.
        """
        discussion_data = {}
        resource_files = (child for child in resource["children"] if isinstance(child, ResourceFile))

        for resource_file in resource_files:
            file_data = self._parse_resource_file_data(resource_file, resource["type"])
            discussion_data.update(file_data)

        return discussion_data

    def _parse_resource_file_data(self, resource_file: ResourceFile, resource_type: str) -> Dict[str, str]:
        """
        Read the title and the text from a discussion resource file.
        """
        resource_file_path = self._cartridge.build_resource_file_path(resource_file.href)
        root = filesystem.get_xml_tree(resource_file_path).getroot()

        title = root.get_title(resource_type).text
        text = root.get_text(resource_type).text

        return {"title": title, "text": text}
140 changes: 140 additions & 0 deletions src/cc2olx/content_parsers/html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
import imghdr
import logging
import re
from pathlib import Path
from typing import Dict, Optional

from django.conf import settings

from cc2olx.constants import LINK_HTML, OLX_STATIC_PATH_TEMPLATE, WEB_RESOURCES_DIR_NAME
from cc2olx.content_parsers import AbstractContentParser
from cc2olx.content_parsers.mixins import WebLinkParserMixin
from cc2olx.enums import CommonCartridgeResourceType

# No name is passed, so this is the root logger rather than a module-specific one.
logger = logging.getLogger()

# Path suffix that marks a webcontent resource file to be read as HTML text.
HTML_FILENAME_SUFFIX = ".html"


class HtmlContentParser(WebLinkParserMixin, AbstractContentParser):
    """
    HTML resource content parser.

    Every parsing path produces a dictionary with a single ``"html"`` key.
    """

    # Returned when there is no identifier, the resource is missing, or the
    # resource is deliberately left unprocessed by this parser.
    DEFAULT_CONTENT = {"html": "<p>MISSING CONTENT</p>"}

    def _parse_content(self, idref: Optional[str]) -> Dict[str, str]:
        """
        Build the HTML content for the resource with the specified identifier.

        Falls back to ``DEFAULT_CONTENT`` when ``idref`` is empty, the resource
        cannot be found, or its type is a known type this parser does not process.
        """
        if not idref:
            return self.DEFAULT_CONTENT

        resource = self._cartridge.define_resource(idref)
        if resource is None:
            logger.info("Missing resource: %s", idref)
            content = self.DEFAULT_CONTENT
        elif resource["type"] == CommonCartridgeResourceType.WEB_CONTENT:
            content = self._parse_webcontent(idref, resource)
        elif web_link_content := self._parse_web_link_content(resource):
            # ``_parse_web_link_content`` is not defined in this class
            # (presumably provided by ``WebLinkParserMixin``).
            content = self._transform_web_link_content_to_html(web_link_content)
        elif self.is_known_unprocessed_resource_type(resource["type"]):
            content = self.DEFAULT_CONTENT
        else:
            content = self._parse_not_imported_content(resource)
        return content

    def _parse_webcontent(self, idref: str, resource: dict) -> Dict[str, str]:
        """
        Parse the resource with "webcontent" type.

        Only the first child of the resource is considered. The file is handled
        depending on its suffix and on whether its path contains the
        "web_resources" directory name.
        """
        resource_file = resource["children"][0]
        resource_relative_link = resource_file.href
        resource_file_path = self._cartridge.build_resource_file_path(resource_relative_link)

        if resource_file_path.suffix == HTML_FILENAME_SUFFIX:
            content = self._parse_webcontent_html_file(idref, resource_file_path)
        elif WEB_RESOURCES_DIR_NAME in str(resource_file_path) and imghdr.what(str(resource_file_path)):
            content = self._parse_image_webcontent_from_web_resources_dir(resource_file_path)
        elif WEB_RESOURCES_DIR_NAME not in str(resource_file_path):
            content = self._parse_webcontent_outside_web_resources_dir(resource_relative_link)
        else:
            # Non-HTML, non-image file inside "web_resources".
            logger.info("Skipping webcontent: %s", resource_file_path)
            content = self.DEFAULT_CONTENT

        return content

    @staticmethod
    def _parse_webcontent_html_file(idref: str, resource_file_path: Path) -> Dict[str, str]:
        """
        Parse webcontent HTML file.

        The file is read as UTF-8. Any read failure is logged together with the
        resource identifier and then re-raised unchanged.
        """
        try:
            with open(resource_file_path, encoding="utf-8") as resource_file:
                html = resource_file.read()
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that interpreter
            # exits and keyboard interrupts are not reported as read failures.
            logger.error("Failure reading %s from id %s", resource_file_path, idref)
            raise
        return {"html": html}

    def _parse_image_webcontent_from_web_resources_dir(self, resource_file_path: Path) -> Dict[str, str]:
        """
        Parse webcontent image from "web_resources" directory.

        Registers the image as a static file of the cartridge and renders the
        "image_webcontent.html" template for it.
        """
        # Split on the first "web_resources/" only: everything after it is the
        # static filename, even if a nested directory carries the same name.
        static_filename = str(resource_file_path).split(f"{WEB_RESOURCES_DIR_NAME}/", 1)[1]
        olx_static_path = OLX_STATIC_PATH_TEMPLATE.format(static_filename=static_filename)
        self._cartridge.olx_to_original_static_file_paths.add_web_resource_path(olx_static_path, resource_file_path)
        image_webcontent_tpl_path = settings.TEMPLATES_DIR / "image_webcontent.html"

        with open(image_webcontent_tpl_path, encoding="utf-8") as image_webcontent_tpl:
            tpl_content = image_webcontent_tpl.read()
        html = tpl_content.format(olx_static_path=olx_static_path, static_filename=static_filename)

        return {"html": html}

    def _parse_webcontent_outside_web_resources_dir(self, resource_relative_path: str) -> Dict[str, str]:
        """
        Parse webcontent located outside "web_resources" directory.

        Registers the file as an extra static file of the cartridge and renders
        the "external_webcontent.html" template for it.
        """
        # This webcontent is outside ``web_resources`` directory
        # So we need to manually copy it to OLX_STATIC_DIR
        olx_static_path = OLX_STATIC_PATH_TEMPLATE.format(static_filename=resource_relative_path)
        self._cartridge.olx_to_original_static_file_paths.add_extra_path(olx_static_path, resource_relative_path)
        external_webcontent_tpl_path = settings.TEMPLATES_DIR / "external_webcontent.html"

        with open(external_webcontent_tpl_path, encoding="utf-8") as external_webcontent_tpl:
            tpl_content = external_webcontent_tpl.read()
        html = tpl_content.format(olx_static_path=olx_static_path, resource_relative_path=resource_relative_path)

        return {"html": html}

    @staticmethod
    def _transform_web_link_content_to_html(web_link_content: Dict[str, str]) -> Dict[str, str]:
        """
        Generate HTML for weblink.

        ``web_link_content`` must contain "href"; a missing "text" is rendered
        as an empty link text.
        """
        link_html = LINK_HTML.format(url=web_link_content["href"], text=web_link_content.get("text", ""))
        return {"html": link_html}

    @staticmethod
    def is_known_unprocessed_resource_type(resource_type: str) -> bool:
        """
        Decide whether the resource type is a known CC type left unprocessed by this parser.

        The type is matched against the LTI link, QTI assessment and discussion
        topic patterns.
        """
        return any(
            re.match(type_pattern, resource_type)
            for type_pattern in (
                CommonCartridgeResourceType.LTI_LINK,
                CommonCartridgeResourceType.QTI_ASSESSMENT,
                CommonCartridgeResourceType.DISCUSSION_TOPIC,
            )
        )

    @staticmethod
    def _parse_not_imported_content(resource: dict) -> Dict[str, str]:
        """
        Parse the resource whose content type cannot be processed.

        Produces (and logs) a plain-text note with the resource type and, when
        present, its "href".
        """
        resource_type = resource["type"]
        text = f"Not imported content: type = {resource_type!r}"
        if "href" in resource:
            text += f", href = {resource['href']!r}"

        logger.info("%s", text)
        return {"html": text}
Loading

0 comments on commit 1650432

Please sign in to comment.