Skip to content

Commit

Permalink
refactor: CC XML parsing moved to a separate module
Browse files (browse the repository at this point in the history)
  • Loading branch information
myhailo-chernyshov-rg committed Jan 10, 2025
1 parent: 1467d38; commit: 0a321da
Show file tree
Hide file tree
Showing 14 changed files with 634 additions and 227 deletions.
3 changes: 0 additions & 3 deletions src/cc2olx/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,6 @@
WEB_RESOURCES_DIR_NAME = "web_resources"

LINK_HTML = "<a href='{url}'>{text}</a>"
WEB_LINK_NAMESPACE = (
"http://www.imsglobal.org/xsd/imsccv{major_version}p{minor_version}/imswl_v{major_version}p{minor_version}"
)
YOUTUBE_LINK_PATTERN = r"youtube.com/watch\?v=(?P<video_id>[-\w]+)"
CDATA_PATTERN = r"<!\[CDATA\[(?P<content>.*?)\]\]>"

Expand Down
20 changes: 7 additions & 13 deletions src/cc2olx/content_parsers/discussion.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,11 @@ class DiscussionContentParser(AbstractContentParser):
Discussion resource content parser.
"""

NAMESPACES = {
"imsdt_xmlv1p1": "http://www.imsglobal.org/xsd/imsccv1p1/imsdt_v1p1",
"imsdt_xmlv1p2": "http://www.imsglobal.org/xsd/imsccv1p2/imsdt_v1p2",
"imsdt_xmlv1p3": "http://www.imsglobal.org/xsd/imsccv1p3/imsdt_v1p3",
}

def _parse_content(self, idref: Optional[str]) -> Optional[Dict[str, str]]:
if idref:
if resource := self._cartridge.define_resource(idref):
if re.match(CommonCartridgeResourceType.DISCUSSION_TOPIC, resource["type"]):
data = self._parse_discussion(resource)
return data
return self._parse_discussion(resource)
return None

def _parse_discussion(self, resource: dict) -> Dict[str, str]:
Expand All @@ -42,9 +35,10 @@ def _parse_resource_file_data(self, resource_file: ResourceFile, resource_type:
"""
Parse the discussion resource file.
"""
tree = filesystem.get_xml_tree(self._cartridge.build_res_file_path(resource_file.href))
tree = filesystem.get_xml_tree(self._cartridge.build_resource_file_path(resource_file.href))
root = tree.getroot()
ns = {"dt": self.NAMESPACES[resource_type]}
title = root.find("dt:title", ns).text
text = root.find("dt:text", ns).text
return {"title": title, "text": text}

return {
"title": root.get_title(resource_type).text,
"text": root.get_text(resource_type).text,
}
42 changes: 21 additions & 21 deletions src/cc2olx/content_parsers/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,41 +43,41 @@ def _parse_webcontent(self, idref: str, resource: dict) -> Dict[str, str]:
"""
Parse the resource with "webcontent" type.
"""
res_file = resource["children"][0]
res_relative_link = res_file.href
res_file_path = self._cartridge.build_res_file_path(res_relative_link)

if res_file_path.suffix == HTML_FILENAME_SUFFIX:
content = self._parse_webcontent_html_file(idref, res_file_path)
elif WEB_RESOURCES_DIR_NAME in str(res_file_path) and imghdr.what(str(res_file_path)):
content = self._parse_image_webcontent_from_web_resources_dir(res_file_path)
elif WEB_RESOURCES_DIR_NAME not in str(res_file_path):
content = self._parse_webcontent_outside_web_resources_dir(res_relative_link)
resource_file = resource["children"][0]
resource_relative_link = resource_file.href
resource_file_path = self._cartridge.build_resource_file_path(resource_relative_link)

if resource_file_path.suffix == HTML_FILENAME_SUFFIX:
content = self._parse_webcontent_html_file(idref, resource_file_path)
elif WEB_RESOURCES_DIR_NAME in str(resource_file_path) and imghdr.what(str(resource_file_path)):
content = self._parse_image_webcontent_from_web_resources_dir(resource_file_path)
elif WEB_RESOURCES_DIR_NAME not in str(resource_file_path):
content = self._parse_webcontent_outside_web_resources_dir(resource_relative_link)
else:
logger.info("Skipping webcontent: %s", res_file_path)
logger.info("Skipping webcontent: %s", resource_file_path)
content = self.DEFAULT_CONTENT

return content

@staticmethod
def _parse_webcontent_html_file(idref: str, res_file_path: Path) -> Dict[str, str]:
def _parse_webcontent_html_file(idref: str, resource_file_path: Path) -> Dict[str, str]:
"""
Parse webcontent HTML file.
"""
try:
with open(res_file_path, encoding="utf-8") as res_file:
html = res_file.read()
with open(resource_file_path, encoding="utf-8") as resource_file:
html = resource_file.read()
except: # noqa: E722
logger.error("Failure reading %s from id %s", res_file_path, idref) # noqa: E722
logger.error("Failure reading %s from id %s", resource_file_path, idref) # noqa: E722
raise
return {"html": html}

@staticmethod
def _parse_image_webcontent_from_web_resources_dir(res_file_path: Path) -> Dict[str, str]:
def _parse_image_webcontent_from_web_resources_dir(resource_file_path: Path) -> Dict[str, str]:
"""
Parse webcontent image from "web_resources" directory.
"""
static_filename = str(res_file_path).split(f"{WEB_RESOURCES_DIR_NAME}/")[1]
static_filename = str(resource_file_path).split(f"{WEB_RESOURCES_DIR_NAME}/")[1]
olx_static_path = OLX_STATIC_PATH_TEMPLATE.format(static_filename=static_filename)
image_webcontent_tpl_path = settings.TEMPLATES_DIR / "image_webcontent.html"

Expand All @@ -87,19 +87,19 @@ def _parse_image_webcontent_from_web_resources_dir(res_file_path: Path) -> Dict[

return {"html": html}

def _parse_webcontent_outside_web_resources_dir(self, res_relative_path: str) -> Dict[str, str]:
def _parse_webcontent_outside_web_resources_dir(self, resource_relative_path: str) -> Dict[str, str]:
"""
Parse webcontent located outside "web_resources" directory.
"""
# This webcontent is outside ``web_resources`` directory
# So we need to manually copy it to OLX_STATIC_DIR
self._cartridge.add_extra_static_file(res_relative_path)
olx_static_path = OLX_STATIC_PATH_TEMPLATE.format(static_filename=res_relative_path)
self._cartridge.add_extra_static_file(resource_relative_path)
olx_static_path = OLX_STATIC_PATH_TEMPLATE.format(static_filename=resource_relative_path)
external_webcontent_tpl_path = settings.TEMPLATES_DIR / "external_webcontent.html"

with open(external_webcontent_tpl_path, encoding="utf-8") as external_webcontent_tpl:
tpl_content = external_webcontent_tpl.read()
html = tpl_content.format(olx_static_path=olx_static_path, res_relative_path=res_relative_path)
html = tpl_content.format(olx_static_path=olx_static_path, resource_relative_path=resource_relative_path)

return {"html": html}

Expand Down
47 changes: 19 additions & 28 deletions src/cc2olx/content_parsers/lti.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,18 @@
import re
from typing import Dict, Optional

from lxml import etree

from cc2olx import filesystem
from cc2olx.content_parsers import AbstractContentParser
from cc2olx.enums import CommonCartridgeResourceType
from cc2olx.utils import simple_slug
from cc2olx.xml import cc_xml


class LtiContentParser(AbstractContentParser):
"""
LTI resource content parser.
"""

NAMESPACES = {
"blti": "http://www.imsglobal.org/xsd/imsbasiclti_v1p0",
"lticp": "http://www.imsglobal.org/xsd/imslticp_v1p0",
"lticm": "http://www.imsglobal.org/xsd/imslticm_v1p0",
}
DEFAULT_WIDTH = "500"
DEFAULT_HEIGHT = "500"

Expand All @@ -38,57 +32,54 @@ def _parse_lti(self, resource: dict) -> dict:
"""
Parse LTI resource.
"""
res_file = resource["children"][0]
res_file_path = self._cartridge.build_res_file_path(res_file.href)
tree = filesystem.get_xml_tree(res_file_path)
resource_file = resource["children"][0]
resource_file_path = self._cartridge.build_resource_file_path(resource_file.href)
tree = filesystem.get_xml_tree(resource_file_path)
root = tree.getroot()
title = root.find("blti:title", self.NAMESPACES).text
description = root.find("blti:description", self.NAMESPACES).text
data = {
title = root.title.text

return {
"title": title,
"description": description,
"description": root.description.text,
"launch_url": self._parse_launch_url(root),
"height": self._parse_height(root),
"width": self._parse_width(root),
"custom_parameters": self._parse_custom_parameters(root),
"lti_id": self._parse_lti_id(root, title),
}
return data

def _parse_launch_url(self, resource_root: etree._Element) -> str:
def _parse_launch_url(self, resource_root: cc_xml.BasicLtiLink) -> str:
"""
Parse URL to launch LTI.
"""
if (launch_url := resource_root.find("blti:secure_launch_url", self.NAMESPACES)) is None:
launch_url = resource_root.find("blti:launch_url", self.NAMESPACES)
if (launch_url := resource_root.secure_launch_url) is None:
launch_url = resource_root.launch_url
return getattr(launch_url, "text", "")

def _parse_width(self, resource_root: etree._Element) -> str:
def _parse_width(self, resource_root: cc_xml.BasicLtiLink) -> str:
"""
Parse width.
"""
width = resource_root.find("blti:extensions/lticm:property[@name='selection_width']", self.NAMESPACES)
return getattr(width, "text", self.DEFAULT_WIDTH)
return getattr(resource_root.width, "text", self.DEFAULT_WIDTH)

def _parse_height(self, resource_root: etree._Element) -> str:
def _parse_height(self, resource_root: cc_xml.BasicLtiLink) -> str:
"""
Parse height.
"""
height = resource_root.find("blti:extensions/lticm:property[@name='selection_height']", self.NAMESPACES)
return getattr(height, "text", self.DEFAULT_HEIGHT)
return getattr(resource_root.height, "text", self.DEFAULT_HEIGHT)

def _parse_custom_parameters(self, resource_root: etree._Element) -> Dict[str, str]:
def _parse_custom_parameters(self, resource_root: cc_xml.BasicLtiLink) -> Dict[str, str]:
"""
Parse custom parameters.
"""
custom = resource_root.find("blti:custom", self.NAMESPACES)
custom = resource_root.custom
return {} if custom is None else {option.get("name"): option.text for option in custom}

def _parse_lti_id(self, resource_root: etree._Element, title: str) -> str:
def _parse_lti_id(self, resource_root: cc_xml.BasicLtiLink, title: str) -> str:
"""
Parse LTI identifier.
For Canvas flavored CC, tool_id is used as lti_id if present.
"""
tool_id = resource_root.find("blti:extensions/lticm:property[@name='tool_id']", self.NAMESPACES)
tool_id = resource_root.canvas_tool_id
return simple_slug(title) if tool_id is None else tool_id.text
29 changes: 9 additions & 20 deletions src/cc2olx/content_parsers/mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from typing import Dict, Optional

from cc2olx import filesystem
from cc2olx.constants import WEB_LINK_NAMESPACE
from cc2olx.enums import CommonCartridgeResourceType
from cc2olx.models import Cartridge

Expand All @@ -18,24 +17,14 @@ def _parse_web_link_content(self, resource: dict) -> Optional[Dict[str, str]]:
"""
Provide Web Link resource data.
"""
if web_link_match := re.match(CommonCartridgeResourceType.WEB_LINK, resource["type"]):
res_file = resource["children"][0]
res_file_path = self._cartridge.build_res_file_path(res_file.href)
tree = filesystem.get_xml_tree(res_file_path)
resource_type = resource["type"]
if re.match(CommonCartridgeResourceType.WEB_LINK, resource_type):
resource_file = resource["children"][0]
resource_file_path = self._cartridge.build_resource_file_path(resource_file.href)
tree = filesystem.get_xml_tree(resource_file_path)
root = tree.getroot()
ns = self._build_web_link_namespace(web_link_match)
title = root.find("wl:title", ns).text
url = root.find("wl:url", ns).get("href")
return {"href": url, "text": title}
return {
"href": root.get_url(resource_type).get("href"),
"text": root.get_title(resource_type).text,
}
return None

@staticmethod
def _build_web_link_namespace(web_link_match: re.Match) -> Dict[str, str]:
"""
Build Web Link namespace.
"""
web_link = WEB_LINK_NAMESPACE.format(
major_version=web_link_match.group("major_version"),
minor_version=web_link_match.group("minor_version"),
)
return {"wl": web_link}
Loading

0 comments on commit 0a321da

Please sign in to comment.