forked from openedx/cc2olx
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor: [FC-0063] Block type processing is refactored
- `attrs` dependency is added - block type processors are implemented - block type processors are integrated into the script workflow
- Loading branch information
1 parent
1bfea42
commit 1650432
Showing
59 changed files
with
2,714 additions
and
1,505 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
[pytest] | ||
usefixtures = chdir_to_workspace | ||
DJANGO_SETTINGS_MODULE = cc2olx.django_settings | ||
DJANGO_SETTINGS_MODULE = cc2olx.settings |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
# Core requirements for this package | ||
|
||
Django | ||
attrs | ||
lxml | ||
requests | ||
youtube-dl |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,9 @@ | ||
CDATA_PATTERN = r"<!\[CDATA\[(?P<content>.*?)\]\]>" | ||
OLX_STATIC_DIR = "static" | ||
OLX_STATIC_PATH_TEMPLATE = f"/{OLX_STATIC_DIR}/{{static_filename}}" | ||
WEB_RESOURCES_DIR_NAME = "web_resources" | ||
|
||
LINK_HTML = "<a href='{url}'>{text}</a>" | ||
YOUTUBE_LINK_PATTERN = r"youtube.com/watch\?v=(?P<video_id>[-\w]+)" | ||
CDATA_PATTERN = r"<!\[CDATA\[(?P<content>.*?)\]\]>" | ||
|
||
QTI_RESPROCESSING_TYPES = ["general_fb", "correct_fb", "general_incorrect_fb"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
from cc2olx.content_parsers.abc import AbstractContentParser | ||
from cc2olx.content_parsers.discussion import DiscussionContentParser | ||
from cc2olx.content_parsers.html import HtmlContentParser | ||
from cc2olx.content_parsers.lti import LtiContentParser | ||
from cc2olx.content_parsers.qti import QtiContentParser | ||
from cc2olx.content_parsers.video import VideoContentParser | ||
|
||
__all__ = [ | ||
"AbstractContentParser", | ||
"DiscussionContentParser", | ||
"HtmlContentParser", | ||
"LtiContentParser", | ||
"QtiContentParser", | ||
"VideoContentParser", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
from abc import ABC, abstractmethod | ||
from typing import Optional, Union | ||
|
||
from cc2olx.content_parsers.utils import StaticLinkProcessor | ||
from cc2olx.dataclasses import ContentParserContext | ||
from cc2olx.models import Cartridge | ||
|
||
|
||
class AbstractContentParser(ABC):
    """
    Base class for Common Cartridge content parsers.

    Subclasses implement ``_parse_content``; ``parse`` wraps it and runs the
    static link processing over any non-empty result.
    """

    def __init__(self, cartridge: Cartridge, context: ContentParserContext) -> None:
        # Both objects are only stored here; they are consumed later by
        # ``parse`` and by the concrete ``_parse_content`` implementations.
        self._cartridge = cartridge
        self._context = context

    def parse(self, idref: Optional[str]) -> Optional[Union[list, dict]]:
        """
        Parse the resource with the given identifier and process its static links.

        A falsy parsing result (``None`` or an empty container) is returned
        untouched, without creating a link processor.
        """
        parsed = self._parse_content(idref)
        if not parsed:
            return parsed

        processor = StaticLinkProcessor(self._cartridge, self._context.relative_links_source)
        return processor.process_content_static_links(parsed)

    @abstractmethod
    def _parse_content(self, idref: Optional[str]) -> Optional[Union[list, dict]]:
        """
        Parse the raw content of the resource with the given identifier.

        Must be provided by every concrete parser.
        """
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
import re | ||
from typing import Dict, Optional | ||
|
||
from cc2olx import filesystem | ||
from cc2olx.content_parsers import AbstractContentParser | ||
from cc2olx.enums import CommonCartridgeResourceType | ||
from cc2olx.models import ResourceFile | ||
|
||
|
||
class DiscussionContentParser(AbstractContentParser):
    """
    Content parser for discussion topic resources.
    """

    def _parse_content(self, idref: Optional[str]) -> Optional[Dict[str, str]]:
        """
        Parse the discussion resource referenced by ``idref``.

        Returns ``None`` when no identifier is given, when the cartridge yields
        no resource for it, or when the resource type is not a discussion topic.
        """
        if not idref:
            return None

        resource = self._cartridge.define_resource(idref)
        if not resource:
            return None

        is_discussion = re.match(CommonCartridgeResourceType.DISCUSSION_TOPIC, resource["type"])
        if not is_discussion:
            return None

        return self._parse_discussion(resource)

    def _parse_discussion(self, resource: dict) -> Dict[str, str]:
        """
        Collect discussion data from every resource file child.

        Children that are not ``ResourceFile`` instances are ignored; data from
        later files overrides data from earlier ones.
        """
        parsed: Dict[str, str] = {}
        resource_files = (item for item in resource["children"] if isinstance(item, ResourceFile))

        for resource_file in resource_files:
            file_data = self._parse_resource_file_data(resource_file, resource["type"])
            parsed.update(file_data)

        return parsed

    def _parse_resource_file_data(self, resource_file: ResourceFile, resource_type: str) -> Dict[str, str]:
        """
        Extract the title and the text from a discussion resource file.
        """
        file_path = self._cartridge.build_resource_file_path(resource_file.href)
        root = filesystem.get_xml_tree(file_path).getroot()

        title = root.get_title(resource_type).text
        text = root.get_text(resource_type).text
        return {"title": title, "text": text}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
import imghdr | ||
import logging | ||
import re | ||
from pathlib import Path | ||
from typing import Dict, Optional | ||
|
||
from django.conf import settings | ||
|
||
from cc2olx.constants import LINK_HTML, OLX_STATIC_PATH_TEMPLATE, WEB_RESOURCES_DIR_NAME | ||
from cc2olx.content_parsers import AbstractContentParser | ||
from cc2olx.content_parsers.mixins import WebLinkParserMixin | ||
from cc2olx.enums import CommonCartridgeResourceType | ||
|
||
# Use a module-scoped logger instead of the root logger so that records carry
# the originating module name and can be filtered or configured per module.
# Records still propagate to the handlers configured on the root logger.
logger = logging.getLogger(__name__)

# File suffix used to recognize webcontent resources that are HTML documents.
HTML_FILENAME_SUFFIX = ".html"
|
||
|
||
class HtmlContentParser(WebLinkParserMixin, AbstractContentParser):
    """
    HTML resource content parser.

    Every parsing path produces a ``{"html": ...}`` dictionary. Missing
    resources, skipped webcontent and known resource types that this parser
    does not process are rendered as the ``DEFAULT_CONTENT`` placeholder.
    """

    # Placeholder content. It is a shared class-level dict, so the methods
    # below always hand out a copy rather than the object itself.
    DEFAULT_CONTENT = {"html": "<p>MISSING CONTENT</p>"}

    def _parse_content(self, idref: Optional[str]) -> Dict[str, str]:
        """
        Parse the resource with the specified identifier into HTML content.

        The placeholder is returned when ``idref`` is empty, when the cartridge
        has no such resource, or when the resource type is a known one that is
        left unprocessed here.
        """
        if not idref:
            return dict(self.DEFAULT_CONTENT)

        resource = self._cartridge.define_resource(idref)
        if resource is None:
            logger.info("Missing resource: %s", idref)
            return dict(self.DEFAULT_CONTENT)

        if resource["type"] == CommonCartridgeResourceType.WEB_CONTENT:
            return self._parse_webcontent(idref, resource)
        # ``_parse_web_link_content`` is not defined in this class; presumably
        # it comes from ``WebLinkParserMixin`` and yields a falsy value for
        # resources that are not web links.
        if web_link_content := self._parse_web_link_content(resource):
            return self._transform_web_link_content_to_html(web_link_content)
        if self.is_known_unprocessed_resource_type(resource["type"]):
            return dict(self.DEFAULT_CONTENT)
        return self._parse_not_imported_content(resource)

    def _parse_webcontent(self, idref: str, resource: dict) -> Dict[str, str]:
        """
        Parse the resource with "webcontent" type.

        Only the first child file of the resource is considered. A resource
        without any child file is skipped and rendered as the placeholder.
        """
        children = resource["children"]
        if not children:
            # Previously this case crashed with ``IndexError``.
            logger.info("Skipping webcontent without files: %s", idref)
            return dict(self.DEFAULT_CONTENT)

        resource_relative_link = children[0].href
        resource_file_path = self._cartridge.build_resource_file_path(resource_relative_link)
        resource_file_path_str = str(resource_file_path)

        if resource_file_path.suffix == HTML_FILENAME_SUFFIX:
            return self._parse_webcontent_html_file(idref, resource_file_path)
        if WEB_RESOURCES_DIR_NAME not in resource_file_path_str:
            return self._parse_webcontent_outside_web_resources_dir(resource_relative_link)
        # NOTE: ``imghdr`` is deprecated since Python 3.11 and removed in
        # Python 3.13 (PEP 594); this check needs a replacement before the
        # project can run on 3.13+.
        if imghdr.what(resource_file_path_str):
            return self._parse_image_webcontent_from_web_resources_dir(resource_file_path)

        logger.info("Skipping webcontent: %s", resource_file_path)
        return dict(self.DEFAULT_CONTENT)

    @staticmethod
    def _parse_webcontent_html_file(idref: str, resource_file_path: Path) -> Dict[str, str]:
        """
        Parse webcontent HTML file.

        The file is read as UTF-8. Any failure is logged together with the
        resource identifier and then re-raised unchanged.
        """
        try:
            with open(resource_file_path, encoding="utf-8") as resource_file:
                html = resource_file.read()
        except Exception:
            logger.error("Failure reading %s from id %s", resource_file_path, idref)
            raise
        return {"html": html}

    def _parse_image_webcontent_from_web_resources_dir(self, resource_file_path: Path) -> Dict[str, str]:
        """
        Parse webcontent image from "web_resources" directory.

        The image is registered as a web resource static file and rendered
        through the "image_webcontent.html" template.
        """
        # Split only once: the static filename is everything that follows the
        # first "web_resources/" marker, even if the marker occurs again deeper
        # in the path.
        static_filename = str(resource_file_path).split(f"{WEB_RESOURCES_DIR_NAME}/", 1)[1]
        olx_static_path = OLX_STATIC_PATH_TEMPLATE.format(static_filename=static_filename)
        self._cartridge.olx_to_original_static_file_paths.add_web_resource_path(olx_static_path, resource_file_path)
        image_webcontent_tpl_path = settings.TEMPLATES_DIR / "image_webcontent.html"

        with open(image_webcontent_tpl_path, encoding="utf-8") as image_webcontent_tpl:
            tpl_content = image_webcontent_tpl.read()

        html = tpl_content.format(olx_static_path=olx_static_path, static_filename=static_filename)
        return {"html": html}

    def _parse_webcontent_outside_web_resources_dir(self, resource_relative_path: str) -> Dict[str, str]:
        """
        Parse webcontent located outside "web_resources" directory.

        The file is registered as an extra static file and rendered through
        the "external_webcontent.html" template.
        """
        # This webcontent is outside ``web_resources`` directory
        # So we need to manually copy it to OLX_STATIC_DIR
        olx_static_path = OLX_STATIC_PATH_TEMPLATE.format(static_filename=resource_relative_path)
        self._cartridge.olx_to_original_static_file_paths.add_extra_path(olx_static_path, resource_relative_path)
        external_webcontent_tpl_path = settings.TEMPLATES_DIR / "external_webcontent.html"

        with open(external_webcontent_tpl_path, encoding="utf-8") as external_webcontent_tpl:
            tpl_content = external_webcontent_tpl.read()

        html = tpl_content.format(olx_static_path=olx_static_path, resource_relative_path=resource_relative_path)
        return {"html": html}

    @staticmethod
    def _transform_web_link_content_to_html(web_link_content: Dict[str, str]) -> Dict[str, str]:
        """
        Generate HTML for weblink.

        ``web_link_content`` must contain "href"; a missing "text" is rendered
        as an empty link text.
        """
        link_html = LINK_HTML.format(url=web_link_content["href"], text=web_link_content.get("text", ""))
        return {"html": link_html}

    @staticmethod
    def is_known_unprocessed_resource_type(resource_type: str) -> bool:
        """
        Decides whether the resource type is a known CC type to be unprocessed.

        The type is checked against the LTI link, QTI assessment and discussion
        topic patterns.
        """
        unprocessed_type_patterns = (
            CommonCartridgeResourceType.LTI_LINK,
            CommonCartridgeResourceType.QTI_ASSESSMENT,
            CommonCartridgeResourceType.DISCUSSION_TOPIC,
        )
        return any(re.match(type_pattern, resource_type) for type_pattern in unprocessed_type_patterns)

    @staticmethod
    def _parse_not_imported_content(resource: dict) -> Dict[str, str]:
        """
        Parse the resource which content type cannot be processed.

        The resulting HTML is a plain notice with the resource type and, when
        present, its "href"; the same notice is written to the log.
        """
        resource_type = resource["type"]
        text = f"Not imported content: type = {resource_type!r}"
        if "href" in resource:
            text += f", href = {resource['href']!r}"

        logger.info("%s", text)
        return {"html": text}
Oops, something went wrong.