Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Source specifying for relative links is allowed #232

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
5 changes: 5 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ The link map file can be supplied using `-f` or `--link_file`::

cc2olx -r zip -i <IMSCC_FILE> -f <CSV_FILE>

If the original course content contains relative links and the resources
(images, documents etc) the links point to are not included into the exported
course dump, you can specify their source using `-s` flag:

cc2olx -i <IMSCC_FILE> -s <RELATIVE_LINKS_SOURCE>

Test Data
---------
Expand Down
9 changes: 9 additions & 0 deletions src/cc2olx/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from pathlib import Path

from cc2olx.validators.cli import LinkSourceValidator

RESULT_TYPE_FOLDER = "folder"
RESULT_TYPE_ZIP = "zip"

Expand Down Expand Up @@ -69,4 +71,11 @@ def parse_args(args=None):
"should contain consumer_id, consumer_key and consumer_secret."
),
)
parser.add_argument(
"-s",
"--relative_links_source",
nargs="?",
type=LinkSourceValidator(),
help="The relative links source in the format '<scheme>://<netloc>', e.g. 'https://example.com'",
)
return parser.parse_args(args)
2 changes: 2 additions & 0 deletions src/cc2olx/constants.py
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
CDATA_PATTERN = r"<!\[CDATA\[(?P<content>.*?)\]\]>"
OLX_STATIC_DIR = "static"
OLX_STATIC_PATH_TEMPLATE = f"/{OLX_STATIC_DIR}/{{static_filename}}"
18 changes: 18 additions & 0 deletions src/cc2olx/dataclasses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from dataclasses import dataclass, field
from collections import ChainMap
from typing import Dict


@dataclass
class OlxToOriginalStaticFilePaths:
"""
Provide OLX static file to Common cartridge static file mappings.
"""

# Static files from `web_resources` directory
web_resources: Dict[str, str] = field(default_factory=dict)
# Static files that are outside of `web_resources` directory, but still required
extra: Dict[str, str] = field(default_factory=dict)

def __post_init__(self) -> None:
self.all = ChainMap(self.extra, self.web_resources)
27 changes: 21 additions & 6 deletions src/cc2olx/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,25 @@
from cc2olx import filesystem
from cc2olx import olx
from cc2olx.cli import parse_args, RESULT_TYPE_FOLDER, RESULT_TYPE_ZIP
from cc2olx.models import Cartridge, OLX_STATIC_DIR
from cc2olx.constants import OLX_STATIC_DIR
from cc2olx.models import Cartridge
from cc2olx.settings import collect_settings


def convert_one_file(input_file, workspace, link_file=None, passport_file=None):
def convert_one_file(
input_file,
workspace,
link_file=None,
passport_file=None,
relative_links_source=None,
):
filesystem.create_directory(workspace)

cartridge = Cartridge(input_file, workspace)
cartridge.load_manifest_extracted()
cartridge.normalize()

olx_export = olx.OlxExport(cartridge, link_file, passport_file)
olx_export = olx.OlxExport(cartridge, link_file, passport_file, relative_links_source)
olx_filename = cartridge.directory.parent / (cartridge.directory.name + "-course.xml")
policy_filename = cartridge.directory.parent / "policy.json"

Expand All @@ -39,8 +46,8 @@ def convert_one_file(input_file, workspace, link_file=None, passport_file=None):

# Add static files that are outside of web_resources directory
file_list += [
(str(cartridge.directory / filepath), "/static/{}".format(filepath))
for filepath in cartridge.extra_static_files
(str(cartridge.directory / original_filepath), olx_static_path)
for olx_static_path, original_filepath in cartridge.olx_to_original_static_file_paths.extra.items()
]

filesystem.add_in_tar_gz(str(tgz_filename), file_list)
Expand All @@ -53,6 +60,7 @@ def main():
workspace = settings["workspace"]
link_file = settings["link_file"]
passport_file = settings["passport_file"]
relative_links_source = settings["relative_links_source"]

# setup logger
logging_config = settings["logging_config"]
Expand All @@ -64,7 +72,14 @@ def main():

for input_file in settings["input_files"]:
try:
convert_one_file(input_file, temp_workspace, link_file, passport_file)
convert_one_file(
input_file,
temp_workspace,
link_file,
passport_file,
relative_links_source,
)

except Exception:
logger.exception("Error while converting %s file", input_file)

Expand Down
28 changes: 7 additions & 21 deletions src/cc2olx/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import zipfile

from cc2olx import filesystem
from cc2olx.constants import OLX_STATIC_PATH_TEMPLATE
from cc2olx.dataclasses import OlxToOriginalStaticFilePaths
from cc2olx.external.canvas import ModuleMeta
from cc2olx.qti import QtiParser
from cc2olx.utils import clean_file_name
Expand All @@ -24,22 +26,6 @@
DIFFUSE_SHALLOW_SECTIONS = False
DIFFUSE_SHALLOW_SUBSECTIONS = True

OLX_STATIC_DIR = "static"

OLX_DIRECTORIES = [
"about",
"assets",
"chapter",
"course",
"html",
"info",
"policies",
"problem",
"sequential",
OLX_STATIC_DIR,
"vertical",
]


def is_leaf(container):
return "identifierref" in container
Expand Down Expand Up @@ -85,8 +71,7 @@ def __init__(self, cartridge_file, workspace):
self.is_canvas_flavor = False
self.module_meta = {}

# List of static files that are outside of `web_resources` directory, but still required
self.extra_static_files = []
self.olx_to_original_static_file_paths = OlxToOriginalStaticFilePaths()

self.workspace = workspace

Expand Down Expand Up @@ -343,17 +328,18 @@ def get_resource_content(self, identifier):
return "html", {"html": html}
elif "web_resources" in str(res_filename) and imghdr.what(str(res_filename)):
static_filename = str(res_filename).split("web_resources/")[1]
olx_static_path = "/{}/{}".format(OLX_STATIC_DIR, static_filename)
olx_static_path = OLX_STATIC_PATH_TEMPLATE.format(static_filename=static_filename)
self.olx_to_original_static_file_paths.web_resources[olx_static_path] = static_filename
html = (
'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
'</head><body><p><img src="{}" alt="{}"></p></body></html>'.format(olx_static_path, static_filename)
)
return "html", {"html": html}
elif "web_resources" not in str(res_filename):
olx_static_path = OLX_STATIC_PATH_TEMPLATE.format(static_filename=res_relative_path)
# This webcontent is outside of ``web_resources`` directory
# So we need to manually copy it to OLX_STATIC_DIR
self.extra_static_files.append(res_relative_path)
olx_static_path = "/{}/{}".format(OLX_STATIC_DIR, res_relative_path)
self.olx_to_original_static_file_paths.extra[olx_static_path] = res_relative_path
html = (
'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
'</head><body><p><a href="{}" alt="{}">{}<a></p></body></html>'.format(
Expand Down
22 changes: 21 additions & 1 deletion src/cc2olx/olx.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,12 @@ class OlxExport:
QTI = "qti"
DISCUSSION = "discussion"

def __init__(self, cartridge, link_file=None, passport_file=None):
def __init__(self, cartridge, link_file=None, passport_file=None, relative_links_source=None):
self.cartridge = cartridge
self.doc = None
self.link_file = link_file
self.passport_file = passport_file
self.relative_links_source = relative_links_source
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add some validation for the format of relative_links_source.

Copy link
Contributor Author

@myhailo-chernyshov-rg myhailo-chernyshov-rg Dec 5, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Validation is added on the argument parsing step.

self.iframe_link_parser = None
if link_file:
self.iframe_link_parser = KalturaIframeLinkParser(self.link_file)
Expand Down Expand Up @@ -250,6 +251,23 @@ def process_external_tools_link(item, html):
html = html.replace(item, external_tool_url)
return html

def process_relative_external_links(item, html):
"""
Turn static file URLs outside OLX_STATIC_DIR into absolute URLs.

Allow to avoid a situation when the original course page links have
relative URLs, such URLs weren't included into the exported Common
Cartridge course file that causes broken URLs in the imported OeX
course. The function adds the origin source to URLs to make them
absolute ones.
"""
if self.relative_links_source is None or item in self.cartridge.olx_to_original_static_file_paths.all:
return html

url = urllib.parse.urljoin(self.relative_links_source, item)
html = html.replace(item, url)
return html

for _, item in items:
if "IMS-CC-FILEBASE" in item:
html = process_ims_cc_filebase(item, html)
Expand All @@ -259,6 +277,8 @@ def process_external_tools_link(item, html):
html = process_external_tools_link(item, html)
elif "CANVAS_OBJECT_REFERENCE" in item:
html = process_canvas_reference(item, html)
else:
html = process_relative_external_links(item, html)

return html

Expand Down
1 change: 1 addition & 0 deletions src/cc2olx/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,6 @@ def collect_settings(parsed_args):
"workspace": Path.cwd() / parsed_args.output,
"link_file": parsed_args.link_file,
"passport_file": parsed_args.passport_file,
"relative_links_source": parsed_args.relative_links_source,
}
return settings
16 changes: 14 additions & 2 deletions src/cc2olx/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
""" Utility functions for cc2olx"""

import logging
import string
import csv
import ipaddress
import logging
import re
import string

from cc2olx.constants import CDATA_PATTERN

Expand Down Expand Up @@ -123,3 +124,14 @@ def clean_from_cdata(xml_string: str) -> str:
str: cleaned XML string.
"""
return re.sub(CDATA_PATTERN, r"\g<content>", xml_string, flags=re.DOTALL)


def is_valid_ipv6_address(value: str) -> bool:
"""
Return whether the value is a valid IPv6 address.
"""
try:
ipaddress.IPv6Address(value)
except ValueError:
return False
return True
Empty file.
62 changes: 62 additions & 0 deletions src/cc2olx/validators/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import argparse
import re

from cc2olx.utils import is_valid_ipv6_address


class LinkSourceValidator:
"""
Validate a link source.
"""

UL = "\u00a1-\uffff" # Unicode letters range (must not be a raw string).

# IP patterns
IPV4_REGEX = (
r"(?:0|25[0-5]|2[0-4][0-9]|1[0-9]?[0-9]?|[1-9][0-9]?)"
r"(?:\.(?:0|25[0-5]|2[0-4][0-9]|1[0-9]?[0-9]?|[1-9][0-9]?)){3}"
)
IPV6_REGEX = r"\[[0-9a-f:.]+\]" # (simple regex, validated later)

# Host patterns
HOSTNAME_REGEX = rf"[a-z{UL}0-9](?:[a-z{UL}0-9-]{{0,61}}[a-z{UL}0-9])?"
# Max length for domain name labels is 63 characters per RFC 1034 sec. 3.1
DOMAIN_REGEX = rf"(?:\.(?!-)[a-z{UL}0-9-]{{1,63}}(?<!-))*"
TLD_REGEX = (
r"\." # dot
r"(?!-)" # can't start with a dash
rf"(?:[a-z{UL}-]{{2,63}}" # domain label
r"|xn--[a-z0-9]{1,59})" # or punycode label
r"(?<!-)" # can't end with a dash
r"\.?" # may have a trailing dot
)
HOST_REGEX = "(" + HOSTNAME_REGEX + DOMAIN_REGEX + TLD_REGEX + "|localhost)"

LINK_SOURCE_REGEX = (
r"^https?://" # scheme is validated separately
r"(?:[^\s:@/]+(?::[^\s:@/]*)?@)?" # user:pass authentication
rf"(?P<netloc>{IPV4_REGEX}|{IPV6_REGEX}|{HOST_REGEX})"
r"(?::[0-9]{1,5})?" # port
r"/?" # trailing slash
r"\Z"
)

message = "Enter a valid URL."

def __call__(self, value: str) -> str:
if not (link_source_match := re.fullmatch(self.LINK_SOURCE_REGEX, value, re.IGNORECASE)):
raise argparse.ArgumentTypeError(self.message)

self._validate_ipv6_address(link_source_match.group("netloc"))

return value

def _validate_ipv6_address(self, netloc: str) -> None:
"""
Check netloc correctness if it's an IPv6 address.
"""
potential_ipv6_regex = r"^\[(.+)\](?::[0-9]{1,5})?$"
if netloc_match := re.search(potential_ipv6_regex, netloc):
potential_ip = netloc_match[1]
if not is_valid_ipv6_address(potential_ip):
raise argparse.ArgumentTypeError(self.message)
42 changes: 41 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from argparse import Namespace
from pathlib import Path

import pytest

from cc2olx.cli import parse_args

Expand All @@ -16,7 +19,13 @@ def test_parse_args(imscc_file):
)

assert parsed_args == Namespace(
inputs=[imscc_file], loglevel="INFO", result="folder", link_file=None, passport_file=None, output="output"
inputs=[imscc_file],
loglevel="INFO",
result="folder",
link_file=None,
passport_file=None,
output="output",
relative_links_source=None,
)


Expand All @@ -34,6 +43,7 @@ def test_parse_args_csv_file(imscc_file, link_map_csv):
link_file=link_map_csv,
passport_file=None,
output="output",
relative_links_source=None,
)


Expand All @@ -49,4 +59,34 @@ def test_parse_args_passport_file(imscc_file, passports_csv):
link_file=None,
passport_file=passports_csv,
output="output",
relative_links_source=None,
)


def test_parse_args_with_correct_relative_links_source(imscc_file: Path) -> None:
"""
Positive input test for relative links source argument.
"""
relative_links_source = "https://example.com"

parsed_args = parse_args(["-i", str(imscc_file), "-s", relative_links_source])

assert parsed_args == Namespace(
inputs=[imscc_file],
loglevel="INFO",
result="folder",
link_file=None,
passport_file=None,
output="output",
relative_links_source=relative_links_source,
)


def test_parse_args_with_incorrect_relative_links_source(imscc_file: Path) -> None:
"""
Test arguments parser detects incorrect relative links sources.
"""
relative_links_source = "ws://example.com"

with pytest.raises(SystemExit):
parse_args(["-i", str(imscc_file), "-s", relative_links_source])
Loading
Loading