diff --git a/scraper/src/libretexts2zim/client.py b/scraper/src/libretexts2zim/client.py index a448e94..e444a02 100644 --- a/scraper/src/libretexts2zim/client.py +++ b/scraper/src/libretexts2zim/client.py @@ -57,6 +57,21 @@ class DekiTree(BaseModel): root: DekiPage pages: dict[str, DekiPage] = {} + def sub_tree(self, subroot_id: str) -> "DekiTree": + """Returns a sub-tree, starting at the given page id""" + new_root = self.pages[subroot_id] + tree = DekiTree(root=new_root) + tree.pages[new_root.id] = new_root + children_to_explore = [*new_root.children] + while len(children_to_explore) > 0: + child = children_to_explore[0] + children_to_explore.remove(child) + if child.id in tree.pages: + continue # safe-guard + tree.pages[child.id] = child + children_to_explore.extend(child.children) + return tree + class LibreTextsMetadata(BaseModel): """Metadata about a course.""" diff --git a/scraper/src/libretexts2zim/generator.py b/scraper/src/libretexts2zim/generator.py index b2d4000..18a1d4f 100644 --- a/scraper/src/libretexts2zim/generator.py +++ b/scraper/src/libretexts2zim/generator.py @@ -45,6 +45,8 @@ class ContentFilter(BaseModel): page_id_include: str | None # If specified, page with title matching the regex are excluded. page_title_exclude: str | None + # If specified, only this page and its subpages will be included. + root_page_id: str | None @staticmethod def add_flags(parser: argparse.ArgumentParser): @@ -72,6 +74,12 @@ def add_flags(parser: argparse.ArgumentParser): metavar="REGEX", ) + parser.add_argument( + "--root-page-id", + help="ID of the root page to include in ZIM. 
Only this page and its" + " subpages will be included in the ZIM", + ) + @staticmethod def of(namespace: argparse.Namespace) -> "ContentFilter": """Parses a namespace to create a new DocFilter.""" @@ -80,6 +88,9 @@ def of(namespace: argparse.Namespace) -> "ContentFilter": def filter(self, page_tree: DekiTree) -> list[DekiPage]: """Filters pages based on the user's choices.""" + if self.root_page_id: + page_tree = page_tree.sub_tree(self.root_page_id) + title_include_re = ( re.compile(self.page_title_include, re.IGNORECASE) if self.page_title_include diff --git a/scraper/tests-integration/test_client.py b/scraper/tests-integration/test_client.py index b556c94..b4f45f9 100644 --- a/scraper/tests-integration/test_client.py +++ b/scraper/tests-integration/test_client.py @@ -7,7 +7,7 @@ ) from zimscraperlib.image.probing import format_for -from libretexts2zim.client import LibreTextsClient, LibreTextsHome +from libretexts2zim.client import DekiTree, LibreTextsClient, LibreTextsHome @pytest.fixture(scope="module") @@ -40,6 +40,14 @@ def nb_root_children() -> int: return 6 +@pytest.fixture(scope="module") +def page_tree( + client: LibreTextsClient, + deki_token: str, # noqa: ARG001 +) -> DekiTree: + return client.get_page_tree() + + def test_get_deki_token(deki_token: str): """Ensures we achieve to get a deki_token""" assert deki_token @@ -62,15 +70,18 @@ def test_get_root_page_id( assert client.get_root_page_id() == root_page_id -def test_get_page_tree( - client: LibreTextsClient, +def test_get_page_tree_pages( + page_tree: DekiTree, minimum_number_of_pages: int, - deki_token: str, # noqa: ARG001 +): + assert len(page_tree.pages.keys()) > minimum_number_of_pages + + +def test_get_page_tree_root( + page_tree: DekiTree, root_page_id: str, nb_root_children: int, ): - page_tree = client.get_page_tree() - assert len(page_tree.pages.keys()) > minimum_number_of_pages assert page_tree.root.id == root_page_id assert len(page_tree.root.children) == nb_root_children assert 
page_tree.root.title @@ -78,6 +89,22 @@ def test_get_page_tree( assert child.title +def test_get_page_tree_subtree( + page_tree: DekiTree, +): + + # 28207 = https://geo.libretexts.org/Courses/Coastline_College/An_Introduction_To_Geology_-_Coastline_College/01%3A_Understanding_Science + subtree1 = page_tree.sub_tree("28207") + # 4 = "1. Understanding Science" + "1.1: What is Science?" + # + "1.2: The Scientific Method" + "1.3: The Study of Geology" + assert len(subtree1.pages.keys()) == 4 + + # 28196 = https://geo.libretexts.org/Courses/Coastline_College/An_Introduction_To_Geology_-_Coastline_College + subtree2 = page_tree.sub_tree("28196") + # 94 is the number retrieved in Oct. 2024, might change + assert len(subtree2.pages.keys()) == 94 + + def test_get_home_image_url(home: LibreTextsHome): """Ensures proper image url is retrieved""" assert home.welcome_image_url == "https://cdn.libretexts.net/Logos/geo_full.png" diff --git a/scraper/tests/test_generator.py b/scraper/tests/test_generator.py index f8cf4a7..e1f12cd 100644 --- a/scraper/tests/test_generator.py +++ b/scraper/tests/test_generator.py @@ -59,6 +59,7 @@ def deki_tree() -> DekiTree: page_title_include=r"^1\..*", page_title_exclude=None, page_id_include=None, + root_page_id=None, ), ["24", "25", "26", "27", "28"], id="include_1", @@ -68,6 +69,7 @@ def deki_tree() -> DekiTree: page_title_include=r"^2\..*", page_title_exclude=None, page_id_include=None, + root_page_id=None, ), ["24", "29", "30", "31", "32"], id="include_2", @@ -77,6 +79,7 @@ def deki_tree() -> DekiTree: page_title_include=None, page_title_exclude=None, page_id_include="26,27,28", + root_page_id=None, ), ["24", "25", "26", "27", "28"], id="include_3", @@ -86,6 +89,7 @@ def deki_tree() -> DekiTree: page_title_include="ground", page_title_exclude=None, page_id_include=None, + root_page_id=None, ), ["24", "29", "30", "33", "34"], id="include_4", @@ -95,6 +99,7 @@ def deki_tree() -> DekiTree: page_title_include=r"^1\..*", page_title_exclude="Tree", 
page_id_include=None, + root_page_id=None, ), ["24", "25", "26", "28"], id="include_exclude_1", @@ -104,6 +109,7 @@ def deki_tree() -> DekiTree: page_title_include=None, page_title_exclude="Tree", page_id_include="26,27,28", + root_page_id=None, ), ["24", "25", "26", "28"], id="include_exclude_2", @@ -113,6 +119,7 @@ def deki_tree() -> DekiTree: page_title_include="ground", page_title_exclude="^2", page_id_include=None, + root_page_id=None, ), ["24", "33", "34"], id="include_exclude_3", @@ -122,6 +129,7 @@ def deki_tree() -> DekiTree: page_title_include=r"^1\..*", page_title_exclude="tree", page_id_include=None, + root_page_id=None, ), ["24", "25", "26", "28"], id="include_exclude_case_insensitive", @@ -131,6 +139,7 @@ def deki_tree() -> DekiTree: page_title_include="tree", page_title_exclude=None, page_id_include=None, + root_page_id=None, ), ["24", "25", "27"], id="include_case_insensitive", @@ -140,10 +149,31 @@ def deki_tree() -> DekiTree: page_title_include="^tree", page_title_exclude=None, page_id_include=None, + root_page_id=None, ), [], id="include_no_match", ), + pytest.param( + ContentFilter( + page_title_include=None, + page_title_exclude=None, + page_id_include=None, + root_page_id="25", + ), + ["25", "26", "27", "28"], + id="root_page_id", + ), + pytest.param( + ContentFilter( + page_title_include=r"^1\.1.*", + page_title_exclude=None, + page_id_include=None, + root_page_id="25", + ), + ["25", "26"], + id="root_page_id_and_include", + ), ], ) def test_content_filter(