From bd279b38382a4acf521ce3fe7c48156c2ce3ff07 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 30 Sep 2024 13:50:44 +0000 Subject: [PATCH 1/3] Retrieve list of page IDs and root of the tree from API, and introduce caching --- Dockerfile | 2 +- scraper/src/libretexts2zim/client.py | 141 +++++++++++++++++++++-- scraper/src/libretexts2zim/entrypoint.py | 28 +++++ scraper/src/libretexts2zim/processor.py | 17 ++- scraper/tests-integration/README.md | 6 + scraper/tests-integration/conftest.py | 11 ++ scraper/tests-integration/test_client.py | 42 ++++++- 7 files changed, 231 insertions(+), 16 deletions(-) create mode 100644 scraper/tests-integration/README.md diff --git a/Dockerfile b/Dockerfile index ba36662..df7a428 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,6 +44,6 @@ RUN pip install --no-cache-dir /src/scraper \ # Copy zimui build output COPY --from=zimui /src/dist /src/zimui -ENV LIBRETEXTS_ZIMUI_DIST=/src/zimui +ENV LIBRETEXTS_ZIMUI_DIST=/src/zimui LIBRETEXTS_OUTPUT=/output LIBRETEXTS_TMP=/tmp CMD ["libretexts2zim", "--help"] diff --git a/scraper/src/libretexts2zim/client.py b/scraper/src/libretexts2zim/client.py index c6ea083..c80cf0c 100644 --- a/scraper/src/libretexts2zim/client.py +++ b/scraper/src/libretexts2zim/client.py @@ -1,6 +1,9 @@ import datetime +import json import re from collections.abc import Callable +from pathlib import Path +from typing import Any import requests from bs4 import BeautifulSoup, NavigableString @@ -50,7 +53,7 @@ def placeholders( class LibreTextsClient: """Utility functions to read data from libretexts.""" - def __init__(self, library_slug: str) -> None: + def __init__(self, library_slug: str, cache_folder: Path) -> None: """Initializes LibreTextsClient. Paremters: @@ -58,40 +61,144 @@ def __init__(self, library_slug: str) -> None: e.g. `https://geo.libretexts.org/`. """ self.library_slug = library_slug + self.deki_token = None + self.cache_folder = cache_folder @property def library_url(self) -> str: - return f"https://{self.library_slug}.libretexts.org/" + return f"https://{self.library_slug}.libretexts.org" - def _get_text(self, url: str) -> str: + @property + def api_url(self) -> str: + return f"{self.library_url}/@api/deki" + + def _get_cache_file(self, url_subpath_and_query: str) -> Path: + """Get location where HTTP result should be cached""" + if url_subpath_and_query.startswith("/"): + url_subpath_and_query = url_subpath_and_query[1:] + if url_subpath_and_query.endswith("/"): + url_subpath_and_query += "index" + return self.cache_folder / url_subpath_and_query + + def _get_text(self, url_subpath_and_query: str) -> str: """Perform a GET request and return the response as decoded text.""" - logger.debug(f"Fetching {url}") + cache_file = self._get_cache_file(f"text{url_subpath_and_query}") + if cache_file.exists(): + return cache_file.read_text() + cache_file.parent.mkdir(parents=True, exist_ok=True) + + full_url = f"{self.library_url}{url_subpath_and_query}" + logger.debug(f"Fetching {full_url}") resp = requests.get( - url=url, + url=full_url, allow_redirects=True, timeout=HTTP_TIMEOUT_SECONDS, ) resp.raise_for_status() + cache_file.write_text(resp.text) return resp.text + def _get_api_resp( + self, api_sub_path_and_query: str, timeout: float + ) -> requests.Response: + api_url = f"{self.api_url}{api_sub_path_and_query}" + logger.debug(f"Calling API at {api_url}") + resp = requests.get( + url=api_url, + headers={"x-deki-token": self.deki_token}, + timeout=timeout, + ) + resp.raise_for_status() + return resp + + def _get_api_json( + self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_SECONDS + ) -> Any: + cache_file = self._get_cache_file(f"api_json{api_sub_path}") + if cache_file.exists(): + return json.loads(cache_file.read_text()) + cache_file.parent.mkdir(parents=True, exist_ok=True) + resp = self._get_api_resp( + f"{api_sub_path}?dream.out.format=json", timeout=timeout + ) + result = resp.json() + cache_file.write_text(json.dumps(result)) + return result + + def _get_api_content( + self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_SECONDS + ) -> bytes | Any: + cache_file = self._get_cache_file(f"api_content{api_sub_path}") + if cache_file.exists(): + return json.loads(cache_file.read_text()) + cache_file.parent.mkdir(parents=True, exist_ok=True) + resp = self._get_api_resp(api_sub_path, timeout=timeout) + result = resp.content + cache_file.write_bytes(result) + return result + def get_home(self) -> LibreTextsHome: - home_content = self._get_text(self.library_url) + """Retrieves data about home page by crawling home page""" + home_content = self._get_text("/") soup = _get_soup(home_content) + self.deki_token = _get_deki_token_from_home(soup) return LibreTextsHome( welcome_text_paragraphs=_get_welcome_text_from_home(soup), welcome_image_url=_get_welcome_image_url_from_home(soup), ) + def get_deki_token(self) -> str: + """Retrieves the API token to use to query the website API""" + if self.deki_token: + return self.deki_token + + home_content = self._get_text("/") + + soup = _get_soup(home_content) + self.deki_token = _get_deki_token_from_home(soup) + return self.deki_token + + def get_all_pages_ids(self): + """Returns the IDs of all pages on current website, exploring the whole tree""" + + tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_SECONDS * 2) + + page_ids: list[str] = [] + + def _get_page_ids(page_node: Any) -> None: + page_ids.append(page_node["@id"]) + if not page_node["subpages"]: + return + if "@id" in page_node["subpages"]["page"]: + _get_page_ids(page_node["subpages"]["page"]) + else: + for page in page_node["subpages"]["page"]: + _get_page_ids(page) + + _get_page_ids(tree["page"]) + + return page_ids + + def get_root_page_id(self) -> str: + """Returns the ID the root of the tree of pages""" + + tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_SECONDS * 2) + return tree["page"]["@id"] + def _get_soup(content: str) -> BeautifulSoup: - return BeautifulSoup(content, "lxml") + """Return a BeautifulSoup soup from textual content + This is a utility function to ensure same parser is used in the whole codebase + """ + return BeautifulSoup(content, "lxml") def _get_welcome_image_url_from_home(soup: BeautifulSoup) -> str: + """Return the URL of the image found on home header""" branding_div = soup.find("div", class_="LTBranding") if not branding_div: raise LibreTextsParsingError("
with class 'LTBranding' not found") @@ -111,6 +218,7 @@ def _get_welcome_image_url_from_home(soup: BeautifulSoup) -> str: def _get_welcome_text_from_home(soup: BeautifulSoup) -> list[str]: + """Returns the text found on home page""" content_section = soup.find("section", class_="mt-content-container") if not content_section or isinstance(content_section, NavigableString): raise LibreTextsParsingError( @@ -121,3 +229,22 @@ def _get_welcome_text_from_home(soup: BeautifulSoup) -> list[str]: if paragraph_text := paragraph.text: welcome_text.append(paragraph_text) return welcome_text + + +def _get_deki_token_from_home(soup: BeautifulSoup) -> str: + global_settings = soup.find("script", id="mt-global-settings") + if not global_settings: + logger.debug("home content:") + logger.debug(soup) + raise Exception( + "Failed to retrieve API token to query website API, missing " + "mt-global-settings script" + ) + x_deki_token = json.loads(global_settings.text).get("apiToken", None) + if not x_deki_token: + logger.debug("mt-global-settings script content:") + logger.debug(global_settings.text) + raise Exception( + "Failed to retrieve API token to query website API, missing apiToken." + ) + return x_deki_token diff --git a/scraper/src/libretexts2zim/entrypoint.py b/scraper/src/libretexts2zim/entrypoint.py index 7edcf76..2417e1f 100644 --- a/scraper/src/libretexts2zim/entrypoint.py +++ b/scraper/src/libretexts2zim/entrypoint.py @@ -8,6 +8,7 @@ MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH, RECOMMENDED_MAX_TITLE_LENGTH, ) +from zimscraperlib.zim.filesystem import validate_zimfile_creatable from libretexts2zim.client import LibreTextsClient from libretexts2zim.constants import ( @@ -177,6 +178,13 @@ def main() -> None: dest="output_folder", ) + parser.add_argument( + "--tmp", + help="Temporary folder for cache, intermediate files, ... Default: tmp", + default=os.getenv("LIBRETEXTS_TMP", "tmp"), + dest="tmp_folder", + ) + parser.add_argument( "--debug", help="Enable verbose output", action="store_true", default=False ) @@ -191,15 +199,35 @@ def main() -> None: default=os.getenv("LIBRETEXTS_ZIMUI_DIST", "../zimui/dist"), ) + parser.add_argument( + "--keep-cache", + help="Keep cache of website responses", + action="store_true", + default=False, + ) + args = parser.parse_args() logger.setLevel(level=logging.DEBUG if args.debug else logging.INFO) + output_folder = Path(args.output_folder) + output_folder.mkdir(exist_ok=True) + validate_zimfile_creatable(output_folder, "test.txt") + + tmp_folder = Path(args.tmp_folder) + tmp_folder.mkdir(exist_ok=True) + validate_zimfile_creatable(tmp_folder, "test.txt") + try: zim_config = ZimConfig.of(args) doc_filter = ContentFilter.of(args) + + cache_folder = tmp_folder / "cache" + cache_folder.mkdir() + libretexts_client = LibreTextsClient( library_slug=args.library_slug, + cache_folder=cache_folder, ) Processor( diff --git a/scraper/src/libretexts2zim/processor.py b/scraper/src/libretexts2zim/processor.py index 90bacb6..146259f 100644 --- a/scraper/src/libretexts2zim/processor.py +++ b/scraper/src/libretexts2zim/processor.py @@ -9,6 +9,7 @@ ) from zimscraperlib.image import resize_image from zimscraperlib.zim import Creator +from zimscraperlib.zim.filesystem import validate_zimfile_creatable from zimscraperlib.zim.indexing import IndexData from libretexts2zim.client import LibreTextsClient, LibreTextsMetadata @@ -117,8 +118,6 @@ def __init__( self.zimui_dist = zimui_dist self.overwrite_existing_zim = overwrite_existing_zim - self.output_folder.mkdir(exist_ok=True) - self.zim_illustration_path = self.libretexts_newsite_path( "header_logo_mini.png" ) @@ -145,11 +144,17 @@ def run(self) -> Path: name=self.zim_config.library_name, slug=self.libretexts_client.library_slug ) formatted_config = self.zim_config.format(metadata.placeholders()) - zim_path = Path(self.output_folder, f"{formatted_config.file_name_format}.zim") + zim_file_name = f"{formatted_config.file_name_format}.zim" + zim_path = self.output_folder / zim_file_name + + if zim_path.exists(): + if self.overwrite_existing_zim: + zim_path.unlink() + else: + logger.error(f" {zim_path} already exists, aborting.") + raise SystemExit(2) - if zim_path.exists() and not self.overwrite_existing_zim: - logger.error(f" {zim_path} already exists, aborting.") - raise SystemExit(2) + validate_zimfile_creatable(self.output_folder, zim_file_name) logger.info(f" Writing to: {zim_path}") diff --git a/scraper/tests-integration/README.md b/scraper/tests-integration/README.md new file mode 100644 index 0000000..639565c --- /dev/null +++ b/scraper/tests-integration/README.md @@ -0,0 +1,6 @@ +This folder contains integration tests, testing how the scraper behaves: + +- with a real libretexts website +- from end-to-end + +They are targetted at being ran from scraper Docker image from Github workflow(s). diff --git a/scraper/tests-integration/conftest.py b/scraper/tests-integration/conftest.py index 98d237a..250300d 100644 --- a/scraper/tests-integration/conftest.py +++ b/scraper/tests-integration/conftest.py @@ -1,3 +1,8 @@ +import tempfile +from collections.abc import Generator +from pathlib import Path +from typing import Any + import pytest @@ -6,6 +11,12 @@ def libretexts_slug() -> str: return "geo" +@pytest.fixture(scope="module") +def cache_folder() -> Generator[Path, Any, Any]: + with tempfile.TemporaryDirectory() as tmpdir: + yield Path(tmpdir) + + @pytest.fixture(scope="module") def libretexts_url(libretexts_slug: str) -> str: return f"https://{libretexts_slug}.libretexts.org" diff --git a/scraper/tests-integration/test_client.py b/scraper/tests-integration/test_client.py index 2e4889c..5cf9ebf 100644 --- a/scraper/tests-integration/test_client.py +++ b/scraper/tests-integration/test_client.py @@ -1,4 +1,5 @@ import io +from pathlib import Path import pytest from zimscraperlib.download import ( @@ -10,8 +11,8 @@ @pytest.fixture(scope="module") -def client(libretexts_slug: str) -> LibreTextsClient: - return LibreTextsClient(library_slug=libretexts_slug) +def client(libretexts_slug: str, cache_folder: Path) -> LibreTextsClient: + return LibreTextsClient(library_slug=libretexts_slug, cache_folder=cache_folder) @pytest.fixture(scope="module") @@ -19,6 +20,43 @@ def home(client: LibreTextsClient) -> LibreTextsHome: return client.get_home() +@pytest.fixture(scope="module") +def deki_token(client: LibreTextsClient) -> str: + return client.get_deki_token() + + +@pytest.fixture(scope="module") +def minimum_number_of_pages() -> int: + return 8000 + + +@pytest.fixture(scope="module") +def root_page_id() -> str: + return "34" + + +def test_get_deki_token(deki_token: str): + """Ensures we achieve to get a deki_token""" + assert deki_token + + +def test_get_all_pages_ids( + client: LibreTextsClient, + minimum_number_of_pages: int, + deki_token: str, # noqa: ARG001 +): + pages_ids = client.get_all_pages_ids() + assert len(pages_ids) > minimum_number_of_pages + + +def test_get_root_page_id( + client: LibreTextsClient, + root_page_id: str, + deki_token: str, # noqa: ARG001 +): + assert client.get_root_page_id() == root_page_id + + def test_get_home_image_url(home: LibreTextsHome): """Ensures proper image url is retrieved""" assert home.welcome_image_url == "https://cdn.libretexts.net/Logos/geo_full.png" From c3bb8c52eb915a4cc56571260663b39c00a0c9c0 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 3 Oct 2024 13:36:23 +0000 Subject: [PATCH 2/3] fixup! Retrieve list of page IDs and root of the tree from API, and introduce caching --- Dockerfile | 4 +++- scraper/pyproject.toml | 1 - scraper/src/libretexts2zim/__main__.py | 13 ++++++++----- scraper/src/libretexts2zim/client.py | 19 ++++++++++--------- scraper/src/libretexts2zim/entrypoint.py | 8 ++------ 5 files changed, 23 insertions(+), 22 deletions(-) diff --git a/Dockerfile b/Dockerfile index df7a428..0713679 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,6 +44,8 @@ RUN pip install --no-cache-dir /src/scraper \ # Copy zimui build output COPY --from=zimui /src/dist /src/zimui -ENV LIBRETEXTS_ZIMUI_DIST=/src/zimui LIBRETEXTS_OUTPUT=/output LIBRETEXTS_TMP=/tmp +ENV LIBRETEXTS_ZIMUI_DIST=/src/zimui \ + LIBRETEXTS_OUTPUT=/output \ + LIBRETEXTS_TMP=/tmp CMD ["libretexts2zim", "--help"] diff --git a/scraper/pyproject.toml b/scraper/pyproject.toml index b557333..fdbd0de 100644 --- a/scraper/pyproject.toml +++ b/scraper/pyproject.toml @@ -47,7 +47,6 @@ dev = [ [project.scripts] libretexts2zim = "libretexts2zim.__main__:main" -libretexts2zim-playlists = "libretexts2zim.playlists.__main__:main" [tool.hatch.version] path = "src/libretexts2zim/__about__.py" diff --git a/scraper/src/libretexts2zim/__main__.py b/scraper/src/libretexts2zim/__main__.py index 9ee85b7..56d7e47 100644 --- a/scraper/src/libretexts2zim/__main__.py +++ b/scraper/src/libretexts2zim/__main__.py @@ -1,9 +1,12 @@ -#!/usr/bin/env python3 -# vim: ai ts=4 sts=4 et sw=4 nu +import tempfile -import sys +from libretexts2zim.entrypoint import main as entrypoint + + +def main(): + with tempfile.TemporaryDirectory() as tmpdir: + entrypoint(tmpdir) -from libretexts2zim.entrypoint import main if __name__ == "__main__": - sys.exit(main()) + main() diff --git a/scraper/src/libretexts2zim/client.py b/scraper/src/libretexts2zim/client.py index c80cf0c..794755d 100644 --- a/scraper/src/libretexts2zim/client.py +++ b/scraper/src/libretexts2zim/client.py @@ -11,7 +11,8 @@ from libretexts2zim.constants import logger -HTTP_TIMEOUT_SECONDS = 15 +HTTP_TIMEOUT_NORMAL_SECONDS = 15 +HTTP_TIMEOUT_LONG_SECONDS = 30 class LibreTextsParsingError(Exception): @@ -74,8 +75,7 @@ def api_url(self) -> str: def _get_cache_file(self, url_subpath_and_query: str) -> Path: """Get location where HTTP result should be cached""" - if url_subpath_and_query.startswith("/"): - url_subpath_and_query = url_subpath_and_query[1:] + url_subpath_and_query = re.sub(r"^/", "", url_subpath_and_query) if url_subpath_and_query.endswith("/"): url_subpath_and_query += "index" return self.cache_folder / url_subpath_and_query @@ -94,7 +94,7 @@ def _get_text(self, url_subpath_and_query: str) -> str: resp = requests.get( url=full_url, allow_redirects=True, - timeout=HTTP_TIMEOUT_SECONDS, + timeout=HTTP_TIMEOUT_NORMAL_SECONDS, ) resp.raise_for_status() @@ -115,7 +115,7 @@ def _get_api_resp( return resp def _get_api_json( - self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_SECONDS + self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_NORMAL_SECONDS ) -> Any: cache_file = self._get_cache_file(f"api_json{api_sub_path}") if cache_file.exists(): @@ -129,11 +129,11 @@ def _get_api_json( return result def _get_api_content( - self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_SECONDS + self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_NORMAL_SECONDS ) -> bytes | Any: cache_file = self._get_cache_file(f"api_content{api_sub_path}") if cache_file.exists(): - return json.loads(cache_file.read_text()) + return cache_file.read_bytes() cache_file.parent.mkdir(parents=True, exist_ok=True) resp = self._get_api_resp(api_sub_path, timeout=timeout) result = resp.content @@ -165,7 +165,7 @@ def get_deki_token(self) -> str: def get_all_pages_ids(self): """Returns the IDs of all pages on current website, exploring the whole tree""" - tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_SECONDS * 2) + tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS) page_ids: list[str] = [] @@ -186,7 +186,7 @@ def _get_page_ids(page_node: Any) -> None: def get_root_page_id(self) -> str: """Returns the ID the root of the tree of pages""" - tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_SECONDS * 2) + tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS) return tree["page"]["@id"] @@ -197,6 +197,7 @@ def _get_soup(content: str) -> BeautifulSoup: """ return BeautifulSoup(content, "lxml") + def _get_welcome_image_url_from_home(soup: BeautifulSoup) -> str: """Return the URL of the image found on home header""" branding_div = soup.find("div", class_="LTBranding") diff --git a/scraper/src/libretexts2zim/entrypoint.py b/scraper/src/libretexts2zim/entrypoint.py index 2417e1f..1d8ef52 100644 --- a/scraper/src/libretexts2zim/entrypoint.py +++ b/scraper/src/libretexts2zim/entrypoint.py @@ -138,7 +138,7 @@ def add_content_filter_flags(parser: argparse.ArgumentParser): ) -def main() -> None: +def main(tmpdir: str) -> None: parser = argparse.ArgumentParser( prog=NAME, ) @@ -181,7 +181,7 @@ def main() -> None: parser.add_argument( "--tmp", help="Temporary folder for cache, intermediate files, ... Default: tmp", - default=os.getenv("LIBRETEXTS_TMP", "tmp"), + default=os.getenv("LIBRETEXTS_TMP", tmpdir), dest="tmp_folder", ) @@ -245,7 +245,3 @@ def main() -> None: logger.exception(exc) logger.error(f"Generation failed with the following error: {exc}") raise SystemExit(1) from exc - - -if __name__ == "__main__": - main() From 40cb3425184f2b7e4233aaea6871d63fbb759309 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 3 Oct 2024 14:19:13 +0000 Subject: [PATCH 3/3] Make Codecov only informational, most code is covered only in integration tests, not considered by Codecov --- codecov.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 codecov.yml diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000..e3ae2cc --- /dev/null +++ b/codecov.yml @@ -0,0 +1,11 @@ +coverage: + status: + project: + default: + informational: true + patch: + default: + informational: true + changes: + default: + informational: true