From 4749161272e7ff4987606cd9322846f2d6e25980 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 8 Oct 2024 09:14:26 +0000 Subject: [PATCH] Apply proper CSS for proper page display - step 1 This first step takes care of CSS stylesheets which are in an external file (two, in fact: one for screen and one for print). It does not consider inline CSS, which is also needed and will be handled in step 2. --- CONTRIBUTING.md | 4 +- scraper/pyproject.toml | 1 + scraper/src/libretexts2zim/client.py | 34 ++- scraper/src/libretexts2zim/css.py | 137 +++++++++++ scraper/src/libretexts2zim/processor.py | 36 +++ scraper/src/libretexts2zim/utils.py | 30 +++ scraper/tests-integration/test_client.py | 16 ++ scraper/tests/test_css.py | 294 +++++++++++++++++++++++ scraper/tests/test_utils.py | 59 +++++ zimui/index.html | 10 +- 10 files changed, 614 insertions(+), 7 deletions(-) create mode 100644 scraper/src/libretexts2zim/css.py create mode 100644 scraper/src/libretexts2zim/utils.py create mode 100644 scraper/tests/test_css.py create mode 100644 scraper/tests/test_utils.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4205369..d4cefca 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -24,10 +24,10 @@ To achieve this, first build the Docker image based on current code base. docker build -t local-libretexts2zim . ``` -Scrape a library (here we use the [Geosciences](https://geo.libretexts.org) library, but you could use any other one of interest for your UI developments). +Scrape a library (here we use the [Geosciences](https://geo.libretexts.org) library, with only page id 28207 and its children but you could use any other one of interest for your UI developments). 
``` -docker run --rm -it -v "$PWD/output":/output local-libretexts2zim libretexts2zim --library-slug geo --library-name Geosciences --file-name-format "tests_en_libretexts-geo" --overwrite +docker run --rm -it -v "$PWD/output":/output local-libretexts2zim libretexts2zim --library-slug geo --library-name Geosciences --file-name-format "tests_en_libretexts-geo" --root-page-id 28207 --overwrite ``` Extract interesting ZIM content and move it to `public` folder. diff --git a/scraper/pyproject.toml b/scraper/pyproject.toml index fdbd0de..97b2ec8 100644 --- a/scraper/pyproject.toml +++ b/scraper/pyproject.toml @@ -20,6 +20,7 @@ dependencies = [ "beautifulsoup4==4.12.3", "types-beautifulsoup4==4.12.0.20240907", "lxml==5.3.0", + "tinycss2==1.3.0", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] diff --git a/scraper/src/libretexts2zim/client.py b/scraper/src/libretexts2zim/client.py index a08952d..7d74402 100644 --- a/scraper/src/libretexts2zim/client.py +++ b/scraper/src/libretexts2zim/client.py @@ -22,6 +22,8 @@ class LibreTextsParsingError(Exception): class LibreTextsHome(BaseModel): welcome_text_paragraphs: list[str] welcome_image_url: str + screen_css_url: str + print_css_url: str LibraryPageId = str @@ -206,6 +208,8 @@ def get_home(self) -> LibreTextsHome: return LibreTextsHome( welcome_text_paragraphs=_get_welcome_text_from_home(soup), welcome_image_url=_get_welcome_image_url_from_home(soup), + screen_css_url=_get_screen_css_url_from_home(soup), + print_css_url=_get_print_css_url_from_home(soup), ) def get_deki_token(self) -> str: @@ -308,7 +312,7 @@ def get_page_content(self, page: LibraryPage) -> LibraryPageContent: if tree["body"][1]["@target"] != "toc": raise LibreTextsParsingError( f"Unexpected second body element of /pages/{page.id}/contents, " - f"@target property is '{tree["body"][1]["@target"]}' while only 'toc' " + f"@target property is '{tree['body'][1]['@target']}' while only 'toc' " "is expected" ) return 
LibraryPageContent(html_body=tree["body"][0]) @@ -373,3 +377,31 @@ def _get_deki_token_from_home(soup: BeautifulSoup) -> str: "Failed to retrieve API token to query website API, missing apiToken." ) return x_deki_token + + +def _get_any_css_url_from_home(soup: BeautifulSoup, media: str) -> str: + """Returns the URL of any media CSS found on home page + + This function expects there is only one