
Merge pull request #29 from openzim/add_standard_css
Apply proper CSS for proper page display - step 1
benoit74 authored Oct 10, 2024
2 parents 733c35a + 4749161 commit 152e8b7
Showing 10 changed files with 614 additions and 7 deletions.
4 changes: 2 additions & 2 deletions CONTRIBUTING.md
@@ -24,10 +24,10 @@ To achieve this, first build the Docker image based on current code base.
docker build -t local-libretexts2zim .
```

-Scrape a library (here we use the [Geosciences](https://geo.libretexts.org) library, but you could use any other one of interest for your UI developments).
+Scrape a library (here we use the [Geosciences](https://geo.libretexts.org) library, with only page id 28207 and its children, but you could use any other one of interest for your UI developments).

```
-docker run --rm -it -v "$PWD/output":/output local-libretexts2zim libretexts2zim --library-slug geo --library-name Geosciences --file-name-format "tests_en_libretexts-geo" --overwrite
+docker run --rm -it -v "$PWD/output":/output local-libretexts2zim libretexts2zim --library-slug geo --library-name Geosciences --file-name-format "tests_en_libretexts-geo" --root-page-id 28207 --overwrite
```

Extract interesting ZIM content and move it to `public` folder.
1 change: 1 addition & 0 deletions scraper/pyproject.toml
@@ -20,6 +20,7 @@ dependencies = [
"beautifulsoup4==4.12.3",
"types-beautifulsoup4==4.12.0.20240907",
"lxml==5.3.0",
"tinycss2==1.3.0",
]
dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

34 changes: 33 additions & 1 deletion scraper/src/libretexts2zim/client.py
@@ -22,6 +22,8 @@ class LibreTextsParsingError(Exception):
class LibreTextsHome(BaseModel):
welcome_text_paragraphs: list[str]
welcome_image_url: str
screen_css_url: str
print_css_url: str


LibraryPageId = str
@@ -206,6 +208,8 @@ def get_home(self) -> LibreTextsHome:
return LibreTextsHome(
welcome_text_paragraphs=_get_welcome_text_from_home(soup),
welcome_image_url=_get_welcome_image_url_from_home(soup),
screen_css_url=_get_screen_css_url_from_home(soup),
print_css_url=_get_print_css_url_from_home(soup),
)

def get_deki_token(self) -> str:
@@ -308,7 +312,7 @@ def get_page_content(self, page: LibraryPage) -> LibraryPageContent:
if tree["body"][1]["@target"] != "toc":
raise LibreTextsParsingError(
f"Unexpected second body element of /pages/{page.id}/contents, "
f"@target property is '{tree["body"][1]["@target"]}' while only 'toc' "
f"@target property is '{tree['body'][1]['@target']}' while only 'toc' "
"is expected"
)
return LibraryPageContent(html_body=tree["body"][0])
@@ -373,3 +377,31 @@ def _get_deki_token_from_home(soup: BeautifulSoup) -> str:
"Failed to retrieve API token to query website API, missing apiToken."
)
return x_deki_token


def _get_any_css_url_from_home(soup: BeautifulSoup, media: str) -> str:
"""Returns the URL of any media CSS found on home page
This function expects there is only one <style /> with a media attribute per page
and returns the URL of this tag. This is is the case on libretexts.org as of October
2024, might be a bit fragile.
"""
links = soup.find_all("link", {"rel": "stylesheet", "media": media})
if len(links) != 1:
raise LibreTextsParsingError(
f"Failed to find {media} CSS URL in home page, {len(links)} link(s) found"
)
css_url = links[0].get("href", None)
if not css_url:
raise LibreTextsParsingError("screen CSS link has no href")
return css_url


def _get_screen_css_url_from_home(soup: BeautifulSoup) -> str:
"""Returns the URL of screen CSS found on home page"""
return _get_any_css_url_from_home(soup, "screen")


def _get_print_css_url_from_home(soup: BeautifulSoup) -> str:
"""Returns the URL of print CSS found on home page"""
return _get_any_css_url_from_home(soup, "print")
137 changes: 137 additions & 0 deletions scraper/src/libretexts2zim/css.py
@@ -0,0 +1,137 @@
from collections.abc import Iterable
from pathlib import Path
from urllib.parse import urljoin, urlparse

from tinycss2 import ast, parse_stylesheet_bytes, serialize # pyright: ignore
from tinycss2.serializer import serialize_url # pyright: ignore

from libretexts2zim.utils import get_asset_path_from_url

OriginalUrl = str
FullZimPath = Path
RelativeCssPath = Path


class CssProcessor:
"""Utility to to process CSS, extract assets and rewrite URLs
This utility can process multiple CSS documents that will be stored in a ZIM
It extracts the list of assets (images, fonts) that are used in the CSS documents
and compute appropriate ZIM paths for each of them.
Arguments:
css_target_path: "folder" where the CSS documents that will be processed will be
stored in the ZIM
css_assets_root_path: "folder" where the CSS assets referenced in the CSS
documents will be stored in the ZIM
"""

def __init__(
self,
css_target_path: Path = Path("/content"),
css_assets_root_path: Path = Path("/content/css_assets"),
) -> None:
self.css_target_path = css_target_path
self.css_assets_root_path = css_assets_root_path
self.css_assets: dict[OriginalUrl, FullZimPath] = {}
self.used_paths: list[RelativeCssPath] = []

def process(self, css_original_url: str, css_content: bytes) -> str:
"""Rewrite CSS rules and update list of assets to fetch
This function updates the CSS rules to target assets path inside the ZIM
It also updates the list of `css_assets` which is the list of online resources
referenced inside the ZIM and which should be fetched and stored inside the ZIM
for proper CSS operation.
"""
rules, _ = parse_stylesheet_bytes( # pyright: ignore[reportUnknownVariableType]
css_content
)
self._process_list(
css_original_url,
rules, # pyright: ignore[reportUnknownArgumentType]
)
return serialize(rules)

def _process_url(
self, css_original_url: str, css_url: str
) -> RelativeCssPath | None:
"""Process a URL which has been found in CSS rules
- Transforms the URL into a ZIM path
- Updates the list of assets to retrieve
"""
original_url = urljoin(css_original_url, css_url)
original_url_parsed = urlparse(original_url)
if original_url_parsed.scheme.lower() not in ["http", "https"]:
return None
if original_url in self.css_assets:
return self.css_assets[original_url].relative_to(self.css_target_path)
relative_path = get_asset_path_from_url(original_url, self.used_paths)
self.used_paths.append(relative_path)
target_path = self.css_assets_root_path / relative_path
self.css_assets[original_url] = target_path
return target_path.relative_to(self.css_target_path)

def _process_node(self, css_original_url: str, node: ast.Node):
"""Process one single CSS node"""
if isinstance(
node,
ast.QualifiedRule
| ast.SquareBracketsBlock
| ast.ParenthesesBlock
| ast.CurlyBracketsBlock,
):
self._process_list(
css_original_url,
node.content, # pyright: ignore[reportUnknownArgumentType, reportUnknownMemberType]
)
elif isinstance(node, ast.FunctionBlock):
if node.lower_name == "url": # pyright: ignore[reportUnknownMemberType]
url_node: ast.Node = node.arguments[0] # pyright: ignore
relative_css_path = self._process_url(
css_original_url,
url_node.value, # pyright: ignore
)
if not relative_css_path:
return
url_node.value = str(relative_css_path) # pyright: ignore
url_node.representation = ( # pyright: ignore
f'"{serialize_url(str(relative_css_path))}"'
)

else:
self._process_list(
css_original_url,
node.arguments, # pyright: ignore
)
elif isinstance(node, ast.AtRule):
self._process_list(
css_original_url,
node.prelude, # pyright: ignore
)
self._process_list(
css_original_url,
node.content, # pyright: ignore
)
elif isinstance(node, ast.Declaration):
self._process_list(
css_original_url,
node.value, # pyright: ignore
)
elif isinstance(node, ast.URLToken):
relative_css_path = self._process_url(
css_original_url,
node.value, # pyright: ignore
)
if not relative_css_path:
return
node.value = str(relative_css_path)
node.representation = f"url({serialize_url(str(relative_css_path))})"

def _process_list(self, css_original_url: str, nodes: Iterable[ast.Node] | None):
"""Process a list of CSS nodes"""
if not nodes:
return
for node in nodes:
self._process_node(css_original_url, node)
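
To make the rewriting behaviour of this new class concrete, here is a small hypothetical sketch (not part of the diff); the CSS snippet and URLs are invented, and in the scraper the CSS bytes come from `stream_file()` as shown in `processor.py` below.

```python
# Hypothetical sketch: css_bytes and the URLs are invented.
from libretexts2zim.css import CssProcessor

css_bytes = b'body { background: url("https://a.mtstatic.com/deki/bg.png"); }'

processor = CssProcessor()
rewritten = processor.process(
    css_original_url="https://a.mtstatic.com/@cache/layout/anonymous.css",
    css_content=css_bytes,
)

# The url() is rewritten to a path relative to css_target_path (/content by
# default), e.g. css_assets/deki/bg.png ...
print(rewritten)
# ... and the online URL -> ZIM path mapping is recorded so the caller can
# later fetch and store the asset, e.g.
# {"https://a.mtstatic.com/deki/bg.png": Path("/content/css_assets/deki/bg.png")}
print(processor.css_assets)
```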
36 changes: 36 additions & 0 deletions scraper/src/libretexts2zim/processor.py
@@ -5,6 +5,7 @@
from pathlib import Path

from pydantic import BaseModel
from requests.exceptions import HTTPError
from zimscraperlib.download import (
stream_file, # pyright: ignore[reportUnknownVariableType]
)
@@ -21,6 +22,7 @@
LibreTextsMetadata,
)
from libretexts2zim.constants import LANGUAGE_ISO_639_3, NAME, ROOT_DIR, VERSION, logger
from libretexts2zim.css import CssProcessor
from libretexts2zim.ui import (
ConfigModel,
PageContentModel,
@@ -261,11 +263,45 @@ def run(self) -> Path:

logger.info(" Fetching and storing home page...")
home = self.libretexts_client.get_home()

welcome_image = BytesIO()
stream_file(home.welcome_image_url, byte_stream=welcome_image)
add_item_for(creator, "content/logo.png", content=welcome_image.getvalue())
del welcome_image

css_processor = CssProcessor()
screen_css = BytesIO()
stream_file(home.screen_css_url, byte_stream=screen_css)
result = css_processor.process(
css_original_url=home.screen_css_url, css_content=screen_css.getvalue()
)
add_item_for(creator, "content/screen.css", content=result)
del screen_css

print_css = BytesIO()
stream_file(home.print_css_url, byte_stream=print_css)
result = css_processor.process(
css_original_url=home.print_css_url, css_content=print_css.getvalue()
)
add_item_for(creator, "content/print.css", content=result)
del print_css

logger.info(f" Retrieving {len(css_processor.css_assets)} CSS assets...")
for asset_url, asset_path in css_processor.css_assets.items():
try:
css_asset = BytesIO()
stream_file(asset_url, byte_stream=css_asset)
add_item_for(
creator, str(asset_path)[1:], content=css_asset.getvalue()
)
logger.debug(f"Adding {asset_url} to {asset_path} in the ZIM")
del css_asset
except HTTPError as exc:
# this would make more sense as a warning, but that is just too
# verbose; at least on geo.libretexts.org, many assets are simply
# missing
logger.debug(f"Ignoring {asset_path} due to {exc}")

logger.info(f"Adding Vue.JS UI files in {self.zimui_dist}")
for file in self.zimui_dist.rglob("*"):
if file.is_dir():
30 changes: 30 additions & 0 deletions scraper/src/libretexts2zim/utils.py
@@ -0,0 +1,30 @@
from pathlib import Path
from urllib.parse import urlparse


def get_asset_path_from_url(online_url: str, already_used_paths: list[Path]) -> Path:
"""Computes the path where one should store its asset based on its online URL
This function try to:
- preserve as much the online path as possible
- simplify filename (e.g. dropping querystring) to simply ZimPath
"""
original_path = Path(urlparse(online_url).path)
target_parent = Path(
*[
parent.name
for parent in reversed(original_path.parents)
if parent.name and parent.name != ".."
]
)

index = 0
while True:
relative_path = (
target_parent / f"{original_path.stem}{'_' + str(index) if index else ''}"
f"{original_path.suffix}"
)
if relative_path not in already_used_paths:
break
index += 1
return relative_path
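
A short hypothetical sketch (not part of the diff) of how this helper maps URLs to ZIM paths, including the de-duplication of colliding names; the URLs below are invented.

```python
# Hypothetical sketch: querystrings are dropped, online folders are kept, and
# a numeric suffix is appended when the same relative path is already used.
from pathlib import Path

from libretexts2zim.utils import get_asset_path_from_url

used: list[Path] = []

first = get_asset_path_from_url("https://a.mtstatic.com/fonts/icons.woff2?_=abc123", used)
used.append(first)
print(first)   # fonts/icons.woff2

second = get_asset_path_from_url("https://b.mtstatic.com/fonts/icons.woff2?_=def456", used)
used.append(second)
print(second)  # fonts/icons_1.woff2
```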
16 changes: 16 additions & 0 deletions scraper/tests-integration/test_client.py
@@ -134,3 +134,19 @@ def test_get_home_welcome_text_paragraphs(
def test_get_home_page_content(client: LibreTextsClient, page_tree: LibraryTree):
"""Ensures we can get content of root page"""
assert client.get_page_content(page_tree.root).html_body


def test_get_home_screen_css_url(home: LibreTextsHome):
"""Ensures proper screen CSS url is retrieved"""
assert (
home.screen_css_url
== "https://a.mtstatic.com/@cache/layout/anonymous.css?_=715eca8811db7abb8e6f0555936e020d_Z2VvLmxpYnJldGV4dHMub3Jn:site_4038"
)


def test_get_home_print_css_url(home: LibreTextsHome):
"""Ensures proper print CSS url is retrieved"""
assert (
home.print_css_url
== "https://a.mtstatic.com/@cache/layout/print.css?_=99d83fb44eaebe60981933ec554d138d:site_4038"
)