diff --git a/pyproject.toml b/pyproject.toml index 465c7ba..d8ae2d7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,9 @@ requires-python = ">=3.12,<3.13" description = "Make ZIM files from DevDocs.io" readme = "README.md" dependencies = [ - "requests==2.31.0", + "requests==2.32.3", + "pydantic==2.8.2", + "zimscraperlib==3.4.0", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] diff --git a/src/devdocs2zim/client.py b/src/devdocs2zim/client.py new file mode 100644 index 0000000..8ee84a4 --- /dev/null +++ b/src/devdocs2zim/client.py @@ -0,0 +1,197 @@ +import requests +from pydantic import BaseModel, TypeAdapter + +from devdocs2zim.constants import logger + +HTTP_TIMEOUT_SECONDS = 15 + + +class DevdocsMetadataLinks(BaseModel): + """Project links for a specific documentation set.""" + + # Home page for the project. + home: str = "" + # Link to the project's source code. + code: str = "" + + +class DevdocsMetadata(BaseModel): + """Metadata about a documentation set.""" + + # Human readable name for the documentation. + name: str + # Directory name devdocs puts the docs under. Takes the format: + # name[~version] e.g. "python" or "python~3.10". + slug: str + # Links to project resources. + links: DevdocsMetadataLinks | None = None + # Shortened version displayed in devdocs, if any. Second part of the slug. + version: str = "" + # Specific release of the software the documentation is for, if any. + release: str = "" + # License and attribution information, if any. 
+ attribution: str = "" + + @property + def slug_without_version(self): + return self.slug.split("~")[0] + + def placeholders(self) -> dict[str, str]: + """Gets placeholders for filenames.""" + home_link = "" + code_link = "" + if self.links is not None: + home_link = self.links.home + code_link = self.links.code + + full_name = self.name + if self.version: + full_name += f" {self.version}" + + # properties are inspired by what devdocs uses for their frontend: + # https://github.com/freeCodeCamp/devdocs/blob/6caa5eb1b18ab8d34034f319024bd81877035b36/lib/app.rb#L110 + return { + "name": self.name, + "full_name": full_name, + "slug": self.slug, + "version": self.version, + "release": self.release, + "attribution": self.attribution, + "home_link": home_link, + "code_link": code_link, + "slug_without_version": self.slug_without_version, + } + + +class DevdocsIndexEntry(BaseModel): + """A link to a document in the sidebar.""" + + # Display name for the entry. + name: str + + # Path to the entry in the db.json file. This may contain a fragment identifier + # linking to an anchor tag e.g. #fragment that would not exist in the db.json file. + path: str + + # Name of the type (section) the entry is located under. + type: str + + @property + def path_without_fragment(self) -> str: + """Key in db.json for the file's contents.""" + return self.path.split("#")[0] + + +class DevdocsIndexType(BaseModel): + """A section header for documentation.""" + + # Display name for the section. + name: str + + # Number of documents in the section. + count: int + + # Section slug. This appears to be unused. + slug: str + + +class DevdocsIndex(BaseModel): + """Represents entries in the /{doc_slug}/index.json file for each resource.""" + + # List of entries. + entries: list[DevdocsIndexEntry] + + # List of "types" or section headings. 
+ # These are displayed mostly in order, except regular expressions are used to sort: + # https://github.com/freeCodeCamp/devdocs/blob/e28f81d3218bdbad7eac0540c58c11c7fe1d33d3/assets/javascripts/collections/types.js#L3 + types: list[DevdocsIndexType] + + +class DevdocsClient: + """Utility functions to read data from devdocs.""" + + def __init__(self, documents_url: str, frontend_url: str) -> None: + """Initializes DevdocsClient. + + Parameters: + documents_url: Scheme, hostname, and port for the Devdocs documents server + e.g. `https://documents.devdocs.io`. + frontend_url: Scheme, hostname, and port for the Devdocs frontend server + e.g. `https://devdocs.io`. + """ + self.documents_url = documents_url + self.frontend_url = frontend_url + + def _get_text(self, url: str) -> str: + """Perform a GET request and return the response as decoded text.""" + + logger.debug(f"Fetching {url}") + + resp = requests.get( + url=url, + allow_redirects=True, + timeout=HTTP_TIMEOUT_SECONDS, + ) + resp.raise_for_status() + + return resp.text + + def read_frontend_file(self, file_path: str) -> str: + """Read a file from the devdocs frontend server. + + Parameters: + file_path: Path of the file relative to the root. + """ + return self._get_text(f"{self.frontend_url}/{file_path}") + + def read_application_css(self) -> str: + """Read the app's CSS which includes classes for normalizing content.""" + + return self.read_frontend_file("application.css") + + def list_docs(self) -> list[DevdocsMetadata]: + """List the documents devdocs currently has published.""" + + # NOTE: There is also a backend file named docs.json, but it + # is missing attribution information. + file_contents = self.read_frontend_file("docs.json") + + return TypeAdapter(list[DevdocsMetadata]).validate_json(file_contents) + + def read_doc_file(self, doc_slug: str, file_name: str) -> str: + """Read a file from the devdocs documents server. + + Parameters: + doc_slug: The document's slug e.g. language~v123. 
+ file_name: Name of the file under the slug e.g. index.json. + """ + + # As of 2024-07-17 the largest file is scala~2.12_library/db.json at 144M. + # Tested by building the devdocs container image. + # + # This amount should fit in memory fine, but we need to be careful not to + # cache these large values in memory. + return self._get_text(f"{self.documents_url}/{doc_slug}/{file_name}") + + def get_index(self, doc_slug: str) -> DevdocsIndex: + """Fetch the set of headings and entries that make up the navigation sidebar.""" + + file_contents = self.read_doc_file(doc_slug, "index.json") + + return DevdocsIndex.model_validate_json(file_contents) + + def get_meta(self, doc_slug: str) -> DevdocsMetadata: + """Fetch metadata about the given document. + + Prefer using list_docs and filtering if possible because + the metadata returned there is more complete. + """ + file_contents = self.read_doc_file(doc_slug, "meta.json") + + return DevdocsMetadata.model_validate_json(file_contents) + + def get_db(self, doc_slug: str) -> dict[str, str]: + """Fetch the contents of the pages in the index.""" + file_contents = self.read_doc_file(doc_slug, "db.json") + + return TypeAdapter(dict[str, str]).validate_json(file_contents) diff --git a/src/devdocs2zim/constants.py b/src/devdocs2zim/constants.py new file mode 100644 index 0000000..d5f92cd --- /dev/null +++ b/src/devdocs2zim/constants.py @@ -0,0 +1,15 @@ +import logging + +from zimscraperlib.logging import ( # pyright: ignore[reportMissingTypeStubs] + getLogger, # pyright: ignore[reportUnknownVariableType] +) + +from devdocs2zim.__about__ import __version__ + +NAME = "devdocs2zim" +VERSION = __version__ + +DEVDOCS_FRONTEND_URL = "https://devdocs.io" +DEVDOCS_DOCUMENTS_URL = "https://documents.devdocs.io" + +logger = getLogger(NAME, level=logging.DEBUG) diff --git a/tests/test_client.py b/tests/test_client.py new file mode 100644 index 0000000..cee5dc4 --- /dev/null +++ b/tests/test_client.py @@ -0,0 +1,391 @@ +from unittest import 
TestCase +from unittest.mock import ANY, create_autospec, patch + +from pydantic import TypeAdapter +from requests import HTTPError, Response + +from devdocs2zim.client import ( + DevdocsClient, + DevdocsIndex, + DevdocsIndexEntry, + DevdocsIndexType, + DevdocsMetadata, + DevdocsMetadataLinks, +) + +# NOTE: Deserialization tests in this file are performed against the full object +# to ensure additions of new fields will cause all relevant tests to fail. + + +class TestDevdocsMetadataLinks(TestCase): + def test_unmarshal_minimal(self): + links = DevdocsMetadataLinks.model_validate_json(r"{}") + + self.assertEqual( + DevdocsMetadataLinks( + home="", + code="", + ), + links, + ) + + def test_unmarshal_full(self): + links = DevdocsMetadataLinks.model_validate_json( + r'{"code":"https://code.code", "home":"https://home.home"}' + ) + + self.assertEqual( + DevdocsMetadataLinks( + home="https://home.home", + code="https://code.code", + ), + links, + ) + + +class TestDevdocsMetadata(TestCase): + def test_unmarshal_minimal(self): + metadata = DevdocsMetadata.model_validate_json( + r""" + { + "name": "MyLanguage", + "slug": "mylanguage~3.14" + } + """ + ) + + self.assertEqual( + DevdocsMetadata(name="MyLanguage", slug="mylanguage~3.14"), + metadata, + ) + + def test_unmarshal_full(self): + # Example fetched from https://devdocs.io/docs.json on 2024-07-17. + # Attribution line modified for brevity. 
+ metadata = DevdocsMetadata.model_validate_json( + r""" + { + "name": "Kubernetes", + "slug": "kubernetes~1.28", + "type": "kubernetes", + "links": { + "home": "https://kubernetes.io/", + "code": "https://github.com/kubernetes/kubernetes" + }, + "version": "1.28", + "release": "1.28", + "mtime": 1707071525, + "db_size": 951091, + "attribution": "© 2022 The Kubernetes Authors" + } + """ + ) + + self.assertEqual( + DevdocsMetadata( + name="Kubernetes", + slug="kubernetes~1.28", + links=DevdocsMetadataLinks( + home="https://kubernetes.io/", + code="https://github.com/kubernetes/kubernetes", + ), + release="1.28", + version="1.28", + attribution="© 2022 The Kubernetes Authors", + ), + metadata, + ) + + def test_slug_without_version_no_version(self): + metadata = DevdocsMetadata(name="test", slug="test") + + self.assertEqual("test", metadata.slug_without_version) + + def test_slug_without_version_version(self): + metadata = DevdocsMetadata(name="test", slug="test~1.23") + + self.assertEqual("test", metadata.slug_without_version) + + def test_placeholders_minimal(self): + metadata = DevdocsMetadata(name="test", slug="test~1.23") + + placeholders = metadata.placeholders() + + self.assertEqual( + { + "name": "test", + "full_name": "test", + "slug": "test~1.23", + "version": "", + "release": "", + "attribution": "", + "home_link": "", + "code_link": "", + "slug_without_version": "test", + }, + placeholders, + ) + + def test_placeholders_full(self): + metadata = DevdocsMetadata( + name="Kubernetes", + slug="kubernetes~1.28", + links=DevdocsMetadataLinks( + home="https://kubernetes.io/", + code="https://github.com/kubernetes/kubernetes", + ), + release="1.28", + version="1.28.1", + attribution="© 2022 The Kubernetes Authors", + ) + + placeholders = metadata.placeholders() + + self.assertEqual( + { + "name": "Kubernetes", + "full_name": "Kubernetes 1.28.1", + "slug": "kubernetes~1.28", + "version": "1.28.1", + "release": "1.28", + "attribution": "© 2022 The Kubernetes 
Authors", + "home_link": "https://kubernetes.io/", + "code_link": "https://github.com/kubernetes/kubernetes", + "slug_without_version": "kubernetes", + }, + placeholders, + ) + + +class TestDevdocsIndexEntry(TestCase): + def test_unmarshal(self): + entry = DevdocsIndexEntry.model_validate_json( + r""" + { + "name": "Accept-Encoding", + "path": "headers/accept-encoding", + "type": "Headers" + } + """ + ) + + self.assertEqual( + DevdocsIndexEntry( + name="Accept-Encoding", + path="headers/accept-encoding", + type="Headers", + ), + entry, + ) + + def test_path_without_fragment_no_fragment(self): + entry = DevdocsIndexEntry( + name="Test", + path="test", + type="TestCategory", + ) + + self.assertEqual( + "test", + entry.path_without_fragment, + ) + + def test_path_without_fragment_has_fragment(self): + entry = DevdocsIndexEntry( + name="Test", + path="test#some-fragment", + type="TestCategory", + ) + + self.assertEqual( + "test", + entry.path_without_fragment, + ) + + +class TestDevdocsIndexType(TestCase): + def test_unmarshal(self): + index_type = DevdocsIndexType.model_validate_json( + r""" + { + "name": "Headers", + "count": 145, + "slug": "headers" + } + """ + ) + + self.assertEqual( + DevdocsIndexType( + name="Headers", + count=145, + slug="headers", + ), + index_type, + ) + + +class TestDevdocsIndex(TestCase): + def test_unmarshal_minimal(self): + index = DevdocsIndex.model_validate_json(r"""{"entries": [],"types": []}""") + + self.assertEqual( + DevdocsIndex( + entries=[], + types=[], + ), + index, + ) + + def test_unmarshal(self): + index = DevdocsIndex.model_validate_json( + r""" + { + "entries": [{ + "name": "Accept-Encoding", + "path": "headers/accept-encoding", + "type": "Headers" + }], + "types": [{ + "name": "Headers", + "count": 145, + "slug": "headers" + }] + } + """ + ) + + self.assertEqual( + DevdocsIndex( + entries=[ + DevdocsIndexEntry( + name="Accept-Encoding", + path="headers/accept-encoding", + type="Headers", + ), + ], + types=[ + 
DevdocsIndexType( + name="Headers", + count=145, + slug="headers", + ), + ], + ), + index, + ) + + +class TestDevdocsClient(TestCase): + def setUp(self): + self.client = DevdocsClient( + documents_url="https://docs.docs", + frontend_url="https://frontend.frontend", + ) + + self.requests_patcher = patch("devdocs2zim.client.requests", autospec=True) + self.mock_requests = self.requests_patcher.start() + self.mock_response = create_autospec(Response) + self.mock_requests.get.return_value = self.mock_response + + def tearDown(self): + self.requests_patcher.stop() + + def test_read_frontend_file_normal(self): + self.mock_response.text = "file-contents" + + contents = self.client.read_frontend_file("path/to/foo.txt") + + self.assertEqual("file-contents", contents) + self.mock_requests.get.assert_called_with( + url="https://frontend.frontend/path/to/foo.txt", + allow_redirects=True, + timeout=ANY, + ) + + def test_read_frontend_file_errors(self): + self.mock_response.raise_for_status.side_effect = HTTPError("test error") + + self.assertRaises(HTTPError, self.client.read_frontend_file, "path/to/foo.txt") + + def test_read_doc_file_normal(self): + self.mock_response.text = "file-contents" + + contents = self.client.read_doc_file("html", "index.json") + + self.assertEqual("file-contents", contents) + self.mock_requests.get.assert_called_with( + url="https://docs.docs/html/index.json", + allow_redirects=True, + timeout=ANY, + ) + + def test_read_doc_file_errors(self): + self.mock_response.raise_for_status.side_effect = HTTPError("test error") + + self.assertRaises(HTTPError, self.client.read_doc_file, "html", "index.json") + + def test_read_application_css(self): + self.mock_response.text = "some-css" + + contents = self.client.read_application_css() + + self.assertEqual("some-css", contents) + self.mock_requests.get.assert_called_with( + url="https://frontend.frontend/application.css", + allow_redirects=ANY, + timeout=ANY, + ) + + def test_list_docs(self): + want_docs = [ + 
DevdocsMetadata(name="MyLang V1", slug="mylang~1.0"), + DevdocsMetadata(name="MyLang V2", slug="mylang~2.0"), + ] + self.mock_response.text = ( + TypeAdapter(list[DevdocsMetadata]).dump_json(want_docs).decode() + ) + + got_docs = self.client.list_docs() + + self.assertEqual(want_docs, got_docs) + self.mock_requests.get.assert_called_with( + url="https://frontend.frontend/docs.json", allow_redirects=ANY, timeout=ANY + ) + + def test_get_index(self): + want = DevdocsIndex(entries=[], types=[]) + self.mock_response.text = want.model_dump_json() + + got = self.client.get_index("mylang~1.0") + + self.assertEqual(want, got) + self.mock_requests.get.assert_called_with( + url="https://docs.docs/mylang~1.0/index.json", + allow_redirects=ANY, + timeout=ANY, + ) + + def test_get_meta(self): + want = DevdocsMetadata(name="MyLang V1", slug="mylang~1.0") + self.mock_response.text = want.model_dump_json() + + got = self.client.get_meta("mylang~1.0") + + self.assertEqual(want, got) + self.mock_requests.get.assert_called_with( + url="https://docs.docs/mylang~1.0/meta.json", + allow_redirects=ANY, + timeout=ANY, + ) + + def test_get_db(self): + want = {"index": "data"} + self.mock_response.text = TypeAdapter(dict[str, str]).dump_json(want).decode() + + got = self.client.get_db("mylang~1.0") + + self.assertEqual(want, got) + self.mock_requests.get.assert_called_with( + url="https://docs.docs/mylang~1.0/db.json", allow_redirects=ANY, timeout=ANY + )