diff --git a/.github/workflows/Publish.yaml b/.github/workflows/Publish.yaml index 9565f2bc..21ec12cb 100644 --- a/.github/workflows/Publish.yaml +++ b/.github/workflows/Publish.yaml @@ -24,17 +24,6 @@ jobs: pip install -U pip pip install -e .[scripts] - - name: Generate fuzzy rules - run: python rules/generate_rules.py - - - name: Build Javascript wombatSetup.js - uses: addnab/docker-run-action@v3 - with: - image: node:20-bookworm - options: -v ${{ github.workspace }}/src/warc2zim/statics:/output -v ${{ github.workspace }}/rules:/src/rules -v ${{ github.workspace }}/javascript:/src/javascript -v ${{ github.workspace }}/build_js.sh:/src/build_js.sh - run: | - /src/build_js.sh - - name: Build packages run: | pip install -U pip build diff --git a/.github/workflows/PublishDockerDevImage.yaml b/.github/workflows/PublishDockerDevImage.yaml index bbf4180d..59500d8b 100644 --- a/.github/workflows/PublishDockerDevImage.yaml +++ b/.github/workflows/PublishDockerDevImage.yaml @@ -12,14 +12,6 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Build Javascript wombatSetup.js - uses: addnab/docker-run-action@v3 - with: - image: node:20-bookworm - options: -v ${{ github.workspace }}/src/warc2zim/statics:/output -v ${{ github.workspace }}/rules:/src/rules -v ${{ github.workspace }}/javascript:/src/javascript -v ${{ github.workspace }}/build_js.sh:/src/build_js.sh - run: | - /src/build_js.sh - - name: Build and push Docker image uses: openzim/docker-publish-action@v10 with: diff --git a/.github/workflows/QA.yaml b/.github/workflows/QA.yaml index 95c24a72..9cd2fea3 100644 --- a/.github/workflows/QA.yaml +++ b/.github/workflows/QA.yaml @@ -24,9 +24,6 @@ jobs: pip install -U pip pip install -e .[lint,scripts,test,check] - - name: Generate fuzzy rules - run: python rules/generate_rules.py - - name: Check black formatting run: inv lint-black @@ -35,20 +32,3 @@ jobs: - name: Check pyright run: inv check-pyright - - - name: Set up Node.JS - uses: actions/setup-node@v4 - with: - 
node-version: 20 - - - name: Install JS dependencies - working-directory: javascript - run: yarn install - - - name: Check prettier formatting - working-directory: javascript - run: yarn prettier-check - - - name: Check eslint rules - working-directory: javascript - run: yarn eslint diff --git a/.github/workflows/Tests.yaml b/.github/workflows/Tests.yaml index a8a780e6..d0a1a4de 100644 --- a/.github/workflows/Tests.yaml +++ b/.github/workflows/Tests.yaml @@ -24,9 +24,6 @@ jobs: pip install -U pip pip install -e .[test,scripts] - - name: Generate fuzzy rules - run: python rules/generate_rules.py - - name: Run the tests run: inv coverage --args "-vvv" @@ -35,19 +32,6 @@ jobs: with: token: ${{ secrets.CODECOV_TOKEN }} - - name: Set up Node.JS - uses: actions/setup-node@v4 - with: - node-version: 20 - - - name: Install JS dependencies - working-directory: javascript - run: yarn install - - - name: Run JS tests - working-directory: javascript - run: yarn test - build_python: runs-on: ubuntu-22.04 steps: @@ -59,14 +43,6 @@ jobs: python-version-file: pyproject.toml architecture: x64 - - name: Install dependencies (and project) - run: | - pip install -U pip build - pip install -e .[scripts] - - - name: Generate fuzzy rules - run: python rules/generate_rules.py - - name: Ensure we can build Python targets run: | pip install -U pip build diff --git a/.gitignore b/.gitignore index 7421867b..7da2e3fd 100644 --- a/.gitignore +++ b/.gitignore @@ -495,18 +495,6 @@ pyrightconfig.json # ignore all vscode, this is not standard configuration in this place .vscode -# installed at build time -src/warc2zim/statics/wombat.js - # temporary directories used during development output tmp - -# rule files are generated by rules/generate_rules.py -src/warc2zim/rules.py -tests/test_fuzzy_rules.py -javascript/src/fuzzyRules.js -javascript/test/fuzzyRules.js - -# wombatSetup.js is generated with rollup -src/warc2zim/statics/wombatSetup.js diff --git a/CHANGELOG.md b/CHANGELOG.md index 
ca369372..335fb374 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- Upgrade dependencies: wombat 3.8.6 (#334), zimscraperlib 5.0.0, warcio 1.7.5, cdxj_index 1.4.6 and others +- Upgrade dependencies: zimscraperlib 5.0.0, warcio 1.7.5, cdxj_index 1.4.6 and others +- Use all rewriting logic from zimscraperlib - Remove most HTML / CSS / JS rewriting logic which is now part of zimscraperlib 5 - Fix wombat setup settings (especially `isSW`) (#293) diff --git a/Dockerfile b/Dockerfile index 48e7e893..89069148 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ FROM python:3.12-slim-bookworm -LABEL org.opencontainers.image.source https://github.com/openzim/warc2zim +LABEL org.opencontainers.image.source=https://github.com/openzim/warc2zim RUN apt-get update -y \ && apt-get install -y --no-install-recommends \ @@ -12,15 +12,13 @@ RUN apt-get update -y \ WORKDIR /output # Copy pyproject.toml and its dependencies -COPY pyproject.toml openzim.toml README.md /src/ -COPY rules/generate_rules.py /src/rules/generate_rules.py +COPY pyproject.toml README.md /src/ COPY src/warc2zim/__about__.py /src/src/warc2zim/__about__.py # Install Python dependencies RUN pip install --no-cache-dir /src # Copy code + associated artifacts -COPY rules /src/rules COPY src /src/src COPY *.md /src/ diff --git a/README.md b/README.md index 25aefbec..23ac52d2 100644 --- a/README.md +++ b/README.md @@ -168,26 +168,13 @@ Start a hatch shell: this will install software including dependencies in an iso hatch shell ``` -### Regenerate wombatSetup.js +### Rewriting logic and rewriting rules -wombatSetup.js is the JS code used to setup wombat when the ZIM is used. +Almost all rewriting logic and rewriting rules now come from [python-scraperlib](https://github.com/openzim/python-scraperlib/). -It is normally retrieved by Python build process (see openzim.toml for details).
+Should you need to add more rules or modify rewriting logic, this is the place to go. -Recommended solution to develop this JS code is to install Node.JS on your system, and then - -```bash -cd javascript -yarn build-dev # or yarn build-prod -``` - -Should you want to regenerate this code without install Node.JS, you might simply run following command. - -```bash -docker run -v $PWD/src/warc2zim/statics:/output -v $PWD/rules:/src/rules -v $PWD/javascript:/src/javascript -v $PWD/build_js.sh:/src/build_js.sh -it --rm --entrypoint /src/build_js.sh node:20-bookworm -``` - -It will install Python3 on-top of Node.JS in a Docker container, generate JS fuzzy rules and bundle JS code straight to `/src/warc2zim/statics/wombatSetup.js` where the file is expected to be placed. +All resulting code (Python and Javascript) as well as wombat.js and wombat-setup.js comes from the python-scraperlib. ## License diff --git a/build_js.sh b/build_js.sh deleted file mode 100755 index 16d1ec1e..00000000 --- a/build_js.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash - -# Custom script to install Python on top of a Docker Node-JS image, then install -# required Python deps, generate fuzzy rules, and finally bundle JS script - -apt-get update -y - -apt-get install -y --no-install-recommends \ - python3 python3-pip python3-venv - -rm -rf /var/lib/apt/lists/* - -python3 -m venv /local - -/local/bin/python -m pip install --no-cache-dir -U \ - pip \ - jinja2==3.1.4 \ - PyYAML==6.0.2 - -/local/bin/python /src/rules/generate_rules.py - -cd /src/javascript - -yarn install - -OUTPUT_DIR=/output yarn build-prod diff --git a/docs/functional_architecture.md b/docs/functional_architecture.md deleted file mode 100644 index dfb75b50..00000000 --- a/docs/functional_architecture.md +++ /dev/null @@ -1,84 +0,0 @@ -# Functional architecture - -## Foreword - -At a high level, warc2zim is a piece of software capable to transform a set of WARC files into one ZIM file. 
From a functional point of view, it is hence a "format converter". - -While warc2zim is typically used as a sub-component of zimit, where WARC files are produced by Browsertrix crawler, it is in fact agnostic of this fact and could process any WARC file adhering to the standard. - -This documentation will describe the big functions achieved by warc2zim codebase. It is important to note that these functions are not seggregated inside the codebase with frontiers. - -## ZIM storage - -While storing the web resources in the ZIM is mostly straightforward (we just transfer the raw bytes, after some modification for URL rewriting if needed), the decision of the path where the resource will be stored is very important. - -This is purely conventional, even if ZIM specification has to be respected for proper operation in readers. - -This function is responsible to compute the ZIM path where a given web resource is going to be stored. - -While the URL is the only driver of this computation for now, warc2zim might have to consider other contextual data in the future. E.g. the resource to serve might by dynamic, depending not only on URL query parameters but also header(s) value(s). - -## Fuzzy rules - -Unfortunately, it is not always possible / desirable to store the resource with a simple transformation. - -A typical situation is that some query parameters are dynamically computed by some Javascript code to include user tracking identifier, current datetime information, ... - -When running again the same javascript code inside the ZIM, the URL will hence be slightly different because context has changed, but the same content needs to be retrieved. - -warc2zim hence relies on fuzzy rules to transform/simplify some URLs when computing the ZIM path. - -## URL Rewriting - -warc2zim transforms (rewrites) URLs found in documents (HTML, CSS, JS, ...) so that they are usable inside the ZIM. 
- -### General case - -One simple example is that we might have following code in an HTML document to load an image with an absolute URL: - -``` - -``` - -The URL `https://en.wikipedia.org/wiki/File:Kiwix_logo_v3.svg` has to be transformed to a URL that it is usable inside the ZIM. - -For proper reader operation, openZIM prohibits using absolute URLs, so this has to be a relative URL. This relative URL is hence dependant on the location of the resource currently being rewriten. - -The table below gives some examples of what the rewritten URL is going to be, depending on the URL of the rewritten document. - -| HTML document URL | image URL rewritten for usage inside the ZIM | -|--|--| -| `https://en.wikipedia.org/wiki/Kiwix` | `./File:Kiwix_logo_v3.svg` | -| `https://en.wikipedia.org/wiki` | `./wiki/File:Kiwix_logo_v3.svg` | -| `https://en.wikipedia.org/waka/Kiwix` | `../wiki/File:Kiwix_logo_v3.svg` | -| `https://fr.wikipedia.org/wiki/Kiwix` | `../../en.wikipedia.org/wiki/File:Kiwix_logo_v3.svg` | - -As can be seen on the last line (but this is true for all URLs), this rewriting has to take into account the convention saying at which ZIM path a given web resource will be stored. - -### Dynamic case - -The explanation above more or less assumed that the transformations can be done statically, i.e warc2zim can open every known document, find existing URLs and replace them with their counterpart inside the ZIM. - -While this is possible for HTML and CSS documents typically, it is not possible when the URL is dynamically computed. This is typically the case for JS documents, where in the general case the URL is not statically stored inside the JS code but computed on-the-fly by aggregating various strings and values. - -Rewriting these computations is not deemed feasible due to the huge variety of situation which might be encountered. 
- -A specific function is hence needed to rewrite URL **live in client browser**, intercept any function triggering a web request, transform the URL according to conventions (where we expect the resource to be located in the general case) and fuzzy rules. - -_Spoiler: this is where we will rely on wombat.js from webrecorder team, since this dynamic interception is quite complex and already done quite neatly by them_ - -### Fuzzy rules - -The same fuzzy rules that have been used to compute the ZIM path from a resource URL have to be applied again when rewriting URLs. - -While this is expected to serve mostly for the dynamic case, we still applies them on both side (staticaly and dynamicaly) for coherency. - -## Documents rewriten statically - -For now warc2zim rewrites HTML, CSS and JS documents. For CSS and JS, this mainly consists in replacing URLs. For HTML, we also have more specific rewritting necessary (e.g. to handle base href or redirects with meta). - -Since 2.1, no domain specific (DS) rules are applied like it is done in wabac.JS because these rules are already applied in Browsertrix Crawler. For the same reason, JSON is not rewritten anymore (URL do not need to be rewritten in JSON because these URLs will be used by JS, intercepted by wombat and dynamically rewritten). - -JSONP callbacks are supposed to be rewritten but this has not been heavily tested. - -Other types of documents are supposed to be either not feasible / not worth it (e.g. URLs inside PDF documents), meaningless (e.g. images, fonts) or planned for later due to limited usage in the wild (e.g. XML). diff --git a/docs/software_architecture.md b/docs/software_architecture.md deleted file mode 100644 index 4a12b56c..00000000 --- a/docs/software_architecture.md +++ /dev/null @@ -1,48 +0,0 @@ -# Software architecture - -## HTML rewriting - -HTML rewriting is purely static (i.e. before resources are written to the ZIM). 
HTML code is parsed with the [HTML parser from Python standard library](https://docs.python.org/3/library/html.parser.html). - -A small header script is inserted in HTML code to initialize wombat.js which will wrap all JS APIs to dynamically rewrite URLs comming from JS. - -This header script is generated using [Jinja2](https://pypi.org/project/Jinja2/) template since it needs to populate some JS context variables needed by wombat.js operations (original scheme, original url, ...). - -## CSS rewriting - -CSS rewriting is purely static (i.e. before resources are written to the ZIM). CSS code is parsed with the [tinycss2 Python library](https://pypi.org/project/tinycss2/). - -## JS rewriting - -### Static - -Static JS rewriting is simply a matter of pure textual manipulation with regular expressions. No parsing is done at all. - -### Dynamic - -Dynamic JS rewriting is done with [wombat JS library](https://github.com/webrecorder/wombat). The same fuzzy rules that are used for static rewritting are injected into wombat configuration. Code to rewrite URLs is an adapted version of the code used to compute ZIM paths. - -For wombat setup, including the URL rewriting part, we need to pass wombat configuration info. This code is developed in the `javascript` folder. For URL parsing, it relies on the [uri-js library](https://www.npmjs.com/package/uri-js). This javascript code is bundled into a single `wombatSetup.js` file with [rollup bundler](https://rollupjs.org), the same bundler used by webrecorder team to bundle wombat. - -## cdxj_indexer and warcio - -[cdxj_indexer Python library](https://pypi.org/project/cdxj-indexer/) is a thin wrapper over [warcio Python library](https://pypi.org/project/warcio/). It used to iterate all record in WARCs. - -It provide two main features: - -- Loop over several WARCs in a directory (A visit of a website may be stored in several WARCs in the same directory). 
-- Provide a buffered access to warcs content (and not a "stream" (fileio) only api) (but monkey patching returned WarcRecord. - -Except that, scraper directly uses WarcRecord (returned by cdxj_indexer, implemented in warcio) to access metadata and such. - -## zimscraperlib - -[zimscraperlib Python library](https://pypi.org/project/zimscraperlib) is used for ZIM operations. - -## requests - -[requests Python library](https://pypi.org/project/requests/) is used to retrieve the custom CSS file when a URL is passed. - -## brotlipy - -[brotlipy Python library](https://pypi.org/project/brotlipy/) is used to access brotli content in WARC records (not part of warcio because it is an optional dependency). diff --git a/docs/technical_architecture.md b/docs/technical_architecture.md deleted file mode 100644 index 61a2586f..00000000 --- a/docs/technical_architecture.md +++ /dev/null @@ -1,100 +0,0 @@ -# Technical architecture - -## Fuzzy rules - -Fuzzy rules are stored in `rules/rules.yaml`. This configuration file is then used by `rules/generateRules.py` to generate Python and JS code. - -Should you update these fuzzy rules, you hence have to: -- regenerate Python and JS files by running `python rules/generateRules.py` -- bundle again Javascript `wombatSetup.js` (see below). - -## Wombat configuration - -Wombat configuration contains some static configuration and the dynamic URL rewriting, including fuzzy rules. - -It is bundled by rollup with `cd javascript && yarn build-prod` and the result is pushed to proper scraper location for inclusion at build time. - -Tests are available and run with `cd javascript && yarn test`. - -## Scraper operations - -### High level overview - -The scraper behavior is done in two phases. - -First the WARC records are iterated to compute the ZIM metadata (find main path, favicon, ...) and detect which ZIM paths are expected to be populated. 
This is mandatory to know when we will rewrite the documents if the URLs we will encounter leads to something which is internal (inside the ZIM) and should be rewriten or external and should be kept as-is. - -Second, the WARC records are iterated to be transformed and appended inside the ZIM. ZIM records are appended to the ZIM on the fly. - -In both phases, WARC records are iterated in natural order, i.e. as they have been retrieved online during the crawl. - -### Transformation of URL into ZIM path - -Transforming a URL into a ZIM path has to respect the ZIM specification: path must not be url-encoded (i.e. it must be decoded) and it must be stored as UTF-8. - -WARC record stores the items URL inside a header named "WARC-Target-URI". The value inside this header is encoded, or more exactly it is "exactly what the browser sent at the HTTP level" (see https://github.com/webrecorder/browsertrix-crawler/issues/492 for more details). - -It has been decided (by convention) that we will drop the scheme, the port, the username and password from the URL. Headers are also not considered in this computation. - -Computation of the ZIM path is hence mostly straightforward: -- decode the hostname which is puny-encoded -- decode the path and query parameter which might be url-encoded - -## Rewriting documents - -Some documents (HTML, CSS, JS and JSON for now) needs to be rewritten, e.g. to rewrite URLs, adapt some code to the ZIM context, ... - -The first important step when processing a WARC entry to add it as a ZIM entry is hence to properly detect which kind of document we are dealing with. - -This is done in the `get_rewrite_mode` function of the `Rewriter` class. Before 2.0.1, scraper was relying only on mimetype as returned in `Content-Type` HTTP response. - -Unfortunately, this caused problems where some server are returning wrong information is this header, e.g. 
Cloudflare seems to frequently return `text/html` for woff2 fonts ; this causes the scraper to fail, because it is impossible to know in advance that we should ignore these errors, we could have a real document which should be rewriten but is failing. - -Since 2.0.1, we've enriched the logic by using the new WARC header `WARC-Resource-Type` which contains the type of resources "as perceived by the browser" (from https://chromedevtools.github.io/devtools-protocol/tot/Network/#type-ResourceType, see https://github.com/webrecorder/browsertrix-crawler/pull/481). Unfortunately this information is not sufficient because of some very generic value returned like `fetch` or `xhr`. Scraper stills need to mix this information with the mimetype. Ideally, we would have prefer to find a single source of truth not relying on something returned by the server, but it is not available for now (see https://github.com/openzim/warc2zim/issues/340 for a discussion on this topic). - -### URL rewriting - -In addition to the computation of the relative path from the current document URL to the URL to rewrite, URL rewriting also consists in computing the proper ZIM path (with same operation as above) and properly encoding it so that the resulting URL respects [RFC 3986](https://datatracker.ietf.org/doc/html/rfc3986). Some important stuff has to be noted in this encoding. - -- since the original hostname is now part of the path, it will now be url-encoded -- since the `?` and following query parameters are also part of the path (we do not want readers to drop them like kiwix-serve would do), they are also url-encoded - -Below is an example case of the rewrite operation on an image URL found in an HTML document. 
- -- Document original URL: `https://kiwix.org/a/article/document.html` -- Document ZIM path: `kiwix.org/a/article/document.html` -- Image original URL: `//xn--exmple-cva.com/a/resource/image.png?foo=bar` -- Image rewritten URL: `../../../ex%C3%A9mple.com/a/resource/image.png%3Ffoo%3Dbar` -- Image ZIM Path: `exémple.com/a/resource/image.png?foo=bar` - -### JS Rewriting - -JS Rewriting is a bit special because rules to apply are different wether we are using "classic" Javascript or "module" Javascript. - -Detection of Javascript modules starts at the HTML level where we have a `