From 9926da51745ef100b5ef7ec4742779828f5827ff Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 14 Nov 2023 12:21:11 +0100 Subject: [PATCH 01/10] Move url rewriting feature in a separated module. --- src/warc2zim/main.py | 54 +--------------------------- src/warc2zim/url_rewriting.py | 66 +++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 53 deletions(-) create mode 100644 src/warc2zim/url_rewriting.py diff --git a/src/warc2zim/main.py b/src/warc2zim/main.py index 94558338..74f40f60 100644 --- a/src/warc2zim/main.py +++ b/src/warc2zim/main.py @@ -53,6 +53,7 @@ from cdxj_indexer import iter_file_or_dir, buffering_record_iter +from .url_rewriting import FUZZY_RULES, canonicalize # Shared logger logger = logging.getLogger("warc2zim") @@ -74,45 +75,6 @@ # Default ZIM metadata tags DEFAULT_TAGS = ["_ftindex:yes", "_category:other", "_sw:yes"] - -FUZZY_RULES = [ - { - "match": re.compile( - # r"//.*googlevideo.com/(videoplayback\?).*(id=[^&]+).*([&]itag=[^&]+).*" - r"//.*googlevideo.com/(videoplayback\?).*((?<=[?&])id=[^&]+).*" - ), - "replace": r"//youtube.fuzzy.replayweb.page/\1\2", - }, - { - "match": re.compile( - r"//(?:www\.)?youtube(?:-nocookie)?\.com/(get_video_info\?)" - r".*(video_id=[^&]+).*" - ), - "replace": r"//youtube.fuzzy.replayweb.page/\1\2", - }, - {"match": re.compile(r"(\.[^?]+\?)[\d]+$"), "replace": r"\1"}, - { - "match": re.compile( - r"//(?:www\.)?youtube(?:-nocookie)?\.com\/(youtubei\/[^?]+).*(videoId[^&]+).*" - ), - "replace": r"//youtube.fuzzy.replayweb.page/\1?\2", - }, - { - "match": re.compile(r"//(?:www\.)?youtube(?:-nocookie)?\.com/embed/([^?]+).*"), - "replace": r"//youtube.fuzzy.replayweb.page/embed/\1", - }, - { - "match": re.compile( - r".*(?:gcs-vimeo|vod|vod-progressive)\.akamaized\.net.*?/([\d/]+.mp4)$" - ), - "replace": r"vimeo-cdn.fuzzy.replayweb.page/\1", - }, - { - "match": re.compile(r".*player.vimeo.com/(video/[\d]+)\?.*"), - "replace": r"vimeo.fuzzy.replayweb.page/\1", - }, -] - CUSTOM_CSS_URL = "https://warc2zim.kiwix.app/custom.css" DUPLICATE_EXC_STR = re.compile( @@ -811,20 +773,6 @@ def warc2zim(args=None): return warc2zim.run() -# ============================================================================ -def canonicalize(url): - """Return a 'canonical' version of the url under which it is stored in the ZIM - For now, just removing the scheme http:// or https:// scheme - """ - if url.startswith("https://"): - return url[8:] - - if url.startswith("http://"): - return url[7:] - - return url - - # ============================================================================ def get_version(): return pkg_resources.get_distribution("warc2zim").version diff --git a/src/warc2zim/url_rewriting.py b/src/warc2zim/url_rewriting.py new file mode 100644 index 00000000..b9595176 --- /dev/null +++ b/src/warc2zim/url_rewriting.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim: ai ts=4 sts=4 et sw=4 nu + +""" warc2zim's url rewriting tools + +This module is about url and entry path rewriting. +""" + +import logging +import re + +# Shared logger +logger = logging.getLogger("warc2zim.url_rewriting") + + +FUZZY_RULES = [ + { + "match": re.compile( + # r"//.*googlevideo.com/(videoplayback\?).*(id=[^&]+).*([&]itag=[^&]+).*" + r"//.*googlevideo.com/(videoplayback\?).*((?<=[?&])id=[^&]+).*" + ), + "replace": r"//youtube.fuzzy.replayweb.page/\1\2", + }, + { + "match": re.compile( + r"//(?:www\.)?youtube(?:-nocookie)?\.com/(get_video_info\?)" + r".*(video_id=[^&]+).*" + ), + "replace": r"//youtube.fuzzy.replayweb.page/\1\2", + }, + {"match": re.compile(r"(\.[^?]+\?)[\d]+$"), "replace": r"\1"}, + { + "match": re.compile( + r"//(?:www\.)?youtube(?:-nocookie)?\.com\/(youtubei\/[^?]+).*(videoId[^&]+).*" + ), + "replace": r"//youtube.fuzzy.replayweb.page/\1?\2", + }, + { + "match": re.compile(r"//(?:www\.)?youtube(?:-nocookie)?\.com/embed/([^?]+).*"), + "replace": r"//youtube.fuzzy.replayweb.page/embed/\1", + }, + { + "match": re.compile( + r".*(?:gcs-vimeo|vod|vod-progressive)\.akamaized\.net.*?/([\d/]+.mp4)$" + ), + "replace": r"vimeo-cdn.fuzzy.replayweb.page/\1", + }, + { + "match": re.compile(r".*player.vimeo.com/(video/[\d]+)\?.*"), + "replace": r"vimeo.fuzzy.replayweb.page/\1", + }, +] + + +def canonicalize(url): + """Return a 'canonical' version of the url under which it is stored in the ZIM + For now, just removing the scheme http:// or https:// scheme + """ + if url.startswith("https://"): + return url[8:] + + if url.startswith("http://"): + return url[7:] + + return url From b9d6287d86c366f7b0f1bdb3848129e1d5fab2fe Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 28 Nov 2023 15:17:00 +0100 Subject: [PATCH 02/10] fixup! Move url rewriting feature in a separated module. --- src/warc2zim/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/warc2zim/main.py b/src/warc2zim/main.py index 74f40f60..2f56a768 100644 --- a/src/warc2zim/main.py +++ b/src/warc2zim/main.py @@ -53,7 +53,7 @@ from cdxj_indexer import iter_file_or_dir, buffering_record_iter -from .url_rewriting import FUZZY_RULES, canonicalize +from warc2zim.url_rewriting import FUZZY_RULES, canonicalize # Shared logger logger = logging.getLogger("warc2zim") From a29149e2d369e8ef9bcd62b4b1a3fb004776e709 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 14 Nov 2023 14:43:11 +0100 Subject: [PATCH 03/10] Move creator items into `items.py`. --- src/warc2zim/items.py | 154 ++++++++++++++++++++++++++++++++++++++++++ src/warc2zim/main.py | 148 +++------------------------------------- 2 files changed, 163 insertions(+), 139 deletions(-) create mode 100644 src/warc2zim/items.py diff --git a/src/warc2zim/items.py b/src/warc2zim/items.py new file mode 100644 index 00000000..bf888a4b --- /dev/null +++ b/src/warc2zim/items.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim: ai ts=4 sts=4 et sw=4 nu + +""" warc2zim's item classes + +This module contains the differents Item we may want to add to a Zim archive. +""" + +import logging +import re + +import pkg_resources +from libzim.writer import Hint +from zimscraperlib.types import get_mime_for_name +from zimscraperlib.zim.items import StaticItem +from zimscraperlib.zim.providers import StringProvider + +from bs4 import BeautifulSoup + +from .url_rewriting import canonicalize + +# Shared logger +logger = logging.getLogger("warc2zim.items") + +# external sw.js filename +SW_JS = "sw.js" + +HEAD_INS = re.compile(b"()", re.I) +CSS_INS = re.compile(b"()", re.I) + + +class WARCHeadersItem(StaticItem): + """WARCHeadersItem used to store the WARC + HTTP headers as text + Usually stored under H namespace + """ + + def __init__(self, record): + super().__init__() + self.record = record + self.url = get_record_url(record) + + def get_path(self): + return "H/" + canonicalize(self.url) + + def get_title(self): + return "" + + def get_mimetype(self): + return "application/warc-headers" + + def get_hints(self): + return {Hint.FRONT_ARTICLE: False} + + def get_contentprovider(self): + # add WARC headers + buff = self.record.rec_headers.to_bytes(encoding="utf-8") + # add HTTP headers, if present + if self.record.http_headers: + buff += self.record.http_headers.to_bytes(encoding="utf-8") + + return StringProvider(content=buff, ref=self) + + +class WARCPayloadItem(StaticItem): + """WARCPayloadItem used to store the WARC payload + Usually stored under A namespace + """ + + def __init__(self, record, head_insert=None, css_insert=None): + super().__init__() + self.record = record + self.url = get_record_url(record) + self.mimetype = get_record_mime_type(record) + self.title = "" + + if hasattr(self.record, "buffered_stream"): + self.record.buffered_stream.seek(0) + self.content = self.record.buffered_stream.read() + else: + self.content = self.record.content_stream().read() + + if self.mimetype.startswith("text/html"): + self.title = parse_title(self.content) + if head_insert: + self.content = HEAD_INS.sub(head_insert, self.content) + if css_insert: + self.content = CSS_INS.sub(css_insert, self.content) + + def get_path(self): + return "A/" + canonicalize(self.url) + + def get_title(self): + return self.title + + def get_hints(self): + is_front = self.mimetype.startswith("text/html") + return {Hint.FRONT_ARTICLE: is_front} + + +class StaticArticle(StaticItem): + def __init__(self, env, filename, main_url, **kwargs): + super().__init__(**kwargs) + self.filename = filename + self.main_url = main_url + + self.mime = get_mime_for_name(filename) + self.mime = self.mime or "application/octet-stream" + + if filename != SW_JS: + template = env.get_template(filename) + self.content = template.render(MAIN_URL=self.main_url) + else: + self.content = pkg_resources.resource_string( + "warc2zim", "templates/" + filename + ).decode("utf-8") + + def get_path(self): + return "A/" + self.filename + + def get_mimetype(self): + return self.mime + + def get_hints(self): + return {Hint.FRONT_ARTICLE: False} + + +def get_record_url(record): + """Check if record has url converted from POST/PUT, and if so, use that + otherwise return the target url""" + if hasattr(record, "urlkey"): + return record.urlkey + return record.rec_headers["WARC-Target-URI"] + + +def get_record_mime_type(record): + if record.http_headers: + # if the record has HTTP headers, use the Content-Type from those + # (eg. 'response' record) + content_type = record.http_headers["Content-Type"] + else: + # otherwise, use the Content-Type from WARC headers + content_type = record.rec_headers["Content-Type"] + + mime = content_type or "" + return mime.split(";")[0] + + +def parse_title(content): + try: + soup = BeautifulSoup(content, "html.parser") + return soup.title.text or "" + except Exception: + return "" diff --git a/src/warc2zim/main.py b/src/warc2zim/main.py index 2f56a768..328d9bce 100644 --- a/src/warc2zim/main.py +++ b/src/warc2zim/main.py @@ -35,18 +35,15 @@ import pkg_resources import requests -from libzim.writer import Hint from warcio import ArchiveIterator, StatusAndHeaders from warcio.recordbuilder import RecordBuilder from zimscraperlib.constants import DEFAULT_DEV_ZIM_METADATA from zimscraperlib.download import stream_file -from zimscraperlib.types import get_mime_for_name from zimscraperlib.i18n import setlocale, get_language_details, Locale from zimscraperlib.image.convertion import convert_image from zimscraperlib.image.transformation import resize_image from zimscraperlib.zim.creator import Creator -from zimscraperlib.zim.items import StaticItem, URLItem -from zimscraperlib.zim.providers import StringProvider +from zimscraperlib.zim.items import URLItem from bs4 import BeautifulSoup from jinja2 import Environment, PackageLoader @@ -54,6 +51,14 @@ from cdxj_indexer import iter_file_or_dir, buffering_record_iter from warc2zim.url_rewriting import FUZZY_RULES, canonicalize +from .items import ( + WARCHeadersItem, + WARCPayloadItem, + StaticArticle, + get_record_url, + get_record_mime_type, + parse_title, +) # Shared logger logger = logging.getLogger("warc2zim") @@ -67,11 +72,6 @@ # head insert template HEAD_INSERT_FILE = "sw_check.html" - -HEAD_INS = re.compile(b"()", re.I) -CSS_INS = re.compile(b"()", re.I) - - # Default ZIM metadata tags DEFAULT_TAGS = ["_ftindex:yes", "_category:other", "_sw:yes"] @@ -85,104 +85,6 @@ ) -# ============================================================================ -class WARCHeadersItem(StaticItem): - """WARCHeadersItem used to store the WARC + HTTP headers as text - Usually stored under H namespace - """ - - def __init__(self, record): - super().__init__() - self.record = record - self.url = get_record_url(record) - - def get_path(self): - return "H/" + canonicalize(self.url) - - def get_title(self): - return "" - - def get_mimetype(self): - return "application/warc-headers" - - def get_hints(self): - return {Hint.FRONT_ARTICLE: False} - - def get_contentprovider(self): - # add WARC headers - buff = self.record.rec_headers.to_bytes(encoding="utf-8") - # add HTTP headers, if present - if self.record.http_headers: - buff += self.record.http_headers.to_bytes(encoding="utf-8") - - return StringProvider(content=buff, ref=self) - - -# ============================================================================ -class WARCPayloadItem(StaticItem): - """WARCPayloadItem used to store the WARC payload - Usually stored under A namespace - """ - - def __init__(self, record, head_insert=None, css_insert=None): - super().__init__() - self.record = record - self.url = get_record_url(record) - self.mimetype = get_record_mime_type(record) - self.title = "" - - if hasattr(self.record, "buffered_stream"): - self.record.buffered_stream.seek(0) - self.content = self.record.buffered_stream.read() - else: - self.content = self.record.content_stream().read() - - if self.mimetype.startswith("text/html"): - self.title = parse_title(self.content) - if head_insert: - self.content = HEAD_INS.sub(head_insert, self.content) - if css_insert: - self.content = CSS_INS.sub(css_insert, self.content) - - def get_path(self): - return "A/" + canonicalize(self.url) - - def get_title(self): - return self.title - - def get_hints(self): - is_front = self.mimetype.startswith("text/html") - return {Hint.FRONT_ARTICLE: is_front} - - -# ============================================================================ -class StaticArticle(StaticItem): - def __init__(self, env, filename, main_url, **kwargs): - super().__init__(**kwargs) - self.filename = filename - self.main_url = main_url - - self.mime = get_mime_for_name(filename) - self.mime = self.mime or "application/octet-stream" - - if filename != SW_JS: - template = env.get_template(filename) - self.content = template.render(MAIN_URL=self.main_url) - else: - self.content = pkg_resources.resource_string( - "warc2zim", "templates/" + filename - ).decode("utf-8") - - def get_path(self): - return "A/" + self.filename - - def get_mimetype(self): - return self.mime - - def get_hints(self): - return {Hint.FRONT_ARTICLE: False} - - # ============================================================================ class WARC2Zim: def __init__(self, args): @@ -648,38 +550,6 @@ def add_fuzzy_match_record(self, url): logger.debug("Adding fuzzy redirect {0} -> {1}".format(fuzzy_url, url)) -# ============================================================================ -def get_record_url(record): - """Check if record has url converted from POST/PUT, and if so, use that - otherwise return the target url""" - if hasattr(record, "urlkey"): - return record.urlkey - return record.rec_headers["WARC-Target-URI"] - - -# ============================================================================ -def get_record_mime_type(record): - if record.http_headers: - # if the record has HTTP headers, use the Content-Type from those - # (eg. 'response' record) - content_type = record.http_headers["Content-Type"] - else: - # otherwise, use the Content-Type from WARC headers - content_type = record.rec_headers["Content-Type"] - - mime = content_type or "" - return mime.split(";")[0] - - -# ============================================================================ -def parse_title(content): - try: - soup = BeautifulSoup(content, "html.parser") - return soup.title.text or "" - except Exception: - return "" - - # ============================================================================ def iter_warc_records(inputs): """iter warc records, including appending request data to matching response""" From 30478705ff76900060f71de5ceb5416c894acb5d Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 28 Nov 2023 15:19:36 +0100 Subject: [PATCH 04/10] fixup! Move creator items into `items.py`. --- src/warc2zim/items.py | 2 +- src/warc2zim/main.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/warc2zim/items.py b/src/warc2zim/items.py index bf888a4b..726bd956 100644 --- a/src/warc2zim/items.py +++ b/src/warc2zim/items.py @@ -18,7 +18,7 @@ from bs4 import BeautifulSoup -from .url_rewriting import canonicalize +from warc2zim.url_rewriting import canonicalize # Shared logger logger = logging.getLogger("warc2zim.items") diff --git a/src/warc2zim/main.py b/src/warc2zim/main.py index 328d9bce..623f87d4 100644 --- a/src/warc2zim/main.py +++ b/src/warc2zim/main.py @@ -51,7 +51,7 @@ from cdxj_indexer import iter_file_or_dir, buffering_record_iter from warc2zim.url_rewriting import FUZZY_RULES, canonicalize -from .items import ( +from warc2zim.items import ( WARCHeadersItem, WARCPayloadItem, StaticArticle, From 9a7f961e168100e30cc7a4002817df47a11722a5 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 14 Nov 2023 14:56:23 +0100 Subject: [PATCH 05/10] Move `WARC2Zim` class to `converter.py` And rename it `Converter`. --- src/warc2zim/converter.py | 561 ++++++++++++++++++++++++++++++++++++++ src/warc2zim/main.py | 543 +----------------------------------- tests/test_warc_to_zim.py | 13 +- 3 files changed, 571 insertions(+), 546 deletions(-) create mode 100644 src/warc2zim/converter.py diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py new file mode 100644 index 00000000..66bb5141 --- /dev/null +++ b/src/warc2zim/converter.py @@ -0,0 +1,561 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim: ai ts=4 sts=4 et sw=4 nu + +""" warc2zim conversion utility + +This utility provides a conversion from WARC records to ZIM files. +The WARCs are converted in a 'lossless' way, no data from WARC records is lost. +Each WARC record results in two ZIM items: +- The WARC payload is stored under /A/ +- The WARC headers + HTTP headers are stored under the /H/ + +Given a WARC response record for 'https://example.com/', +two ZIM items are created /A/example.com/ and /H/example.com/ are created. + +Only WARC response and resource records are stored. + +If the WARC contains multiple entries for the same URL, only the first entry is added, +and later entries are ignored. A warning is printed as well. + +""" + +import os +import json +import pathlib +import logging +import tempfile +import datetime +import re +import io +import time +from urllib.parse import urlsplit, urljoin, urlunsplit, urldefrag + +import pkg_resources +import requests +from warcio import ArchiveIterator, StatusAndHeaders +from warcio.recordbuilder import RecordBuilder +from zimscraperlib.constants import DEFAULT_DEV_ZIM_METADATA +from zimscraperlib.download import stream_file +from zimscraperlib.i18n import setlocale, get_language_details, Locale +from zimscraperlib.image.convertion import convert_image +from zimscraperlib.image.transformation import resize_image +from zimscraperlib.zim.creator import Creator +from zimscraperlib.zim.items import URLItem + +from bs4 import BeautifulSoup + +from jinja2 import Environment, PackageLoader + +from cdxj_indexer import iter_file_or_dir, buffering_record_iter + +from .url_rewriting import FUZZY_RULES, canonicalize +from .items import ( + WARCHeadersItem, + WARCPayloadItem, + StaticArticle, + get_record_url, + get_record_mime_type, + parse_title, +) + +# Shared logger +logger = logging.getLogger("warc2zim.converter") + +# HTML mime types +HTML_TYPES = ("text/html", "application/xhtml", "application/xhtml+xml") + +# external sw.js filename +SW_JS = "sw.js" + +# head insert template +HEAD_INSERT_FILE = "sw_check.html" + +# Default ZIM metadata tags +DEFAULT_TAGS = ["_ftindex:yes", "_category:other", "_sw:yes"] + +CUSTOM_CSS_URL = "https://warc2zim.kiwix.app/custom.css" + +DUPLICATE_EXC_STR = re.compile( + r"^Impossible to add(.+)" + r"dirent\'s title to add is(.+)" + r"existing dirent's title is(.+)", + re.MULTILINE | re.DOTALL, +) + + +class Converter: + def __init__(self, args): + logging.basicConfig(format="[%(levelname)s] %(message)s") + if args.verbose: + logger.setLevel(logging.DEBUG) + else: + logger.setLevel(logging.INFO) + + self.main_url = args.url + # ensure trailing slash is added if missing + parts = urlsplit(self.main_url) + if parts.path == "": + parts = list(parts) + # set path + parts[2] = "/" + self.main_url = urlunsplit(parts) + + self.name = args.name + self.title = args.title + self.favicon_url = args.favicon + self.language = args.lang + self.description = args.description + self.long_description = args.long_description + self.creator_metadata = args.creator + self.publisher = args.publisher + self.tags = DEFAULT_TAGS + (args.tags or []) + self.source = args.source or self.main_url + self.scraper = "warc2zim " + get_version() + self.illustration = b"" + + self.output = args.output + self.zim_file = args.zim_file + + if not self.zim_file: + self.zim_file = "{name}_{period}.zim".format( + name=self.name, period="{period}" + ) + self.zim_file = self.zim_file.format(period=time.strftime("%Y-%m")) + self.full_filename = os.path.join(self.output, self.zim_file) + + # ensure output file is writable + with tempfile.NamedTemporaryFile(dir=self.output, delete=True) as fh: + logger.debug(f"Confirming output is writable using {fh.name}") + + self.inputs = args.inputs + self.include_domains = args.include_domains + + self.replay_viewer_source = args.replay_viewer_source + self.custom_css = args.custom_css + + self.indexed_urls = set({}) + self.revisits = {} + + # progress file handling + self.stats_filename = ( + pathlib.Path(args.progress_file) if args.progress_file else None + ) + if self.stats_filename and not self.stats_filename.is_absolute(): + self.stats_filename = self.output / self.stats_filename + + self.written_records = self.total_records = 0 + + def add_replayer(self): + if self.replay_viewer_source and re.match( + r"^https?\:", self.replay_viewer_source + ): + self.creator.add_item( + URLItem( + url=self.replay_viewer_source + SW_JS, + path="A/" + SW_JS, + mimetype="application/javascript", + ) + ) + elif self.replay_viewer_source: + self.creator.add_item_for( + fpath=self.replay_viewer_source + SW_JS, + path="A/" + SW_JS, + mimetype="application/javascript", + ) + else: + self.creator.add_item( + StaticArticle( + self.env, SW_JS, self.main_url, mimetype="application/javascript" + ) + ) + + def init_env(self): + # autoescape=False to allow injecting html entities from translated text + env = Environment( + loader=PackageLoader("warc2zim", "templates"), + extensions=["jinja2.ext.i18n"], + autoescape=False, + ) + + try: + env.install_gettext_translations(Locale.translation) + except OSError: + logger.warning( + "No translations table found for language: {0}".format(self.language) + ) + env.install_null_translations() + + return env + + def update_stats(self): + """write progress as JSON to self.stats_filename if requested""" + if not self.stats_filename: + return + self.written_records += 1 + with open(self.stats_filename, "w") as fh: + json.dump( + {"written": self.written_records, "total": self.total_records}, fh + ) + + def get_custom_css_record(self): + if re.match(r"^https?\://", self.custom_css): + resp = requests.get(self.custom_css, timeout=10) + resp.raise_for_status() + payload = resp.content + else: + css_path = pathlib.Path(self.custom_css).expanduser().resolve() + with open(css_path, "rb") as fh: + payload = fh.read() + + http_headers = StatusAndHeaders( + "200 OK", + [("Content-Type", 'text/css; charset="UTF-8"')], + protocol="HTTP/1.0", + ) + + return RecordBuilder().create_warc_record( + CUSTOM_CSS_URL, + "response", + payload=io.BytesIO(payload), + length=len(payload), + http_headers=http_headers, + ) + + def run(self): + if not self.inputs: + logger.info( + "Arguments valid, no inputs to process. Exiting with error code 100" + ) + return 100 + + self.find_main_page_metadata() + self.title = self.title or "Untitled" + if len(self.title) > 30: + self.title = f"{self.title[0:29]}…" + self.retrieve_illustration() + self.convert_illustration() + + # make sure Language metadata is ISO-639-3 and setup translations + try: + lang_data = get_language_details(self.language) + self.language = lang_data["iso-639-3"] + except Exception: + logger.error(f"Invalid language setting `{self.language}`. Using `eng`.") + self.language = "eng" + + # try to set locale to language. Might fail (missing locale) + try: + setlocale(pathlib.Path(__file__).parent, lang_data.get("iso-639-1")) + except Exception: + ... + + self.env = self.init_env() + + # init head insert + template = self.env.get_template(HEAD_INSERT_FILE) + self.head_insert = ("" + template.render()).encode("utf-8") + if self.custom_css: + self.css_insert = ( + f'\n\n' + ).encode("utf-8") + else: + self.css_insert = None + + self.creator = Creator( + self.full_filename, + main_path="A/index.html", + ) + + self.creator.config_metadata( + Name=self.name, + Language=self.language or "eng", + Title=self.title, + Description=self.description, + LongDescription=self.long_description, + Creator=self.creator_metadata, + Publisher=self.publisher, + Date=datetime.date.today(), + Illustration_48x48_at_1=self.illustration, + Tags=";".join(self.tags), + Source=self.source, + Scraper=f"warc2zim {get_version()}", + ).start() + + self.add_replayer() + + for filename in pkg_resources.resource_listdir("warc2zim", "templates"): + if filename == HEAD_INSERT_FILE or filename == SW_JS: + continue + + self.creator.add_item(StaticArticle(self.env, filename, self.main_url)) + + for record in self.iter_all_warc_records(): + self.add_items_for_warc_record(record) + + # process revisits, headers only + for url, record in self.revisits.items(): + if canonicalize(url) not in self.indexed_urls: + logger.debug( + "Adding revisit {0} -> {1}".format( + url, record.rec_headers["WARC-Refers-To-Target-URI"] + ) + ) + try: + self.creator.add_item(WARCHeadersItem(record)) + except RuntimeError as exc: + if not DUPLICATE_EXC_STR.match(str(exc)): + raise exc + self.indexed_urls.add(canonicalize(url)) + + logger.debug(f"Found {self.total_records} records in WARCs") + + self.creator.finish() + + def iter_all_warc_records(self): + # add custom css records + if self.custom_css: + yield self.get_custom_css_record() + + yield from iter_warc_records(self.inputs) + + def find_main_page_metadata(self): + for record in self.iter_all_warc_records(): + if record.rec_type == "revisit": + continue + + # if no main_url, use first 'text/html' record as the main page by default + # not guaranteed to always work + mime = get_record_mime_type(record) + + url = record.rec_headers["WARC-Target-URI"] + + if ( + not self.main_url + and mime == "text/html" + and record.payload_length != 0 + and ( + not record.http_headers + or record.http_headers.get_statuscode() == "200" + ) + ): + self.main_url = url + + if urldefrag(self.main_url).url != url: + continue + + # if we get here, found record for the main page + + # if main page is not html, still allow (eg. could be text, img), + # but print warning + if mime not in HTML_TYPES: + logger.warning( + "Main page is not an HTML Page, mime type is: {0} " + "- Skipping Favicon and Language detection".format(mime) + ) + return + + record.buffered_stream.seek(0) + content = record.buffered_stream.read() + + if not self.title: + self.title = parse_title(content) + + self.find_icon_and_language(content) + + logger.debug("Title: {0}".format(self.title)) + logger.debug("Language: {0}".format(self.language)) + logger.debug("Favicon: {0}".format(self.favicon_url)) + return + + raise KeyError( + f"Unable to find WARC record for main page: {self.main_url}, aborting" + ) + + def find_icon_and_language(self, content): + soup = BeautifulSoup(content, "html.parser") + + if not self.favicon_url: + # find icon + icon = soup.find("link", rel="shortcut icon") + if not icon: + icon = soup.find("link", rel="icon") + + if icon and icon.attrs.get("href"): + self.favicon_url = urljoin(self.main_url, icon.attrs["href"]) + else: + self.favicon_url = urljoin(self.main_url, "/favicon.ico") + + if not self.language: + # HTML5 Standard + lang_elem = soup.find("html", attrs={"lang": True}) + if lang_elem: + self.language = lang_elem.attrs["lang"] + return + + # W3C recommendation + lang_elem = soup.find( + "meta", {"http-equiv": "content-language", "content": True} + ) + if lang_elem: + self.language = lang_elem.attrs["content"] + return + + # SEO Recommendations + lang_elem = soup.find("meta", {"name": "language", "content": True}) + if lang_elem: + self.language = lang_elem.attrs["content"] + return + + def retrieve_illustration(self): + """sets self.illustration from self.favicon_url either from WARC or download + + Uses fallback in case of errors/missing""" + if not self.favicon_url: + self.favicon_url = "fallback.png" + self.illustration = DEFAULT_DEV_ZIM_METADATA["Illustration_48x48_at_1"] + return + # look into WARC records first + for record in self.iter_all_warc_records(): + url = get_record_url(record) + if not url or record.rec_type == "revisit": + continue + if url == self.favicon_url: + logger.debug(f"Found WARC record for favicon: {self.favicon_url}") + if record and record.http_headers.get_statuscode() != "200": + logger.warning("WARC record for favicon is unuable. Skipping") + self.favicon_url = "fallback.png" + self.illustration = DEFAULT_DEV_ZIM_METADATA[ + "Illustration_48x48_at_1" + ] + return + if hasattr(record, "buffered_stream"): + record.buffered_stream.seek(0) + self.illustration = record.buffered_stream.read() + else: + self.illustration = record.content_stream().read() + return + + # favicon_url not in WARC ; downloading + try: + dst = io.BytesIO() + if not stream_file(self.favicon_url, byte_stream=dst)[0]: + raise IOError("No bytes received downloading favicon") + self.illustration = dst.getvalue() + except Exception as exc: + logger.warning(f"Unable to retrieve favicon. Using fallback: {exc}") + self.favicon_url = "fallback.png" + self.illustration = DEFAULT_DEV_ZIM_METADATA["Illustration_48x48_at_1"] + return + + def convert_illustration(self): + """convert self.illustration into a 48x48px PNG with fallback""" + src = io.BytesIO(self.illustration) + dst = io.BytesIO() + try: + convert_image(src, dst, fmt="PNG") + resize_image(dst, width=48, height=48, method="cover") + except Exception as exc: + logger.warning(f"Failed to convert or resize favicon: {exc}") + self.illustration = DEFAULT_DEV_ZIM_METADATA["Illustration_48x48_at_1"] + else: + self.illustration = dst.getvalue() + + def is_self_redirect(self, record, url): + if record.rec_type != "response": + return False + + if ( + not record.http_headers.get_statuscode().startswith("3") + or record.http_headers.get_statuscode() == "300" + ): + return False + + location = record.http_headers.get("Location", "") + return canonicalize(url) == canonicalize(location) + + def add_items_for_warc_record(self, record): + url = get_record_url(record) + if not url: + logger.debug(f"Skipping record with empty WARC-Target-URI {record}") + return + + if canonicalize(url) in self.indexed_urls: + logger.debug("Skipping duplicate {0}, already added to ZIM".format(url)) + return + + # if include_domains is set, only include urls from those domains + if self.include_domains: + parts = urlsplit(url) + if not any( + parts.netloc.endswith(domain) for domain in self.include_domains + ): + logger.debug("Skipping url {0}, outside included domains".format(url)) + return + + if record.rec_type != "revisit": + if self.is_self_redirect(record, url): + logger.debug("Skipping self-redirect: " + url) + return + + try: + self.creator.add_item(WARCHeadersItem(record)) + except RuntimeError as exc: + if not DUPLICATE_EXC_STR.match(str(exc)): + raise exc + + payload_item = WARCPayloadItem(record, self.head_insert, self.css_insert) + + if len(payload_item.content) != 0: + try: + self.creator.add_item(payload_item) + except RuntimeError as exc: + if not DUPLICATE_EXC_STR.match(str(exc)): + raise exc + self.total_records += 1 + self.update_stats() + + self.indexed_urls.add(canonicalize(url)) + + elif ( + record.rec_headers["WARC-Refers-To-Target-URI"] != url + and url not in self.revisits + ): + self.revisits[url] = record + + self.add_fuzzy_match_record(url) + + def add_fuzzy_match_record(self, url): + fuzzy_url = url + for rule in FUZZY_RULES: + fuzzy_url = rule["match"].sub(rule["replace"], url) + if fuzzy_url != url: + break + + if fuzzy_url == url: + return + + http_headers = StatusAndHeaders("302 Redirect", {"Location": url}) + + date = datetime.datetime.utcnow().isoformat() + builder = RecordBuilder() + record = builder.create_revisit_record( + fuzzy_url, "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", url, date, http_headers + ) + + self.revisits[fuzzy_url] = record + logger.debug("Adding fuzzy redirect {0} -> {1}".format(fuzzy_url, url)) + + +def iter_warc_records(inputs): + """iter warc records, including appending request data to matching response""" + for filename in iter_file_or_dir(inputs): + with open(filename, "rb") as fh: + for record in buffering_record_iter(ArchiveIterator(fh), post_append=True): + if record.rec_type in ("resource", "response", "revisit"): + yield record + + +def get_version(): + return pkg_resources.get_distribution("warc2zim").version diff --git a/src/warc2zim/main.py b/src/warc2zim/main.py index 623f87d4..2f127cbd 100644 --- a/src/warc2zim/main.py +++ b/src/warc2zim/main.py @@ -20,547 +20,16 @@ """ -import os import sys -import json -import pathlib import logging -import tempfile -import datetime -import re -import io -import time from argparse import ArgumentParser -from urllib.parse import urlsplit, urljoin, urlunsplit, urldefrag -import pkg_resources -import requests -from warcio import ArchiveIterator, StatusAndHeaders -from warcio.recordbuilder import RecordBuilder -from zimscraperlib.constants import DEFAULT_DEV_ZIM_METADATA -from zimscraperlib.download import stream_file -from zimscraperlib.i18n import setlocale, get_language_details, Locale -from zimscraperlib.image.convertion import convert_image -from zimscraperlib.image.transformation import resize_image -from zimscraperlib.zim.creator import Creator -from zimscraperlib.zim.items import URLItem -from bs4 import BeautifulSoup - -from jinja2 import Environment, PackageLoader - -from cdxj_indexer import iter_file_or_dir, buffering_record_iter - -from warc2zim.url_rewriting import FUZZY_RULES, canonicalize -from warc2zim.items import ( - WARCHeadersItem, - WARCPayloadItem, - StaticArticle, - get_record_url, - get_record_mime_type, - parse_title, -) +from .converter import Converter, get_version # Shared logger logger = logging.getLogger("warc2zim") -# HTML mime types -HTML_TYPES = ("text/html", "application/xhtml", "application/xhtml+xml") - -# external sw.js filename -SW_JS = "sw.js" - -# head insert template -HEAD_INSERT_FILE = "sw_check.html" - -# Default ZIM metadata tags -DEFAULT_TAGS = ["_ftindex:yes", "_category:other", "_sw:yes"] - -CUSTOM_CSS_URL = "https://warc2zim.kiwix.app/custom.css" - -DUPLICATE_EXC_STR = re.compile( - r"^Impossible to add(.+)" - r"dirent\'s title to add is(.+)" - r"existing dirent's title is(.+)", - re.MULTILINE | re.DOTALL, -) - - -# ============================================================================ -class WARC2Zim: - def __init__(self, args): - logging.basicConfig(format="[%(levelname)s] %(message)s") - if args.verbose: - logger.setLevel(logging.DEBUG) - else: - logger.setLevel(logging.INFO) - - self.main_url = args.url - # ensure trailing slash is added if missing - parts = urlsplit(self.main_url) - if parts.path == "": - parts = list(parts) - # set path - parts[2] = "/" - self.main_url = urlunsplit(parts) - - self.name = args.name - self.title = args.title - self.favicon_url = args.favicon - self.language = args.lang - self.description = args.description - self.long_description = args.long_description - self.creator_metadata = args.creator - self.publisher = args.publisher - self.tags = DEFAULT_TAGS + (args.tags or []) - self.source = args.source or self.main_url - self.scraper = "warc2zim " + get_version() - self.illustration = b"" - - self.output = args.output - self.zim_file = args.zim_file - - if not self.zim_file: - self.zim_file = "{name}_{period}.zim".format( - name=self.name, period="{period}" - ) - self.zim_file = self.zim_file.format(period=time.strftime("%Y-%m")) - self.full_filename = os.path.join(self.output, self.zim_file) - - # ensure output file is writable - with tempfile.NamedTemporaryFile(dir=self.output, delete=True) as fh: - logger.debug(f"Confirming output is writable using {fh.name}") - - self.inputs = args.inputs - self.include_domains = args.include_domains - - self.replay_viewer_source = args.replay_viewer_source - self.custom_css = args.custom_css - - self.indexed_urls = set({}) - self.revisits = {} - - # progress file handling - self.stats_filename = ( - pathlib.Path(args.progress_file) if args.progress_file else None - ) - if self.stats_filename and not self.stats_filename.is_absolute(): - self.stats_filename = self.output / self.stats_filename - - self.written_records = self.total_records = 0 - - def add_replayer(self): - if self.replay_viewer_source and re.match( - r"^https?\:", self.replay_viewer_source - ): - self.creator.add_item( - URLItem( - url=self.replay_viewer_source + SW_JS, - path="A/" + SW_JS, - mimetype="application/javascript", - ) - ) - elif self.replay_viewer_source: - self.creator.add_item_for( - fpath=self.replay_viewer_source + SW_JS, - path="A/" + SW_JS, - mimetype="application/javascript", - ) - else: - self.creator.add_item( - StaticArticle( - self.env, SW_JS, self.main_url, mimetype="application/javascript" - ) - ) - - def init_env(self): - # autoescape=False to allow injecting html entities from translated text - env = Environment( - loader=PackageLoader("warc2zim", "templates"), - extensions=["jinja2.ext.i18n"], - autoescape=False, - ) - - try: - env.install_gettext_translations(Locale.translation) - except OSError: - logger.warning( - "No translations table found for language: {0}".format(self.language) - ) - env.install_null_translations() - - return env - - def update_stats(self): - """write progress as JSON to self.stats_filename if requested""" - if not self.stats_filename: - return - self.written_records += 1 - with open(self.stats_filename, "w") as fh: - json.dump( - {"written": self.written_records, "total": self.total_records}, fh - ) - - def get_custom_css_record(self): - if re.match(r"^https?\://", self.custom_css): - resp = requests.get(self.custom_css, timeout=10) - resp.raise_for_status() - payload = resp.content - else: - css_path = pathlib.Path(self.custom_css).expanduser().resolve() - with open(css_path, "rb") as fh: - payload = fh.read() - - http_headers = StatusAndHeaders( - "200 OK", - [("Content-Type", 'text/css; charset="UTF-8"')], - protocol="HTTP/1.0", - ) - - return RecordBuilder().create_warc_record( - CUSTOM_CSS_URL, - "response", - payload=io.BytesIO(payload), - length=len(payload), - http_headers=http_headers, - ) - - def run(self): - if not self.inputs: - logger.info( - "Arguments valid, no inputs to process. Exiting with error code 100" - ) - return 100 - - self.find_main_page_metadata() - self.title = self.title or "Untitled" - if len(self.title) > 30: - self.title = f"{self.title[0:29]}…" - self.retrieve_illustration() - self.convert_illustration() - - # make sure Language metadata is ISO-639-3 and setup translations - try: - lang_data = get_language_details(self.language) - self.language = lang_data["iso-639-3"] - except Exception: - logger.error(f"Invalid language setting `{self.language}`. Using `eng`.") - self.language = "eng" - - # try to set locale to language. Might fail (missing locale) - try: - setlocale(pathlib.Path(__file__).parent, lang_data.get("iso-639-1")) - except Exception: - ... - - self.env = self.init_env() - - # init head insert - template = self.env.get_template(HEAD_INSERT_FILE) - self.head_insert = ("" + template.render()).encode("utf-8") - if self.custom_css: - self.css_insert = ( - f'\n\n' - ).encode("utf-8") - else: - self.css_insert = None - - self.creator = Creator( - self.full_filename, - main_path="A/index.html", - ) - - self.creator.config_metadata( - Name=self.name, - Language=self.language or "eng", - Title=self.title, - Description=self.description, - LongDescription=self.long_description, - Creator=self.creator_metadata, - Publisher=self.publisher, - Date=datetime.date.today(), - Illustration_48x48_at_1=self.illustration, - Tags=";".join(self.tags), - Source=self.source, - Scraper=f"warc2zim {get_version()}", - ).start() - - self.add_replayer() - - for filename in pkg_resources.resource_listdir("warc2zim", "templates"): - if filename == HEAD_INSERT_FILE or filename == SW_JS: - continue - - self.creator.add_item(StaticArticle(self.env, filename, self.main_url)) - - for record in self.iter_all_warc_records(): - self.add_items_for_warc_record(record) - - # process revisits, headers only - for url, record in self.revisits.items(): - if canonicalize(url) not in self.indexed_urls: - logger.debug( - "Adding revisit {0} -> {1}".format( - url, record.rec_headers["WARC-Refers-To-Target-URI"] - ) - ) - try: - self.creator.add_item(WARCHeadersItem(record)) - except RuntimeError as exc: - if not DUPLICATE_EXC_STR.match(str(exc)): - raise exc - self.indexed_urls.add(canonicalize(url)) - - logger.debug(f"Found {self.total_records} records in WARCs") - - self.creator.finish() - - def iter_all_warc_records(self): - # add custom css records - if self.custom_css: - yield self.get_custom_css_record() - - yield from iter_warc_records(self.inputs) - def find_main_page_metadata(self): - for record in self.iter_all_warc_records(): - if record.rec_type == "revisit": - continue - - # if no main_url, use first 'text/html' record as the main page by default - # not guaranteed to always work - mime = get_record_mime_type(record) - - url = record.rec_headers["WARC-Target-URI"] - - if ( - not self.main_url - and mime == "text/html" - and record.payload_length != 0 - and ( - not record.http_headers - or record.http_headers.get_statuscode() == "200" - ) - ): - self.main_url = url - - if urldefrag(self.main_url).url != url: - continue - - # if we get here, found record for the main page - - # if main page is not html, still allow (eg. could be text, img), - # but print warning - if mime not in HTML_TYPES: - logger.warning( - "Main page is not an HTML Page, mime type is: {0} " - "- Skipping Favicon and Language detection".format(mime) - ) - return - - record.buffered_stream.seek(0) - content = record.buffered_stream.read() - - if not self.title: - self.title = parse_title(content) - - self.find_icon_and_language(content) - - logger.debug("Title: {0}".format(self.title)) - logger.debug("Language: {0}".format(self.language)) - logger.debug("Favicon: {0}".format(self.favicon_url)) - return - - raise KeyError( - f"Unable to find WARC record for main page: {self.main_url}, aborting" - ) - - def find_icon_and_language(self, content): - soup = BeautifulSoup(content, "html.parser") - - if not self.favicon_url: - # find icon - icon = soup.find("link", rel="shortcut icon") - if not icon: - icon = soup.find("link", rel="icon") - - if icon and icon.attrs.get("href"): - self.favicon_url = urljoin(self.main_url, icon.attrs["href"]) - else: - self.favicon_url = urljoin(self.main_url, "/favicon.ico") - - if not self.language: - # HTML5 Standard - lang_elem = soup.find("html", attrs={"lang": True}) - if lang_elem: - self.language = lang_elem.attrs["lang"] - return - - # W3C recommendation - lang_elem = soup.find( - "meta", {"http-equiv": "content-language", "content": True} - ) - if lang_elem: - self.language = lang_elem.attrs["content"] - return - - # SEO Recommendations - lang_elem = soup.find("meta", {"name": "language", "content": True}) - if lang_elem: - self.language = lang_elem.attrs["content"] - return - - def retrieve_illustration(self): - """sets self.illustration from self.favicon_url either from WARC or download - - Uses fallback in case of errors/missing""" - if not self.favicon_url: - self.favicon_url = "fallback.png" - self.illustration = DEFAULT_DEV_ZIM_METADATA["Illustration_48x48_at_1"] - return - # look into WARC records first - for record in self.iter_all_warc_records(): - url = get_record_url(record) - if not url or record.rec_type == "revisit": - continue - if url == self.favicon_url: - logger.debug(f"Found WARC record for favicon: {self.favicon_url}") - if record and record.http_headers.get_statuscode() != "200": - logger.warning("WARC record for favicon is unuable. Skipping") - self.favicon_url = "fallback.png" - self.illustration = DEFAULT_DEV_ZIM_METADATA[ - "Illustration_48x48_at_1" - ] - return - if hasattr(record, "buffered_stream"): - record.buffered_stream.seek(0) - self.illustration = record.buffered_stream.read() - else: - self.illustration = record.content_stream().read() - return - - # favicon_url not in WARC ; downloading - try: - dst = io.BytesIO() - if not stream_file(self.favicon_url, byte_stream=dst)[0]: - raise IOError("No bytes received downloading favicon") - self.illustration = dst.getvalue() - except Exception as exc: - logger.warning(f"Unable to retrieve favicon. Using fallback: {exc}") - self.favicon_url = "fallback.png" - self.illustration = DEFAULT_DEV_ZIM_METADATA["Illustration_48x48_at_1"] - return - - def convert_illustration(self): - """convert self.illustration into a 48x48px PNG with fallback""" - src = io.BytesIO(self.illustration) - dst = io.BytesIO() - try: - convert_image(src, dst, fmt="PNG") - resize_image(dst, width=48, height=48, method="cover") - except Exception as exc: - logger.warning(f"Failed to convert or resize favicon: {exc}") - self.illustration = DEFAULT_DEV_ZIM_METADATA["Illustration_48x48_at_1"] - else: - self.illustration = dst.getvalue() - - def is_self_redirect(self, record, url): - if record.rec_type != "response": - return False - - if ( - not record.http_headers.get_statuscode().startswith("3") - or record.http_headers.get_statuscode() == "300" - ): - return False - - location = record.http_headers.get("Location", "") - return canonicalize(url) == canonicalize(location) - - def add_items_for_warc_record(self, record): - url = get_record_url(record) - if not url: - logger.debug(f"Skipping record with empty WARC-Target-URI {record}") - return - - if canonicalize(url) in self.indexed_urls: - logger.debug("Skipping duplicate {0}, already added to ZIM".format(url)) - return - - # if include_domains is set, only include urls from those domains - if self.include_domains: - parts = urlsplit(url) - if not any( - parts.netloc.endswith(domain) for domain in self.include_domains - ): - logger.debug("Skipping url {0}, outside included domains".format(url)) - return - - if record.rec_type != "revisit": - if self.is_self_redirect(record, url): - logger.debug("Skipping self-redirect: " + url) - return - - try: - self.creator.add_item(WARCHeadersItem(record)) - except RuntimeError as exc: - if not DUPLICATE_EXC_STR.match(str(exc)): - raise exc - - payload_item = WARCPayloadItem(record, self.head_insert, self.css_insert) - - if len(payload_item.content) != 0: - try: - self.creator.add_item(payload_item) - except RuntimeError as exc: - if not DUPLICATE_EXC_STR.match(str(exc)): - raise exc - self.total_records += 1 - self.update_stats() - - self.indexed_urls.add(canonicalize(url)) - - elif ( - record.rec_headers["WARC-Refers-To-Target-URI"] != url - and url not in self.revisits - ): - self.revisits[url] = record - - self.add_fuzzy_match_record(url) - - def add_fuzzy_match_record(self, url): - fuzzy_url = url - for rule in FUZZY_RULES: - fuzzy_url = rule["match"].sub(rule["replace"], url) - if fuzzy_url != url: - break - - if fuzzy_url == url: - return - - http_headers = StatusAndHeaders("302 Redirect", {"Location": url}) - - date = datetime.datetime.utcnow().isoformat() - builder = RecordBuilder() - record = builder.create_revisit_record( - fuzzy_url, "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", url, date, http_headers - ) - - self.revisits[fuzzy_url] = record - logger.debug("Adding fuzzy redirect {0} -> {1}".format(fuzzy_url, url)) - - -# ============================================================================ -def iter_warc_records(inputs): - """iter warc records, including appending request data to matching response""" - for filename in iter_file_or_dir(inputs): - with open(filename, "rb") as fh: - for record in buffering_record_iter(ArchiveIterator(fh), post_append=True): - if record.rec_type in ("resource", "response", "revisit"): - yield record - - -# ============================================================================ def warc2zim(args=None): parser = ArgumentParser(description="Create ZIM files from WARC files") @@ -639,15 +108,9 @@ def warc2zim(args=None): ) r = parser.parse_args(args=args) - warc2zim = WARC2Zim(r) - return warc2zim.run() - - -# ============================================================================ -def get_version(): - return pkg_resources.get_distribution("warc2zim").version + converter = Converter(r) + return converter.run() -# ============================================================================ if __name__ == "__main__": # pragma: no cover sys.exit(warc2zim()) diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py index 90a03c27..fde6f7a6 100644 --- a/tests/test_warc_to_zim.py +++ b/tests/test_warc_to_zim.py @@ -14,12 +14,13 @@ from jinja2 import Environment, PackageLoader from zimscraperlib.zim import Archive -from warc2zim.main import ( - warc2zim, - canonicalize, - iter_warc_records, - get_record_url, -) +from warc2zim.url_rewriting import canonicalize +from warc2zim.converter import iter_warc_records +from warc2zim.items import get_record_url + +# Import last to not mask warc2zim module +from warc2zim.main import warc2zim + TEST_DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data") From fee8d7edd59026b79ecfeeb61b1a65aca7e58d88 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 28 Nov 2023 15:21:59 +0100 Subject: [PATCH 06/10] fixup! Move `WARC2Zim` class to `converter.py` --- src/warc2zim/converter.py | 4 ++-- src/warc2zim/main.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py index 66bb5141..468fdef8 100644 --- a/src/warc2zim/converter.py +++ b/src/warc2zim/converter.py @@ -49,8 +49,8 @@ from cdxj_indexer import iter_file_or_dir, buffering_record_iter -from .url_rewriting import FUZZY_RULES, canonicalize -from .items import ( +from warc2zim.url_rewriting import FUZZY_RULES, canonicalize +from warc2zim.items import ( WARCHeadersItem, WARCPayloadItem, StaticArticle, diff --git a/src/warc2zim/main.py b/src/warc2zim/main.py index 2f127cbd..aad176fd 100644 --- a/src/warc2zim/main.py +++ b/src/warc2zim/main.py @@ -24,7 +24,7 @@ import logging from argparse import ArgumentParser -from .converter import Converter, get_version +from warc2zim.converter import Converter, get_version # Shared logger logger = logging.getLogger("warc2zim") From 8e5f5b22bb9de829fae80e2de11b3f665b189acd Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 14 Nov 2023 15:00:34 +0100 Subject: [PATCH 07/10] Introduce utils.py module to store small helpers. --- src/warc2zim/converter.py | 14 ++------------ src/warc2zim/items.py | 30 +----------------------------- src/warc2zim/utils.py | 39 +++++++++++++++++++++++++++++++++++++++ tests/test_warc_to_zim.py | 6 ++---- 4 files changed, 44 insertions(+), 45 deletions(-) create mode 100644 src/warc2zim/utils.py diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py index 468fdef8..abec7010 100644 --- a/src/warc2zim/converter.py +++ b/src/warc2zim/converter.py @@ -50,14 +50,8 @@ from cdxj_indexer import iter_file_or_dir, buffering_record_iter from warc2zim.url_rewriting import FUZZY_RULES, canonicalize -from warc2zim.items import ( - WARCHeadersItem, - WARCPayloadItem, - StaticArticle, - get_record_url, - get_record_mime_type, - parse_title, -) +from warc2zim.items import WARCHeadersItem, WARCPayloadItem, StaticArticle +from .utils import get_version, get_record_url, get_record_mime_type, parse_title # Shared logger logger = logging.getLogger("warc2zim.converter") @@ -555,7 +549,3 @@ def iter_warc_records(inputs): for record in buffering_record_iter(ArchiveIterator(fh), post_append=True): if record.rec_type in ("resource", "response", "revisit"): yield record - - -def get_version(): - return pkg_resources.get_distribution("warc2zim").version diff --git a/src/warc2zim/items.py b/src/warc2zim/items.py index 726bd956..84dfbbd8 100644 --- a/src/warc2zim/items.py +++ b/src/warc2zim/items.py @@ -19,6 +19,7 @@ from bs4 import BeautifulSoup from warc2zim.url_rewriting import canonicalize +from .utils import get_record_url, get_record_mime_type, parse_title # Shared logger logger = logging.getLogger("warc2zim.items") @@ -123,32 +124,3 @@ def get_mimetype(self): def get_hints(self): return {Hint.FRONT_ARTICLE: False} - - -def get_record_url(record): - """Check if record has url converted from POST/PUT, and if so, use that - otherwise return the target url""" - if hasattr(record, "urlkey"): - return record.urlkey - return record.rec_headers["WARC-Target-URI"] - - -def get_record_mime_type(record): - if record.http_headers: - # if the record has HTTP headers, use the Content-Type from those - # (eg. 'response' record) - content_type = record.http_headers["Content-Type"] - else: - # otherwise, use the Content-Type from WARC headers - content_type = record.rec_headers["Content-Type"] - - mime = content_type or "" - return mime.split(";")[0] - - -def parse_title(content): - try: - soup = BeautifulSoup(content, "html.parser") - return soup.title.text or "" - except Exception: - return "" diff --git a/src/warc2zim/utils.py b/src/warc2zim/utils.py new file mode 100644 index 00000000..9e55b8ad --- /dev/null +++ b/src/warc2zim/utils.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim: ai ts=4 sts=4 et sw=4 nu + +import pkg_resources +from bs4 import BeautifulSoup + + +def get_version(): + return pkg_resources.get_distribution("warc2zim").version + + +def get_record_url(record): + """Check if record has url converted from POST/PUT, and if so, use that + otherwise return the target url""" + if hasattr(record, "urlkey"): + return record.urlkey + return record.rec_headers["WARC-Target-URI"] + + +def get_record_mime_type(record): + if record.http_headers: + # if the record has HTTP headers, use the Content-Type from those + # (eg. 'response' record) + content_type = record.http_headers["Content-Type"] + else: + # otherwise, use the Content-Type from WARC headers + content_type = record.rec_headers["Content-Type"] + + mime = content_type or "" + return mime.split(";")[0] + + +def parse_title(content): + try: + soup = BeautifulSoup(content, "html.parser") + return soup.title.text or "" + except Exception: + return "" diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py index fde6f7a6..e34fa2a0 100644 --- a/tests/test_warc_to_zim.py +++ b/tests/test_warc_to_zim.py @@ -16,13 +16,11 @@ from warc2zim.url_rewriting import canonicalize from warc2zim.converter import iter_warc_records -from warc2zim.items import get_record_url +from warc2zim.utils import get_record_url -# Import last to not mask warc2zim module +# Import last to not mask the warc2zim module from warc2zim.main import warc2zim - - TEST_DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data") From 319a769bba89d3b6f1a915873b72b54b43c8a462 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 28 Nov 2023 15:23:44 +0100 Subject: [PATCH 08/10] fixup! Introduce utils.py module to store small helpers. --- src/warc2zim/converter.py | 7 ++++++- src/warc2zim/items.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py index abec7010..a7387b51 100644 --- a/src/warc2zim/converter.py +++ b/src/warc2zim/converter.py @@ -51,7 +51,12 @@ from warc2zim.url_rewriting import FUZZY_RULES, canonicalize from warc2zim.items import WARCHeadersItem, WARCPayloadItem, StaticArticle -from .utils import get_version, get_record_url, get_record_mime_type, parse_title +from warc2zim.utils import ( + get_version, + get_record_url, + get_record_mime_type, + parse_title, +) # Shared logger logger = logging.getLogger("warc2zim.converter") diff --git a/src/warc2zim/items.py b/src/warc2zim/items.py index 84dfbbd8..a6587e06 100644 --- a/src/warc2zim/items.py +++ b/src/warc2zim/items.py @@ -19,7 +19,7 @@ from bs4 import BeautifulSoup from warc2zim.url_rewriting import canonicalize -from .utils import get_record_url, get_record_mime_type, parse_title +from warc2zim.utils import get_record_url, get_record_mime_type, parse_title # Shared logger logger = logging.getLogger("warc2zim.items") From ec074634595712a447a588a5df2239f1067bb4f8 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 28 Nov 2023 15:24:54 +0100 Subject: [PATCH 09/10] fixup! Introduce utils.py module to store small helpers. --- src/warc2zim/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/warc2zim/main.py b/src/warc2zim/main.py index aad176fd..f1ecae21 100644 --- a/src/warc2zim/main.py +++ b/src/warc2zim/main.py @@ -24,7 +24,8 @@ import logging from argparse import ArgumentParser -from warc2zim.converter import Converter, get_version +from warc2zim.converter import Converter +from warc2zim.utils import get_version # Shared logger logger = logging.getLogger("warc2zim") From e10c04b8c3f7fe840a6114121fe79325dc57fb63 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 28 Nov 2023 15:27:01 +0100 Subject: [PATCH 10/10] Rename `warc2zim` to `main`. --- src/warc2zim/main.py | 4 ++-- tests/test_warc_to_zim.py | 34 ++++++++++++++++------------------ 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/src/warc2zim/main.py b/src/warc2zim/main.py index f1ecae21..45bb7e9e 100644 --- a/src/warc2zim/main.py +++ b/src/warc2zim/main.py @@ -31,7 +31,7 @@ logger = logging.getLogger("warc2zim") -def warc2zim(args=None): +def main(args=None): parser = ArgumentParser(description="Create ZIM files from WARC files") parser.add_argument("-V", "--version", action="version", version=get_version()) @@ -114,4 +114,4 @@ def warc2zim(args=None): if __name__ == "__main__": # pragma: no cover - sys.exit(warc2zim()) + sys.exit(main()) diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py index e34fa2a0..d46d9807 100644 --- a/tests/test_warc_to_zim.py +++ b/tests/test_warc_to_zim.py @@ -17,9 +17,7 @@ from warc2zim.url_rewriting import canonicalize from warc2zim.converter import iter_warc_records from warc2zim.utils import get_record_url - -# Import last to not mask the warc2zim module -from warc2zim.main import warc2zim +from warc2zim.main import main TEST_DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data") @@ -189,7 +187,7 @@ def test_canonicalize(self): def test_warc_to_zim_specify_params_and_metadata(self, tmp_path): zim_output = "zim-out-filename.zim" - warc2zim( + main( [ "-v", os.path.join(TEST_DATA_DIR, "example-response.warc"), @@ -270,7 +268,7 @@ def test_warc_to_zim(self, cmdline, tmp_path): cmdline.extend(["--output", str(tmp_path), "--name", filename]) - warc2zim(cmdline) + main(cmdline) zimfile = filename + "_" + time.strftime("%Y-%m") + ".zim" @@ -287,7 +285,7 @@ def test_warc_to_zim(self, cmdline, tmp_path): def test_same_domain_only(self, tmp_path): zim_output = "same-domain.zim" - warc2zim( + main( [ os.path.join(TEST_DATA_DIR, "example-revisit.warc.gz"), "--favicon", @@ -315,7 +313,7 @@ def test_same_domain_only(self, tmp_path): def test_skip_self_redirect(self, tmp_path): zim_output = "self-redir.zim" - warc2zim( + main( [ os.path.join(TEST_DATA_DIR, "self-redirect.warc"), "--output", @@ -340,7 +338,7 @@ def test_skip_self_redirect(self, tmp_path): def test_include_domains_favicon_and_language(self, tmp_path): zim_output = "spt.zim" - warc2zim( + main( [ os.path.join(TEST_DATA_DIR, "single-page-test.warc"), "-i", @@ -380,7 +378,7 @@ def test_include_domains_favicon_and_language(self, tmp_path): def test_all_warcs_root_dir(self, tmp_path): zim_output = "test-all.zim" - warc2zim( + main( [ os.path.join(TEST_DATA_DIR), "--output", @@ -418,7 +416,7 @@ def test_all_warcs_root_dir(self, tmp_path): def test_fuzzy_urls(self, tmp_path, fuzzycheck): zim_output = fuzzycheck["filename"] + ".zim" - warc2zim( + main( [ os.path.join(TEST_DATA_DIR, fuzzycheck["filename"]), "--output", @@ -445,7 +443,7 @@ def test_local_replay_viewer_url(self, tmp_path): with open(tmp_path / "sw.js", "wt") as fh: fh.write(res.text) - warc2zim( + main( [ "-v", os.path.join(TEST_DATA_DIR, "example-response.warc"), @@ -465,7 +463,7 @@ def test_local_replay_viewer_url(self, tmp_path): def test_error_bad_replay_viewer_url(self, tmp_path): zim_output_not_created = "zim-out-not-created.zim" with pytest.raises(Exception) as e: - warc2zim( + main( [ "-v", os.path.join(TEST_DATA_DIR, "example-response.warc"), @@ -486,7 +484,7 @@ def test_error_bad_replay_viewer_url(self, tmp_path): def test_error_bad_main_page(self, tmp_path): zim_output_not_created = "zim-out-not-created.zim" with pytest.raises(Exception) as e: - warc2zim( + main( [ "-v", os.path.join(TEST_DATA_DIR, "example-response.warc"), @@ -504,15 +502,15 @@ def test_error_bad_main_page(self, tmp_path): def test_args_only(self): # error, name required with pytest.raises(SystemExit) as e: - warc2zim([]) + main([]) assert e.code == 2 # error, no such output directory with pytest.raises(Exception) as e: - warc2zim(["--name", "test", "--output", "/no-such-dir"]) + main(["--name", "test", "--output", "/no-such-dir"]) # success, special error code for no output files - assert warc2zim(["--name", "test", "--output", "./"]) == 100 + assert main(["--name", "test", "--output", "./"]) == 100 def test_custom_css(self, tmp_path): custom_css = b"* { background-color: red; }" @@ -522,7 +520,7 @@ def test_custom_css(self, tmp_path): zim_output = "test-css.zim" - warc2zim( + main( [ os.path.join(TEST_DATA_DIR, "example-response.warc"), "--output", @@ -549,7 +547,7 @@ def test_custom_css_remote(self, tmp_path): "https://cdn.jsdelivr.net/npm/bootstrap@4.5.3/dist/css/bootstrap-reboot.css" ) - warc2zim( + main( [ os.path.join(TEST_DATA_DIR, "example-response.warc"), "--output",