From 9ea4ffd3758ffb78a3497fdb538ab2981c1dbd2e Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 14 Nov 2023 14:43:11 +0100 Subject: [PATCH] Move creator items into `items.py`. --- src/warc2zim/items.py | 153 ++++++++++++++++++++++++++++++++++++++++++ src/warc2zim/main.py | 142 +-------------------------------------- 2 files changed, 155 insertions(+), 140 deletions(-) create mode 100644 src/warc2zim/items.py diff --git a/src/warc2zim/items.py b/src/warc2zim/items.py new file mode 100644 index 00000000..2484a94a --- /dev/null +++ b/src/warc2zim/items.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim: ai ts=4 sts=4 et sw=4 nu + +""" warc2zim's item classes + +This module contains the different Items we may want to add to a Zim archive. +""" + +import logging +import re + +import pkg_resources +from bs4 import BeautifulSoup +from libzim.writer import Hint +from zimscraperlib.types import get_mime_for_name +from zimscraperlib.zim.items import StaticItem +from zimscraperlib.zim.providers import StringProvider + +from .url_rewriting import canonicalize + +# Shared logger +logger = logging.getLogger("warc2zim.items") + +# external sw.js filename +SW_JS = "sw.js" + +HEAD_INS = re.compile(b"()", re.I) +CSS_INS = re.compile(b"()", re.I) + + +class WARCHeadersItem(StaticItem): + """WARCHeadersItem used to store the WARC + HTTP headers as text + Usually stored under H namespace + """ + + def __init__(self, record): + super().__init__() + self.record = record + self.url = get_record_url(record) + + def get_path(self): + return "H/" + canonicalize(self.url) + + def get_title(self): + return "" + + def get_mimetype(self): + return "application/warc-headers" + + def get_hints(self): + return {Hint.FRONT_ARTICLE: False} + + def get_contentprovider(self): + # add WARC headers + buff = self.record.rec_headers.to_bytes(encoding="utf-8") + # add HTTP headers, if present + if self.record.http_headers: + buff += self.record.http_headers.to_bytes(encoding="utf-8") + + return 
StringProvider(content=buff, ref=self) + + +class WARCPayloadItem(StaticItem): + """WARCPayloadItem used to store the WARC payload + Usually stored under A namespace + """ + + def __init__(self, record, head_insert=None, css_insert=None): + super().__init__() + self.record = record + self.url = get_record_url(record) + self.mimetype = get_record_mime_type(record) + self.title = "" + + if hasattr(self.record, "buffered_stream"): + self.record.buffered_stream.seek(0) + self.content = self.record.buffered_stream.read() + else: + self.content = self.record.content_stream().read() + + if self.mimetype.startswith("text/html"): + self.title = parse_title(self.content) + if head_insert: + self.content = HEAD_INS.sub(head_insert, self.content) + if css_insert: + self.content = CSS_INS.sub(css_insert, self.content) + + def get_path(self): + return "A/" + canonicalize(self.url) + + def get_title(self): + return self.title + + def get_hints(self): + is_front = self.mimetype.startswith("text/html") + return {Hint.FRONT_ARTICLE: is_front} + + +class StaticArticle(StaticItem): + def __init__(self, env, filename, main_url, **kwargs): + super().__init__(**kwargs) + self.filename = filename + self.main_url = main_url + + self.mime = get_mime_for_name(filename) + self.mime = self.mime or "application/octet-stream" + + if filename != SW_JS: + template = env.get_template(filename) + self.content = template.render(MAIN_URL=self.main_url) + else: + self.content = pkg_resources.resource_string( + "warc2zim", "templates/" + filename + ).decode("utf-8") + + def get_path(self): + return "A/" + self.filename + + def get_mimetype(self): + return self.mime + + def get_hints(self): + return {Hint.FRONT_ARTICLE: False} + + +def get_record_url(record): + """Check if record has url converted from POST/PUT, and if so, use that + otherwise return the target url""" + if hasattr(record, "urlkey"): + return record.urlkey + return record.rec_headers["WARC-Target-URI"] + + +def 
get_record_mime_type(record): + if record.http_headers: + # if the record has HTTP headers, use the Content-Type from those + # (eg. 'response' record) + content_type = record.http_headers["Content-Type"] + else: + # otherwise, use the Content-Type from WARC headers + content_type = record.rec_headers["Content-Type"] + + mime = content_type or "" + return mime.split(";")[0] + + +def parse_title(content): + try: + soup = BeautifulSoup(content, "html.parser") + return soup.title.text or "" + except Exception: + return "" diff --git a/src/warc2zim/main.py b/src/warc2zim/main.py index 4a097f9c..e7f91ba8 100644 --- a/src/warc2zim/main.py +++ b/src/warc2zim/main.py @@ -35,18 +35,15 @@ import pkg_resources import requests -from libzim.writer import Hint from warcio import ArchiveIterator, StatusAndHeaders from warcio.recordbuilder import RecordBuilder from zimscraperlib.constants import DEFAULT_DEV_ZIM_METADATA from zimscraperlib.download import stream_file -from zimscraperlib.types import get_mime_for_name from zimscraperlib.i18n import setlocale, get_language_details, Locale from zimscraperlib.image.convertion import convert_image from zimscraperlib.image.transformation import resize_image from zimscraperlib.zim.creator import Creator -from zimscraperlib.zim.items import StaticItem, URLItem -from zimscraperlib.zim.providers import StringProvider +from zimscraperlib.zim.items import URLItem from bs4 import BeautifulSoup from jinja2 import Environment, PackageLoader @@ -54,6 +51,7 @@ from cdxj_indexer import iter_file_or_dir, buffering_record_iter from .url_rewriting import FUZZY_RULES, canonicalize +from .items import WARCHeadersItem, WARCPayloadItem, StaticArticle, get_record_url, get_record_mime_type, parse_title # Shared logger logger = logging.getLogger("warc2zim") @@ -67,11 +65,6 @@ # head insert template HEAD_INSERT_FILE = "sw_check.html" - -HEAD_INS = re.compile(b"()", re.I) -CSS_INS = re.compile(b"()", re.I) - - # Default ZIM metadata tags DEFAULT_TAGS = 
["_ftindex:yes", "_category:other", "_sw:yes"] @@ -84,105 +77,6 @@ re.MULTILINE | re.DOTALL, ) - -# ============================================================================ -class WARCHeadersItem(StaticItem): - """WARCHeadersItem used to store the WARC + HTTP headers as text - Usually stored under H namespace - """ - - def __init__(self, record): - super().__init__() - self.record = record - self.url = get_record_url(record) - - def get_path(self): - return "H/" + canonicalize(self.url) - - def get_title(self): - return "" - - def get_mimetype(self): - return "application/warc-headers" - - def get_hints(self): - return {Hint.FRONT_ARTICLE: False} - - def get_contentprovider(self): - # add WARC headers - buff = self.record.rec_headers.to_bytes(encoding="utf-8") - # add HTTP headers, if present - if self.record.http_headers: - buff += self.record.http_headers.to_bytes(encoding="utf-8") - - return StringProvider(content=buff, ref=self) - - -# ============================================================================ -class WARCPayloadItem(StaticItem): - """WARCPayloadItem used to store the WARC payload - Usually stored under A namespace - """ - - def __init__(self, record, head_insert=None, css_insert=None): - super().__init__() - self.record = record - self.url = get_record_url(record) - self.mimetype = get_record_mime_type(record) - self.title = "" - - if hasattr(self.record, "buffered_stream"): - self.record.buffered_stream.seek(0) - self.content = self.record.buffered_stream.read() - else: - self.content = self.record.content_stream().read() - - if self.mimetype.startswith("text/html"): - self.title = parse_title(self.content) - if head_insert: - self.content = HEAD_INS.sub(head_insert, self.content) - if css_insert: - self.content = CSS_INS.sub(css_insert, self.content) - - def get_path(self): - return "A/" + canonicalize(self.url) - - def get_title(self): - return self.title - - def get_hints(self): - is_front = self.mimetype.startswith("text/html") - 
return {Hint.FRONT_ARTICLE: is_front} - - -# ============================================================================ -class StaticArticle(StaticItem): - def __init__(self, env, filename, main_url, **kwargs): - super().__init__(**kwargs) - self.filename = filename - self.main_url = main_url - - self.mime = get_mime_for_name(filename) - self.mime = self.mime or "application/octet-stream" - - if filename != SW_JS: - template = env.get_template(filename) - self.content = template.render(MAIN_URL=self.main_url) - else: - self.content = pkg_resources.resource_string( - "warc2zim", "templates/" + filename - ).decode("utf-8") - - def get_path(self): - return "A/" + self.filename - - def get_mimetype(self): - return self.mime - - def get_hints(self): - return {Hint.FRONT_ARTICLE: False} - - # ============================================================================ class WARC2Zim: def __init__(self, args): @@ -648,38 +542,6 @@ def add_fuzzy_match_record(self, url): logger.debug("Adding fuzzy redirect {0} -> {1}".format(fuzzy_url, url)) -# ============================================================================ -def get_record_url(record): - """Check if record has url converted from POST/PUT, and if so, use that - otherwise return the target url""" - if hasattr(record, "urlkey"): - return record.urlkey - return record.rec_headers["WARC-Target-URI"] - - -# ============================================================================ -def get_record_mime_type(record): - if record.http_headers: - # if the record has HTTP headers, use the Content-Type from those - # (eg. 
'response' record) - content_type = record.http_headers["Content-Type"] - else: - # otherwise, use the Content-Type from WARC headers - content_type = record.rec_headers["Content-Type"] - - mime = content_type or "" - return mime.split(";")[0] - - -# ============================================================================ -def parse_title(content): - try: - soup = BeautifulSoup(content, "html.parser") - return soup.title.text or "" - except Exception: - return "" - - # ============================================================================ def iter_warc_records(inputs): """iter warc records, including appending request data to matching response"""