From 2a3a03f6a0c8deac1abe0d2bfdd6f757ebbcd51a Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 14 Nov 2023 15:00:34 +0100 Subject: [PATCH] Introduce utils.py module to store small helpers. --- src/warc2zim/converter.py | 14 ++------------ src/warc2zim/items.py | 30 +----------------------------- src/warc2zim/utils.py | 39 +++++++++++++++++++++++++++++++++++++++ tests/test_warc_to_zim.py | 6 ++---- 4 files changed, 44 insertions(+), 45 deletions(-) create mode 100644 src/warc2zim/utils.py diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py index 66bb5141..6980a85e 100644 --- a/src/warc2zim/converter.py +++ b/src/warc2zim/converter.py @@ -50,14 +50,8 @@ from cdxj_indexer import iter_file_or_dir, buffering_record_iter from .url_rewriting import FUZZY_RULES, canonicalize -from .items import ( - WARCHeadersItem, - WARCPayloadItem, - StaticArticle, - get_record_url, - get_record_mime_type, - parse_title, -) +from .items import WARCHeadersItem, WARCPayloadItem, StaticArticle +from .utils import get_version, get_record_url, get_record_mime_type, parse_title # Shared logger logger = logging.getLogger("warc2zim.converter") @@ -555,7 +549,3 @@ def iter_warc_records(inputs): for record in buffering_record_iter(ArchiveIterator(fh), post_append=True): if record.rec_type in ("resource", "response", "revisit"): yield record - - -def get_version(): - return pkg_resources.get_distribution("warc2zim").version diff --git a/src/warc2zim/items.py b/src/warc2zim/items.py index bf888a4b..3ce18031 100644 --- a/src/warc2zim/items.py +++ b/src/warc2zim/items.py @@ -19,6 +19,7 @@ from bs4 import BeautifulSoup from .url_rewriting import canonicalize +from .utils import get_record_url, get_record_mime_type, parse_title # Shared logger logger = logging.getLogger("warc2zim.items") @@ -123,32 +124,3 @@ def get_mimetype(self): def get_hints(self): return {Hint.FRONT_ARTICLE: False} - - -def get_record_url(record): - """Check if record has url converted from POST/PUT, and if so, use that - otherwise return the target url""" - if hasattr(record, "urlkey"): - return record.urlkey - return record.rec_headers["WARC-Target-URI"] - - -def get_record_mime_type(record): - if record.http_headers: - # if the record has HTTP headers, use the Content-Type from those - # (eg. 'response' record) - content_type = record.http_headers["Content-Type"] - else: - # otherwise, use the Content-Type from WARC headers - content_type = record.rec_headers["Content-Type"] - - mime = content_type or "" - return mime.split(";")[0] - - -def parse_title(content): - try: - soup = BeautifulSoup(content, "html.parser") - return soup.title.text or "" - except Exception: - return "" diff --git a/src/warc2zim/utils.py b/src/warc2zim/utils.py new file mode 100644 index 00000000..9e55b8ad --- /dev/null +++ b/src/warc2zim/utils.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim: ai ts=4 sts=4 et sw=4 nu + +import pkg_resources +from bs4 import BeautifulSoup + + +def get_version(): + return pkg_resources.get_distribution("warc2zim").version + + +def get_record_url(record): + """Check if record has url converted from POST/PUT, and if so, use that + otherwise return the target url""" + if hasattr(record, "urlkey"): + return record.urlkey + return record.rec_headers["WARC-Target-URI"] + + +def get_record_mime_type(record): + if record.http_headers: + # if the record has HTTP headers, use the Content-Type from those + # (eg. 'response' record) + content_type = record.http_headers["Content-Type"] + else: + # otherwise, use the Content-Type from WARC headers + content_type = record.rec_headers["Content-Type"] + + mime = content_type or "" + return mime.split(";")[0] + + +def parse_title(content): + try: + soup = BeautifulSoup(content, "html.parser") + return soup.title.text or "" + except Exception: + return "" diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py index fde6f7a6..e34fa2a0 100644 --- a/tests/test_warc_to_zim.py +++ b/tests/test_warc_to_zim.py @@ -16,13 +16,11 @@ from warc2zim.url_rewriting import canonicalize from warc2zim.converter import iter_warc_records -from warc2zim.items import get_record_url +from warc2zim.utils import get_record_url -# Import last to not mask warc2zim module +# Import last to not mask the warc2zim module from warc2zim.main import warc2zim - - TEST_DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")