diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py index 468fdef8..a7387b51 100644 --- a/src/warc2zim/converter.py +++ b/src/warc2zim/converter.py @@ -50,10 +50,9 @@ from cdxj_indexer import iter_file_or_dir, buffering_record_iter from warc2zim.url_rewriting import FUZZY_RULES, canonicalize -from warc2zim.items import ( - WARCHeadersItem, - WARCPayloadItem, - StaticArticle, +from warc2zim.items import WARCHeadersItem, WARCPayloadItem, StaticArticle +from warc2zim.utils import ( + get_version, get_record_url, get_record_mime_type, parse_title, @@ -555,7 +554,3 @@ def iter_warc_records(inputs): for record in buffering_record_iter(ArchiveIterator(fh), post_append=True): if record.rec_type in ("resource", "response", "revisit"): yield record - - -def get_version(): - return pkg_resources.get_distribution("warc2zim").version diff --git a/src/warc2zim/items.py b/src/warc2zim/items.py index 726bd956..a6587e06 100644 --- a/src/warc2zim/items.py +++ b/src/warc2zim/items.py @@ -19,6 +19,7 @@ from bs4 import BeautifulSoup from warc2zim.url_rewriting import canonicalize +from warc2zim.utils import get_record_url, get_record_mime_type, parse_title # Shared logger logger = logging.getLogger("warc2zim.items") @@ -123,32 +124,3 @@ def get_mimetype(self): def get_hints(self): return {Hint.FRONT_ARTICLE: False} - - -def get_record_url(record): - """Check if record has url converted from POST/PUT, and if so, use that - otherwise return the target url""" - if hasattr(record, "urlkey"): - return record.urlkey - return record.rec_headers["WARC-Target-URI"] - - -def get_record_mime_type(record): - if record.http_headers: - # if the record has HTTP headers, use the Content-Type from those - # (eg. 'response' record) - content_type = record.http_headers["Content-Type"] - else: - # otherwise, use the Content-Type from WARC headers - content_type = record.rec_headers["Content-Type"] - - mime = content_type or "" - return mime.split(";")[0] - - -def parse_title(content): - try: - soup = BeautifulSoup(content, "html.parser") - return soup.title.text or "" - except Exception: - return "" diff --git a/src/warc2zim/main.py b/src/warc2zim/main.py index aad176fd..f1ecae21 100644 --- a/src/warc2zim/main.py +++ b/src/warc2zim/main.py @@ -24,7 +24,8 @@ import logging from argparse import ArgumentParser -from warc2zim.converter import Converter, get_version +from warc2zim.converter import Converter +from warc2zim.utils import get_version # Shared logger logger = logging.getLogger("warc2zim") diff --git a/src/warc2zim/utils.py b/src/warc2zim/utils.py new file mode 100644 index 00000000..9e55b8ad --- /dev/null +++ b/src/warc2zim/utils.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim: ai ts=4 sts=4 et sw=4 nu + +import pkg_resources +from bs4 import BeautifulSoup + + +def get_version(): + return pkg_resources.get_distribution("warc2zim").version + + +def get_record_url(record): + """Check if record has url converted from POST/PUT, and if so, use that + otherwise return the target url""" + if hasattr(record, "urlkey"): + return record.urlkey + return record.rec_headers["WARC-Target-URI"] + + +def get_record_mime_type(record): + if record.http_headers: + # if the record has HTTP headers, use the Content-Type from those + # (eg. 'response' record) + content_type = record.http_headers["Content-Type"] + else: + # otherwise, use the Content-Type from WARC headers + content_type = record.rec_headers["Content-Type"] + + mime = content_type or "" + return mime.split(";")[0] + + +def parse_title(content): + try: + soup = BeautifulSoup(content, "html.parser") + return soup.title.text or "" + except Exception: + return "" diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py index fde6f7a6..e34fa2a0 100644 --- a/tests/test_warc_to_zim.py +++ b/tests/test_warc_to_zim.py @@ -16,13 +16,11 @@ from warc2zim.url_rewriting import canonicalize from warc2zim.converter import iter_warc_records -from warc2zim.items import get_record_url +from warc2zim.utils import get_record_url -# Import last to not mask warc2zim module +# Import last to not mask the warc2zim module from warc2zim.main import warc2zim - - TEST_DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")