diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py index a785641f..6980a85e 100644 --- a/src/warc2zim/converter.py +++ b/src/warc2zim/converter.py @@ -50,7 +50,8 @@ from cdxj_indexer import iter_file_or_dir, buffering_record_iter from .url_rewriting import FUZZY_RULES, canonicalize -from .items import WARCHeadersItem, WARCPayloadItem, StaticArticle, get_record_url, get_record_mime_type, parse_title +from .items import WARCHeadersItem, WARCPayloadItem, StaticArticle +from .utils import get_version, get_record_url, get_record_mime_type, parse_title # Shared logger logger = logging.getLogger("warc2zim.converter") @@ -548,7 +549,3 @@ def iter_warc_records(inputs): for record in buffering_record_iter(ArchiveIterator(fh), post_append=True): if record.rec_type in ("resource", "response", "revisit"): yield record - - -def get_version(): - return pkg_resources.get_distribution("warc2zim").version diff --git a/src/warc2zim/items.py b/src/warc2zim/items.py index 2484a94a..c51bbc8c 100644 --- a/src/warc2zim/items.py +++ b/src/warc2zim/items.py @@ -17,6 +17,7 @@ from zimscraperlib.zim.providers import StringProvider from .url_rewriting import canonicalize +from .utils import get_record_url, get_record_mime_type, parse_title # Shared logger logger = logging.getLogger("warc2zim.items") @@ -121,32 +122,3 @@ def get_mimetype(self): def get_hints(self): return {Hint.FRONT_ARTICLE: False} - - -def get_record_url(record): - """Check if record has url converted from POST/PUT, and if so, use that - otherwise return the target url""" - if hasattr(record, "urlkey"): - return record.urlkey - return record.rec_headers["WARC-Target-URI"] - - -def get_record_mime_type(record): - if record.http_headers: - # if the record has HTTP headers, use the Content-Type from those - # (eg. 'response' record) - content_type = record.http_headers["Content-Type"] - else: - # otherwise, use the Content-Type from WARC headers - content_type = record.rec_headers["Content-Type"] - - mime = content_type or "" - return mime.split(";")[0] - - -def parse_title(content): - try: - soup = BeautifulSoup(content, "html.parser") - return soup.title.text or "" - except Exception: - return "" diff --git a/src/warc2zim/utils.py b/src/warc2zim/utils.py new file mode 100644 index 00000000..9e55b8ad --- /dev/null +++ b/src/warc2zim/utils.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim: ai ts=4 sts=4 et sw=4 nu + +import pkg_resources +from bs4 import BeautifulSoup + + +def get_version(): + return pkg_resources.get_distribution("warc2zim").version + + +def get_record_url(record): + """Check if record has url converted from POST/PUT, and if so, use that + otherwise return the target url""" + if hasattr(record, "urlkey"): + return record.urlkey + return record.rec_headers["WARC-Target-URI"] + + +def get_record_mime_type(record): + if record.http_headers: + # if the record has HTTP headers, use the Content-Type from those + # (eg. 'response' record) + content_type = record.http_headers["Content-Type"] + else: + # otherwise, use the Content-Type from WARC headers + content_type = record.rec_headers["Content-Type"] + + mime = content_type or "" + return mime.split(";")[0] + + +def parse_title(content): + try: + soup = BeautifulSoup(content, "html.parser") + return soup.title.text or "" + except Exception: + return ""