Introduce utils.py module to store small helpers.

openzim · Dec 8, 2023 · 57da2b2
1 parent e2cd6f4
commit 57da2b2
Show file tree

Hide file tree

Showing 5 changed files with 47 additions and 42 deletions.
diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py
@@ -50,10 +50,9 @@
 from cdxj_indexer import iter_file_or_dir, buffering_record_iter
 
 from warc2zim.url_rewriting import FUZZY_RULES, canonicalize
-from warc2zim.items import (
-    WARCHeadersItem,
-    WARCPayloadItem,
-    StaticArticle,
+from warc2zim.items import WARCHeadersItem, WARCPayloadItem, StaticArticle
+from warc2zim.utils import (
+    get_version,
     get_record_url,
     get_record_mime_type,
     parse_title,
@@ -555,7 +554,3 @@ def iter_warc_records(inputs):
             for record in buffering_record_iter(ArchiveIterator(fh), post_append=True):
                 if record.rec_type in ("resource", "response", "revisit"):
                     yield record
-
-
-def get_version():
-    return pkg_resources.get_distribution("warc2zim").version
diff --git a/src/warc2zim/items.py b/src/warc2zim/items.py
@@ -19,6 +19,7 @@
 from bs4 import BeautifulSoup
 
 from warc2zim.url_rewriting import canonicalize
+from warc2zim.utils import get_record_url, get_record_mime_type, parse_title
 
 # Shared logger
 logger = logging.getLogger("warc2zim.items")
@@ -123,32 +124,3 @@ def get_mimetype(self):
 
     def get_hints(self):
         return {Hint.FRONT_ARTICLE: False}
-
-
-def get_record_url(record):
-    """Check if record has url converted from POST/PUT, and if so, use that
-    otherwise return the target url"""
-    if hasattr(record, "urlkey"):
-        return record.urlkey
-    return record.rec_headers["WARC-Target-URI"]
-
-
-def get_record_mime_type(record):
-    if record.http_headers:
-        # if the record has HTTP headers, use the Content-Type from those
-        # (eg. 'response' record)
-        content_type = record.http_headers["Content-Type"]
-    else:
-        # otherwise, use the Content-Type from WARC headers
-        content_type = record.rec_headers["Content-Type"]
-
-    mime = content_type or ""
-    return mime.split(";")[0]
-
-
-def parse_title(content):
-    try:
-        soup = BeautifulSoup(content, "html.parser")
-        return soup.title.text or ""
-    except Exception:
-        return ""
diff --git a/src/warc2zim/main.py b/src/warc2zim/main.py
@@ -24,7 +24,8 @@
 import logging
 from argparse import ArgumentParser
 
-from warc2zim.converter import Converter, get_version
+from warc2zim.converter import Converter
+from warc2zim.utils import get_version
 
 # Shared logger
 logger = logging.getLogger("warc2zim")

diff --git a/src/warc2zim/utils.py b/src/warc2zim/utils.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim: ai ts=4 sts=4 et sw=4 nu
+
+import pkg_resources
+from bs4 import BeautifulSoup
+
+
+def get_version():
+    return pkg_resources.get_distribution("warc2zim").version
+
+
+def get_record_url(record):
+    """Check if record has url converted from POST/PUT, and if so, use that
+    otherwise return the target url"""
+    if hasattr(record, "urlkey"):
+        return record.urlkey
+    return record.rec_headers["WARC-Target-URI"]
+
+
+def get_record_mime_type(record):
+    if record.http_headers:
+        # if the record has HTTP headers, use the Content-Type from those
+        # (eg. 'response' record)
+        content_type = record.http_headers["Content-Type"]
+    else:
+        # otherwise, use the Content-Type from WARC headers
+        content_type = record.rec_headers["Content-Type"]
+
+    mime = content_type or ""
+    return mime.split(";")[0]
+
+
+def parse_title(content):
+    try:
+        soup = BeautifulSoup(content, "html.parser")
+        return soup.title.text or ""
+    except Exception:
+        return ""
diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py
@@ -16,13 +16,11 @@
 
 from warc2zim.url_rewriting import canonicalize
 from warc2zim.converter import iter_warc_records
-from warc2zim.items import get_record_url
+from warc2zim.utils import get_record_url
 
-# Import last to not mask warc2zim module
+# Import last to not mask the warc2zim module
 from warc2zim.main import warc2zim
 
-
-
 TEST_DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")