Skip to content

Commit

Permalink
Introduce utils.py module to store small helpers.
Browse files Browse the repository at this point in the history
  • Loading branch information
mgautierfr committed Dec 8, 2023
1 parent e2cd6f4 commit 57da2b2
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 42 deletions.
11 changes: 3 additions & 8 deletions src/warc2zim/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,9 @@
from cdxj_indexer import iter_file_or_dir, buffering_record_iter

from warc2zim.url_rewriting import FUZZY_RULES, canonicalize
from warc2zim.items import (
WARCHeadersItem,
WARCPayloadItem,
StaticArticle,
from warc2zim.items import WARCHeadersItem, WARCPayloadItem, StaticArticle
from warc2zim.utils import (
get_version,
get_record_url,
get_record_mime_type,
parse_title,
Expand Down Expand Up @@ -555,7 +554,3 @@ def iter_warc_records(inputs):
for record in buffering_record_iter(ArchiveIterator(fh), post_append=True):
if record.rec_type in ("resource", "response", "revisit"):
yield record


def get_version():
    """Return the version string of the installed ``warc2zim`` distribution."""
    distribution = pkg_resources.get_distribution("warc2zim")
    return distribution.version
30 changes: 1 addition & 29 deletions src/warc2zim/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from bs4 import BeautifulSoup

from warc2zim.url_rewriting import canonicalize
from warc2zim.utils import get_record_url, get_record_mime_type, parse_title

# Shared logger
logger = logging.getLogger("warc2zim.items")
Expand Down Expand Up @@ -123,32 +124,3 @@ def get_mimetype(self):

def get_hints(self):
    """Return the hints of this item: ``Hint.FRONT_ARTICLE`` is set to False."""
    hints = {Hint.FRONT_ARTICLE: False}
    return hints


def get_record_url(record):
    """Return the url to use for *record*.

    A record carrying a ``urlkey`` attribute (url converted from POST/PUT)
    is identified by that value; any other record falls back to its
    ``WARC-Target-URI`` WARC header.
    """
    if not hasattr(record, "urlkey"):
        # no converted url on this record: use the plain target url
        return record.rec_headers["WARC-Target-URI"]
    return record.urlkey


def get_record_mime_type(record):
    """Return the bare MIME type of *record*, without any parameter.

    The ``Content-Type`` value is read from the record's HTTP headers when
    it has some (eg. 'response' record) and from its WARC headers otherwise.
    Parameters such as ``; charset=utf-8`` are dropped, together with the
    optional whitespace that may surround the type itself.

    Returns an empty string when the ``Content-Type`` value is empty or None.
    """
    if record.http_headers:
        # if the record has HTTP headers, use the Content-Type from those
        headers = record.http_headers
    else:
        # otherwise, use the Content-Type from WARC headers
        headers = record.rec_headers

    content_type = headers["Content-Type"] or ""
    # whitespace is allowed around the ";" separator ("text/html ; charset=x"),
    # so strip it to return a clean type/subtype
    return content_type.split(";")[0].strip()


def parse_title(content):
    """Return the text of the ``<title>`` element found in *content*.

    This is best effort: any failure while parsing the content or reaching
    its title (for instance a document without title) gives an empty string.
    """
    try:
        title = BeautifulSoup(content, "html.parser").title.text
    except Exception:
        return ""
    return title if title else ""
3 changes: 2 additions & 1 deletion src/warc2zim/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
import logging
from argparse import ArgumentParser

from warc2zim.converter import Converter, get_version
from warc2zim.converter import Converter
from warc2zim.utils import get_version

# Shared logger
logger = logging.getLogger("warc2zim")
Expand Down
39 changes: 39 additions & 0 deletions src/warc2zim/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: ai ts=4 sts=4 et sw=4 nu

import pkg_resources
from bs4 import BeautifulSoup


def get_version():
    """Return the version string of the installed ``warc2zim`` distribution."""
    distribution = pkg_resources.get_distribution("warc2zim")
    return distribution.version


def get_record_url(record):
    """Return the url to use for *record*.

    A record carrying a ``urlkey`` attribute (url converted from POST/PUT)
    is identified by that value; any other record falls back to its
    ``WARC-Target-URI`` WARC header.
    """
    if not hasattr(record, "urlkey"):
        # no converted url on this record: use the plain target url
        return record.rec_headers["WARC-Target-URI"]
    return record.urlkey


def get_record_mime_type(record):
    """Return the bare MIME type of *record*, without any parameter.

    The ``Content-Type`` value is read from the record's HTTP headers when
    it has some (eg. 'response' record) and from its WARC headers otherwise.
    Parameters such as ``; charset=utf-8`` are dropped, together with the
    optional whitespace that may surround the type itself.

    Returns an empty string when the ``Content-Type`` value is empty or None.
    """
    if record.http_headers:
        # if the record has HTTP headers, use the Content-Type from those
        headers = record.http_headers
    else:
        # otherwise, use the Content-Type from WARC headers
        headers = record.rec_headers

    content_type = headers["Content-Type"] or ""
    # whitespace is allowed around the ";" separator ("text/html ; charset=x"),
    # so strip it to return a clean type/subtype
    return content_type.split(";")[0].strip()


def parse_title(content):
    """Return the text of the ``<title>`` element found in *content*.

    This is best effort: any failure while parsing the content or reaching
    its title (for instance a document without title) gives an empty string.
    """
    try:
        title = BeautifulSoup(content, "html.parser").title.text
    except Exception:
        return ""
    return title if title else ""
6 changes: 2 additions & 4 deletions tests/test_warc_to_zim.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,11 @@

from warc2zim.url_rewriting import canonicalize
from warc2zim.converter import iter_warc_records
from warc2zim.items import get_record_url
from warc2zim.utils import get_record_url

# Import last to not mask warc2zim module
# Import last to not mask the warc2zim module
from warc2zim.main import warc2zim



TEST_DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")


Expand Down

0 comments on commit 57da2b2

Please sign in to comment.