Skip to content

Commit

Permalink
Introduce utils.py module to store small helpers.
Browse files Browse the repository at this point in the history
  • Loading branch information
mgautierfr committed Nov 14, 2023
1 parent 6fa485b commit 94a9ce9
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 34 deletions.
7 changes: 2 additions & 5 deletions src/warc2zim/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@
from cdxj_indexer import iter_file_or_dir, buffering_record_iter

from .url_rewriting import FUZZY_RULES, canonicalize
from .items import WARCHeadersItem, WARCPayloadItem, StaticArticle, get_record_url, get_record_mime_type, parse_title
from .items import WARCHeadersItem, WARCPayloadItem, StaticArticle
from .utils import get_version, get_record_url, get_record_mime_type, parse_title

# Shared logger
logger = logging.getLogger("warc2zim.converter")
Expand Down Expand Up @@ -548,7 +549,3 @@ def iter_warc_records(inputs):
for record in buffering_record_iter(ArchiveIterator(fh), post_append=True):
if record.rec_type in ("resource", "response", "revisit"):
yield record


def get_version():
    """Return the version string of the "warc2zim" distribution."""
    distribution = pkg_resources.get_distribution("warc2zim")
    return distribution.version
30 changes: 1 addition & 29 deletions src/warc2zim/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from zimscraperlib.zim.providers import StringProvider

from .url_rewriting import canonicalize
from .utils import get_record_url, get_record_mime_type, parse_title

# Shared logger
logger = logging.getLogger("warc2zim.items")
Expand Down Expand Up @@ -121,32 +122,3 @@ def get_mimetype(self):

def get_hints(self):
    """Return the hints mapping, with ``Hint.FRONT_ARTICLE`` set to False."""
    hints = {Hint.FRONT_ARTICLE: False}
    return hints


def get_record_url(record):
    """Return the url to use for ``record``.

    When the record carries a ``urlkey`` attribute (url converted from
    POST/PUT), that value is returned; otherwise the ``WARC-Target-URI``
    WARC header is used.
    """
    try:
        return record.urlkey
    except AttributeError:
        # No converted url on this record: fall back to the target url.
        return record.rec_headers["WARC-Target-URI"]


def get_record_mime_type(record):
    """Return the bare mime type of ``record``, without any parameter.

    The ``Content-Type`` is read from the HTTP headers when the record has
    some (eg. 'response' record) and from the WARC headers otherwise.
    Everything after the first ``;`` (charset, boundary, ...) is dropped and
    surrounding whitespace is removed, so ``"text/html ; charset=utf-8"``
    gives ``"text/html"``. A ``None`` or empty header value gives ``""``.
    """
    http_headers = record.http_headers
    headers = http_headers if http_headers else record.rec_headers

    content_type = headers["Content-Type"] or ""
    # Optional whitespace is allowed around the ";" separator; strip it so
    # callers can compare the result against plain mime type strings.
    return content_type.split(";", 1)[0].strip()


def parse_title(content):
    """Return the text of the ``<title>`` element found in ``content``.

    This is best-effort: any failure while parsing or reading the title
    (including a document without a title) gives an empty string.
    """
    try:
        title_tag = BeautifulSoup(content, "html.parser").title
        title = title_tag.text
    except Exception:
        return ""
    return title or ""
39 changes: 39 additions & 0 deletions src/warc2zim/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: ai ts=4 sts=4 et sw=4 nu

import pkg_resources
from bs4 import BeautifulSoup


def get_version():
    """Return the version string of the "warc2zim" distribution."""
    distribution = pkg_resources.get_distribution("warc2zim")
    return distribution.version


def get_record_url(record):
    """Return the url to use for ``record``.

    When the record carries a ``urlkey`` attribute (url converted from
    POST/PUT), that value is returned; otherwise the ``WARC-Target-URI``
    WARC header is used.
    """
    try:
        return record.urlkey
    except AttributeError:
        # No converted url on this record: fall back to the target url.
        return record.rec_headers["WARC-Target-URI"]


def get_record_mime_type(record):
    """Return the bare mime type of ``record``, without any parameter.

    The ``Content-Type`` is read from the HTTP headers when the record has
    some (eg. 'response' record) and from the WARC headers otherwise.
    Everything after the first ``;`` (charset, boundary, ...) is dropped and
    surrounding whitespace is removed, so ``"text/html ; charset=utf-8"``
    gives ``"text/html"``. A ``None`` or empty header value gives ``""``.
    """
    http_headers = record.http_headers
    headers = http_headers if http_headers else record.rec_headers

    content_type = headers["Content-Type"] or ""
    # Optional whitespace is allowed around the ";" separator; strip it so
    # callers can compare the result against plain mime type strings.
    return content_type.split(";", 1)[0].strip()


def parse_title(content):
    """Return the text of the ``<title>`` element found in ``content``.

    This is best-effort: any failure while parsing or reading the title
    (including a document without a title) gives an empty string.
    """
    try:
        title_tag = BeautifulSoup(content, "html.parser").title
        title = title_tag.text
    except Exception:
        return ""
    return title or ""

0 comments on commit 94a9ce9

Please sign in to comment.