Skip to content

Commit

Permalink
Move creator items into items.py.
Browse files Browse the repository at this point in the history
  • Loading branch information
mgautierfr committed Nov 14, 2023
1 parent 28df4eb commit 9ea4ffd
Show file tree
Hide file tree
Showing 2 changed files with 154 additions and 140 deletions.
152 changes: 152 additions & 0 deletions src/warc2zim/items.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: ai ts=4 sts=4 et sw=4 nu

""" warc2zim's item classes
This module contains the different Items we may want to add to a Zim archive.
"""

import logging
import re

import pkg_resources
from bs4 import BeautifulSoup
from libzim.writer import Hint
from zimscraperlib.types import get_mime_for_name
from zimscraperlib.zim.items import StaticItem
from zimscraperlib.zim.providers import StringProvider

from .url_rewriting import canonicalize

# Shared logger
logger = logging.getLogger("warc2zim.items")

# external sw.js filename
SW_JS = "sw.js"

HEAD_INS = re.compile(b"(<head>)", re.I)
CSS_INS = re.compile(b"(</head>)", re.I)


class WARCHeadersItem(StaticItem):
    """Item holding a record's WARC headers, plus its HTTP headers if any.

    Entries are written under the ``H/`` path prefix.
    """

    def __init__(self, record):
        super().__init__()
        self.record = record
        self.url = get_record_url(record)

    def get_path(self):
        canonical_url = canonicalize(self.url)
        return "H/" + canonical_url

    def get_title(self):
        # header entries carry no title
        return ""

    def get_mimetype(self):
        return "application/warc-headers"

    def get_hints(self):
        # header entries are never front articles
        return {Hint.FRONT_ARTICLE: False}

    def get_contentprovider(self):
        record = self.record
        # WARC headers always come first
        payload = record.rec_headers.to_bytes(encoding="utf-8")
        # HTTP headers are appended only when the record has some
        http_headers = record.http_headers
        if http_headers:
            payload = payload + http_headers.to_bytes(encoding="utf-8")

        return StringProvider(content=payload, ref=self)


class WARCPayloadItem(StaticItem):
    """Item holding the payload of a WARC record.

    Entries are written under the ``A/`` path prefix. For ``text/html``
    payloads the title is extracted with ``parse_title`` and the optional
    ``head_insert`` / ``css_insert`` substitutions are applied.
    """

    def __init__(self, record, head_insert=None, css_insert=None):
        super().__init__()
        self.record = record
        self.url = get_record_url(record)
        self.mimetype = get_record_mime_type(record)
        self.title = ""

        # use the buffered stream (rewound first) when the record has one
        if not hasattr(record, "buffered_stream"):
            self.content = record.content_stream().read()
        else:
            buffered = record.buffered_stream
            buffered.seek(0)
            self.content = buffered.read()

        # everything below only applies to HTML payloads
        if not self.mimetype.startswith("text/html"):
            return

        self.title = parse_title(self.content)
        # head insert is applied first, then the css insert
        for pattern, insert in ((HEAD_INS, head_insert), (CSS_INS, css_insert)):
            if insert:
                self.content = pattern.sub(insert, self.content)

    def get_path(self):
        canonical_url = canonicalize(self.url)
        return "A/" + canonical_url

    def get_title(self):
        return self.title

    def get_hints(self):
        # only HTML payloads are flagged as front articles
        return {Hint.FRONT_ARTICLE: self.mimetype.startswith("text/html")}


class StaticArticle(StaticItem):
    """Item for a file shipped with warc2zim, stored under ``A/<filename>``.

    Every file except ``SW_JS`` is rendered as a template obtained from
    ``env``, with ``MAIN_URL`` set to ``main_url``. ``SW_JS`` is read
    unrendered from the package's ``templates/`` resources.
    """

    def __init__(self, env, filename, main_url, **kwargs):
        super().__init__(**kwargs)
        self.filename = filename
        self.main_url = main_url

        # fall back to a generic binary type when none is found for the name
        self.mime = get_mime_for_name(filename) or "application/octet-stream"

        if filename == SW_JS:
            raw = pkg_resources.resource_string(
                "warc2zim", "templates/" + filename
            )
            self.content = raw.decode("utf-8")
        else:
            self.content = env.get_template(filename).render(
                MAIN_URL=self.main_url
            )

    def get_path(self):
        return "A/" + self.filename

    def get_mimetype(self):
        return self.mime

    def get_hints(self):
        # bundled files are never front articles
        return {Hint.FRONT_ARTICLE: False}


def get_record_url(record):
    """Return the url to use for ``record``.

    A record carrying a ``urlkey`` attribute (url converted from POST/PUT)
    yields that value; any other record yields its ``WARC-Target-URI``
    WARC header.
    """
    try:
        return record.urlkey
    except AttributeError:
        # no converted url on this record: use the target URI below
        pass
    return record.rec_headers["WARC-Target-URI"]


def get_record_mime_type(record):
    """Return the mime type of ``record``, without any ``;`` parameters.

    The ``Content-Type`` is read from the HTTP headers when the record has
    some (eg. 'response' record), otherwise from the WARC headers. A missing
    or empty value yields ``""``.
    """
    headers = record.http_headers or record.rec_headers
    content_type = headers["Content-Type"]
    if not content_type:
        content_type = ""
    # keep only what precedes the first ";" (drops eg. the charset)
    return content_type.split(";")[0]


def parse_title(content):
    """Return the text of the ``<title>`` of an HTML document.

    ``content`` is parsed with BeautifulSoup's ``html.parser``. The result is
    ``""`` when the document has no ``<title>``, when the title is empty, or
    when parsing fails. Title extraction is best-effort and never raises.

    NOTE: ``BeautifulSoup`` must be imported at module level. Without that
    import every call hit a ``NameError`` that the blanket ``except`` below
    swallowed, so no title was ever extracted.
    """
    try:
        soup = BeautifulSoup(content, "html.parser")
        title_tag = soup.title
        if title_tag is None:
            # document without a <title> element
            return ""
        return title_tag.text or ""
    except Exception:
        # stay best-effort, but leave a trace instead of hiding the failure
        logger.debug("Unable to parse title from content", exc_info=True)
        return ""
142 changes: 2 additions & 140 deletions src/warc2zim/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,25 +35,23 @@

import pkg_resources
import requests
from libzim.writer import Hint
from warcio import ArchiveIterator, StatusAndHeaders
from warcio.recordbuilder import RecordBuilder
from zimscraperlib.constants import DEFAULT_DEV_ZIM_METADATA
from zimscraperlib.download import stream_file
from zimscraperlib.types import get_mime_for_name
from zimscraperlib.i18n import setlocale, get_language_details, Locale
from zimscraperlib.image.convertion import convert_image
from zimscraperlib.image.transformation import resize_image
from zimscraperlib.zim.creator import Creator
from zimscraperlib.zim.items import StaticItem, URLItem
from zimscraperlib.zim.providers import StringProvider
from zimscraperlib.zim.items import URLItem
from bs4 import BeautifulSoup

from jinja2 import Environment, PackageLoader

from cdxj_indexer import iter_file_or_dir, buffering_record_iter

from .url_rewriting import FUZZY_RULES, canonicalize
from .items import WARCHeadersItem, WARCPayloadItem, StaticArticle, get_record_url, get_record_mime_type, parse_title

# Shared logger
logger = logging.getLogger("warc2zim")
Expand All @@ -67,11 +65,6 @@
# head insert template
HEAD_INSERT_FILE = "sw_check.html"


HEAD_INS = re.compile(b"(<head>)", re.I)
CSS_INS = re.compile(b"(</head>)", re.I)


# Default ZIM metadata tags
DEFAULT_TAGS = ["_ftindex:yes", "_category:other", "_sw:yes"]

Expand All @@ -84,105 +77,6 @@
re.MULTILINE | re.DOTALL,
)


# ============================================================================
class WARCHeadersItem(StaticItem):
    """Item holding a record's WARC headers, plus its HTTP headers if any.

    Entries are written under the ``H/`` path prefix.
    """

    def __init__(self, record):
        super().__init__()
        self.record = record
        self.url = get_record_url(record)

    def get_path(self):
        canonical_url = canonicalize(self.url)
        return "H/" + canonical_url

    def get_title(self):
        # header entries carry no title
        return ""

    def get_mimetype(self):
        return "application/warc-headers"

    def get_hints(self):
        # header entries are never front articles
        return {Hint.FRONT_ARTICLE: False}

    def get_contentprovider(self):
        record = self.record
        # WARC headers always come first
        payload = record.rec_headers.to_bytes(encoding="utf-8")
        # HTTP headers are appended only when the record has some
        http_headers = record.http_headers
        if http_headers:
            payload = payload + http_headers.to_bytes(encoding="utf-8")

        return StringProvider(content=payload, ref=self)


# ============================================================================
class WARCPayloadItem(StaticItem):
    """Item holding the payload of a WARC record.

    Entries are written under the ``A/`` path prefix. For ``text/html``
    payloads the title is extracted with ``parse_title`` and the optional
    ``head_insert`` / ``css_insert`` substitutions are applied.
    """

    def __init__(self, record, head_insert=None, css_insert=None):
        super().__init__()
        self.record = record
        self.url = get_record_url(record)
        self.mimetype = get_record_mime_type(record)
        self.title = ""

        # use the buffered stream (rewound first) when the record has one
        if not hasattr(record, "buffered_stream"):
            self.content = record.content_stream().read()
        else:
            buffered = record.buffered_stream
            buffered.seek(0)
            self.content = buffered.read()

        # everything below only applies to HTML payloads
        if not self.mimetype.startswith("text/html"):
            return

        self.title = parse_title(self.content)
        # head insert is applied first, then the css insert
        for pattern, insert in ((HEAD_INS, head_insert), (CSS_INS, css_insert)):
            if insert:
                self.content = pattern.sub(insert, self.content)

    def get_path(self):
        canonical_url = canonicalize(self.url)
        return "A/" + canonical_url

    def get_title(self):
        return self.title

    def get_hints(self):
        # only HTML payloads are flagged as front articles
        return {Hint.FRONT_ARTICLE: self.mimetype.startswith("text/html")}


# ============================================================================
class StaticArticle(StaticItem):
    """Item for a file shipped with warc2zim, stored under ``A/<filename>``.

    Every file except ``SW_JS`` is rendered as a template obtained from
    ``env``, with ``MAIN_URL`` set to ``main_url``. ``SW_JS`` is read
    unrendered from the package's ``templates/`` resources.
    """

    def __init__(self, env, filename, main_url, **kwargs):
        super().__init__(**kwargs)
        self.filename = filename
        self.main_url = main_url

        # fall back to a generic binary type when none is found for the name
        self.mime = get_mime_for_name(filename) or "application/octet-stream"

        if filename == SW_JS:
            raw = pkg_resources.resource_string(
                "warc2zim", "templates/" + filename
            )
            self.content = raw.decode("utf-8")
        else:
            self.content = env.get_template(filename).render(
                MAIN_URL=self.main_url
            )

    def get_path(self):
        return "A/" + self.filename

    def get_mimetype(self):
        return self.mime

    def get_hints(self):
        # bundled files are never front articles
        return {Hint.FRONT_ARTICLE: False}


# ============================================================================
class WARC2Zim:
def __init__(self, args):
Expand Down Expand Up @@ -648,38 +542,6 @@ def add_fuzzy_match_record(self, url):
logger.debug("Adding fuzzy redirect {0} -> {1}".format(fuzzy_url, url))


# ============================================================================
def get_record_url(record):
    """Return the url to use for ``record``.

    A record carrying a ``urlkey`` attribute (url converted from POST/PUT)
    yields that value; any other record yields its ``WARC-Target-URI``
    WARC header.
    """
    try:
        return record.urlkey
    except AttributeError:
        # no converted url on this record: use the target URI below
        pass
    return record.rec_headers["WARC-Target-URI"]


# ============================================================================
def get_record_mime_type(record):
    """Return the mime type of ``record``, without any ``;`` parameters.

    The ``Content-Type`` is read from the HTTP headers when the record has
    some (eg. 'response' record), otherwise from the WARC headers. A missing
    or empty value yields ``""``.
    """
    headers = record.http_headers or record.rec_headers
    content_type = headers["Content-Type"]
    if not content_type:
        content_type = ""
    # keep only what precedes the first ";" (drops eg. the charset)
    return content_type.split(";")[0]


# ============================================================================
def parse_title(content):
    """Return the text of the HTML ``<title>`` found in ``content``.

    Best-effort: any failure (including a document without a title)
    yields an empty string.
    """
    try:
        title_text = BeautifulSoup(content, "html.parser").title.text
    except Exception:
        return ""
    return title_text or ""


# ============================================================================
def iter_warc_records(inputs):
"""iter warc records, including appending request data to matching response"""
Expand Down

0 comments on commit 9ea4ffd

Please sign in to comment.