Skip to content

Commit

Permalink
Properly sort WARC files and log total number of WARCs found
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Jul 24, 2024
1 parent 71c0328 commit 6d6b0ba
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 6 deletions.
19 changes: 14 additions & 5 deletions src/warc2zim/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,15 @@ def __init__(self, args):
self.failed_content_path.mkdir(parents=True, exist_ok=True)

self.inputs = args.inputs

# Sort by filename (not full path) alphabetically so that WARC files are
# generally processed in crawl-time order (this holds at least when Browsertrix
# crawler is used with zimit; not sure about **pure** warc2zim scenarios)
self.warc_files = sorted(
iter_file_or_dir(self.inputs), key=lambda filename: Path(filename).name
)
logger.debug(f"{len(self.warc_files)} WARC files found")

self.include_domains = args.include_domains

self.custom_css = args.custom_css
Expand Down Expand Up @@ -336,7 +345,7 @@ def run(self):
if self.custom_css:
self.add_custom_css_item()

for record in iter_warc_records(self.inputs):
for record in iter_warc_records(self.warc_files):
try:
self.add_items_for_warc_record(record)
except Exception as exc:
Expand Down Expand Up @@ -390,7 +399,7 @@ def run(self):

def gather_information_from_warc(self):
main_page_found = False
for record in iter_warc_records(self.inputs):
for record in iter_warc_records(self.warc_files):

# only response records can be considered as main_path and as existing ZIM
# path
Expand Down Expand Up @@ -650,7 +659,7 @@ def retrieve_illustration(self):

if self.favicon_url or self.favicon_path:
# look into WARC records
for record in iter_warc_records(self.inputs):
for record in iter_warc_records(self.warc_files):
if record.rec_type != "response":
continue
url = get_record_url(record)
Expand Down Expand Up @@ -805,9 +814,9 @@ def add_items_for_warc_record(self, record):
)


def iter_warc_records(inputs):
def iter_warc_records(warc_files):
"""iter warc records, including appending request data to matching response"""
for filename in iter_file_or_dir(inputs):
for filename in warc_files:
with open(filename, "rb") as fh:
for record in buffering_record_iter(ArchiveIterator(fh), post_append=True):
if record and record.rec_type in ("resource", "response", "revisit"):
Expand Down
7 changes: 6 additions & 1 deletion src/warc2zim/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from warc2zim.utils import get_version


def main(raw_args=None):
def _create_arguments_parser() -> ArgumentParser:
parser = ArgumentParser(description="Create ZIM files from WARC files")

parser.add_argument("-V", "--version", action="version", version=get_version())
Expand Down Expand Up @@ -141,6 +141,11 @@ def main(raw_args=None):
default=False,
)

return parser


def main(raw_args=None):
    """Parse command-line arguments and run the WARC-to-ZIM conversion.

    ``raw_args`` is forwarded as-is to the parser's ``parse_args``; the parsed
    namespace is used to build a ``Converter`` whose ``run()`` result is
    returned to the caller.
    """
    args = _create_arguments_parser().parse_args(args=raw_args)
    return Converter(args).run()
Expand Down
52 changes: 52 additions & 0 deletions tests/test_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import tempfile

import pytest

from warc2zim.converter import Converter
from warc2zim.main import _create_arguments_parser


@pytest.mark.parametrize(
    "inputs, warc_files",
    [
        pytest.param([], [], id="empty_array"),
        pytest.param(["foo.warc.gz"], ["foo.warc.gz"], id="one_file"),
        pytest.param(
            [
                "rec-f9c30d949953-20240724035746176-0.warc.gz",
                "rec-f9c30d949953-20240724045846176-0.warc.gz",
            ],
            None,  # no change
            id="two_already_sorted",
        ),
        pytest.param(
            [
                "rec-f9c30d949953-20240724045846176-0.warc.gz",
                "rec-f9c30d949953-20240724035746176-0.warc.gz",
            ],
            [
                "rec-f9c30d949953-20240724035746176-0.warc.gz",
                "rec-f9c30d949953-20240724045846176-0.warc.gz",
            ],
            id="two_not_sorted",
        ),
        pytest.param(
            [
                "aaaa/rec-f9c30d949953-20240724045846176-0.warc.gz",
                "bbb/rec-f9c30d949953-20240724035746176-0.warc.gz",
            ],
            [
                "bbb/rec-f9c30d949953-20240724035746176-0.warc.gz",
                "aaaa/rec-f9c30d949953-20240724045846176-0.warc.gz",
            ],
            id="two_not_sorted_in_random_unsorted_dirs",
        ),
    ],
)
def test_sort_warc_files(inputs, warc_files):
    """Check that ``Converter`` orders its WARC files by file name only.

    ``inputs`` is the list assigned to ``args.inputs``; ``warc_files`` is the
    expected value of ``Converter.warc_files``, or ``None`` when the order is
    expected to be left unchanged (i.e. equal to ``inputs``).
    """
    parser = _create_arguments_parser()
    # Use a self-cleaning temporary directory as output so that the test does
    # not leave one stray directory behind per parametrized case.
    with tempfile.TemporaryDirectory() as tmpdir:
        args = parser.parse_args(["--name", "foo", "--output", tmpdir])
        args.inputs = inputs
        conv = Converter(args)
        # Compare against None explicitly: an empty expected list is a valid
        # expectation and must not be confused with the "no change" sentinel.
        expected = warc_files if warc_files is not None else inputs
        assert conv.warc_files == expected

0 comments on commit 6d6b0ba

Please sign in to comment.