Properly sort WARC files and log total number of WARCs found

openzim · Jul 24, 2024 · 6d6b0ba · 6d6b0ba
1 parent 71c0328
commit 6d6b0ba
Show file tree

Hide file tree

Showing 3 changed files with 72 additions and 6 deletions.
diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py
@@ -188,6 +188,15 @@ def __init__(self, args):
         self.failed_content_path.mkdir(parents=True, exist_ok=True)
 
         self.inputs = args.inputs
+
+        # sort by filename (not full path) alphabetically so that WARCs are generally
+        # processed in crawl-time order (true at least when browsertrix crawler is
+        # used with zimit; not verified for **pure** warc2zim scenarios)
+        self.warc_files = sorted(
+            iter_file_or_dir(self.inputs), key=lambda filename: Path(filename).name
+        )
+        logger.debug(f"{len(self.warc_files)} WARC files found")
+
         self.include_domains = args.include_domains
 
         self.custom_css = args.custom_css
@@ -336,7 +345,7 @@ def run(self):
         if self.custom_css:
             self.add_custom_css_item()
 
-        for record in iter_warc_records(self.inputs):
+        for record in iter_warc_records(self.warc_files):
             try:
                 self.add_items_for_warc_record(record)
             except Exception as exc:
@@ -390,7 +399,7 @@ def run(self):
 
     def gather_information_from_warc(self):
         main_page_found = False
-        for record in iter_warc_records(self.inputs):
+        for record in iter_warc_records(self.warc_files):
 
             # only response records can be considered as main_path and as existing ZIM
             # path
@@ -650,7 +659,7 @@ def retrieve_illustration(self):
 
         if self.favicon_url or self.favicon_path:
             # look into WARC records
-            for record in iter_warc_records(self.inputs):
+            for record in iter_warc_records(self.warc_files):
                 if record.rec_type != "response":
                     continue
                 url = get_record_url(record)
@@ -805,9 +814,9 @@ def add_items_for_warc_record(self, record):
             )
 
 
-def iter_warc_records(inputs):
+def iter_warc_records(warc_files):
     """iter warc records, including appending request data to matching response"""
-    for filename in iter_file_or_dir(inputs):
+    for filename in warc_files:
         with open(filename, "rb") as fh:
             for record in buffering_record_iter(ArchiveIterator(fh), post_append=True):
                 if record and record.rec_type in ("resource", "response", "revisit"):

diff --git a/src/warc2zim/main.py b/src/warc2zim/main.py
@@ -8,7 +8,7 @@
 from warc2zim.utils import get_version
 
 
-def main(raw_args=None):
+def _create_arguments_parser() -> ArgumentParser:
     parser = ArgumentParser(description="Create ZIM files from WARC files")
 
     parser.add_argument("-V", "--version", action="version", version=get_version())
@@ -141,6 +141,11 @@ def main(raw_args=None):
         default=False,
     )
 
+    return parser
+
+
+def main(raw_args=None):  # CLI entry point: parse raw_args, then run the Converter
+    parser = _create_arguments_parser()  # parser is built by a separate helper
     args = parser.parse_args(args=raw_args)
     converter = Converter(args)
     return converter.run()

diff --git a/tests/test_converter.py b/tests/test_converter.py
@@ -0,0 +1,52 @@
+import tempfile
+
+import pytest
+
+from warc2zim.converter import Converter
+from warc2zim.main import _create_arguments_parser
+
+
+@pytest.mark.parametrize(
+    "inputs, warc_files",
+    [
+        pytest.param([], [], id="empty_array"),
+        pytest.param(["foo.warc.gz"], ["foo.warc.gz"], id="one_file"),
+        pytest.param(
+            [
+                "rec-f9c30d949953-20240724035746176-0.warc.gz",
+                "rec-f9c30d949953-20240724045846176-0.warc.gz",
+            ],
+            None,  # no change
+            id="two_already_sorted",
+        ),
+        pytest.param(
+            [
+                "rec-f9c30d949953-20240724045846176-0.warc.gz",
+                "rec-f9c30d949953-20240724035746176-0.warc.gz",
+            ],
+            [
+                "rec-f9c30d949953-20240724035746176-0.warc.gz",
+                "rec-f9c30d949953-20240724045846176-0.warc.gz",
+            ],
+            id="two_not_sorted",
+        ),
+        pytest.param(
+            [
+                "aaaa/rec-f9c30d949953-20240724045846176-0.warc.gz",
+                "bbb/rec-f9c30d949953-20240724035746176-0.warc.gz",
+            ],
+            [
+                "bbb/rec-f9c30d949953-20240724035746176-0.warc.gz",
+                "aaaa/rec-f9c30d949953-20240724045846176-0.warc.gz",
+            ],
+            id="two_not_sorted_in_random_unsorted_dirs",
+        ),
+    ],
+)
+def test_sort_warc_files(inputs, warc_files):
+    parser = _create_arguments_parser()
+    tmpdir = tempfile.mkdtemp()
+    args = parser.parse_args(["--name", "foo", "--output", tmpdir])
+    args.inputs = inputs
+    conv = Converter(args)
+    assert conv.warc_files == (warc_files if warc_files else inputs)