From 3cced6690e6be1c48d263dd9000d9b6524f845f7 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 20 Dec 2024 14:27:56 +0000 Subject: [PATCH] Remove unused code/tests + 'modernize' --- CHANGELOG.md | 1 + docs/software_architecture.md | 2 + pyproject.toml | 2 +- src/warc2zim/cdxj_indexer/__init__.py | 12 +- src/warc2zim/cdxj_indexer/amf.py | 84 --- src/warc2zim/cdxj_indexer/bufferiter.py | 16 +- src/warc2zim/cdxj_indexer/main.py | 544 +------------------ src/warc2zim/cdxj_indexer/postquery.py | 38 +- src/warc2zim/converter.py | 2 +- tests/cdxj_indexer/data/bad.arc | 11 - tests/cdxj_indexer/data/cc.warc.gz | Bin 6022 -> 0 bytes tests/cdxj_indexer/data/example.arc | 69 --- tests/cdxj_indexer/data/example.warc.gz | Bin 3829 -> 0 bytes tests/cdxj_indexer/data/missing-http.warc.gz | Bin 602 -> 0 bytes tests/cdxj_indexer/data/post-test-more.warc | Bin 3426 -> 0 bytes tests/cdxj_indexer/data/post-test.warc.gz | Bin 3593 -> 0 bytes tests/cdxj_indexer/test_indexer.py | 373 ------------- tests/cdxj_indexer/test_postappend.py | 21 +- 18 files changed, 44 insertions(+), 1131 deletions(-) delete mode 100644 src/warc2zim/cdxj_indexer/amf.py delete mode 100644 tests/cdxj_indexer/data/bad.arc delete mode 100644 tests/cdxj_indexer/data/cc.warc.gz delete mode 100644 tests/cdxj_indexer/data/example.arc delete mode 100644 tests/cdxj_indexer/data/example.warc.gz delete mode 100644 tests/cdxj_indexer/data/missing-http.warc.gz delete mode 100644 tests/cdxj_indexer/data/post-test-more.warc delete mode 100644 tests/cdxj_indexer/data/post-test.warc.gz delete mode 100644 tests/cdxj_indexer/test_indexer.py diff --git a/CHANGELOG.md b/CHANGELOG.md index eb85a145..3629f037 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Upgrade to wombat 3.8.6 (#334) - Fix wombat setup settings (especially `isSW`) (#293) +- Fork cdxj_indexer codebase (#428) ### Fixed diff --git a/docs/software_architecture.md b/docs/software_architecture.md index 4a12b56c..afe2d8c6 100644 --- a/docs/software_architecture.md +++ b/docs/software_architecture.md @@ -35,6 +35,8 @@ It provide two main features: Except that, scraper directly uses WarcRecord (returned by cdxj_indexer, implemented in warcio) to access metadata and such. +cdxj_indexer usefull methods are currently forked in warc2zim, see https://github.com/openzim/warc2zim/pull/428 for details. + ## zimscraperlib [zimscraperlib Python library](https://pypi.org/project/zimscraperlib) is used for ZIM operations. diff --git a/pyproject.toml b/pyproject.toml index 8e0d7b34..ca2af13d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,11 +17,11 @@ dependencies = [ "jinja2==3.1.4", # also update version in build-system above and in build_js.sh # to support possible brotli content in warcs, must be added separately "brotlipy==0.7.0", - "cdxj_indexer==1.4.6", "tinycss2==1.4.0", "beautifulsoup4==4.12.3", # used to parse base href "lxml==5.3.0", # used by beautifulsoup4 for parsing html "python-dateutil==2.9.0.post0", + "multipart==1.2.1", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] diff --git a/src/warc2zim/cdxj_indexer/__init__.py b/src/warc2zim/cdxj_indexer/__init__.py index ab718151..14245da7 100644 --- a/src/warc2zim/cdxj_indexer/__init__.py +++ b/src/warc2zim/cdxj_indexer/__init__.py @@ -1,3 +1,9 @@ -from cdxj_indexer.main import CDXJIndexer, iter_file_or_dir -from cdxj_indexer.postquery import append_method_query_from_req_resp -from cdxj_indexer.bufferiter import buffering_record_iter +from warc2zim.cdxj_indexer.bufferiter import buffering_record_iter +from warc2zim.cdxj_indexer.main import iter_file_or_dir +from warc2zim.cdxj_indexer.postquery import append_method_query_from_req_resp + +__all__ = [ + "append_method_query_from_req_resp", + "buffering_record_iter", + "iter_file_or_dir", +] diff --git a/src/warc2zim/cdxj_indexer/amf.py b/src/warc2zim/cdxj_indexer/amf.py deleted file mode 100644 index 111fb325..00000000 --- a/src/warc2zim/cdxj_indexer/amf.py +++ /dev/null @@ -1,84 +0,0 @@ -import json -import six -from pyamf.remoting import Envelope, Request, decode -from pyamf.flex.messaging import RemotingMessage -from io import BytesIO -from six.moves.urllib.parse import urlencode - - -class Amf: - @staticmethod - def get_representation(request_object, max_calls=500): - - max_calls = max_calls - 1 - - if max_calls < 0: - raise Exception("Amf.get_representation maximum number of calls reached") - - if isinstance(request_object, Envelope): - # Remove order of Request - bodies = [] - for i in request_object.bodies: - bodies.append(Amf.get_representation(i[1], max_calls)) - bodies = sorted(bodies) - - return "{bodies}".format( - bodies="[" + ",".join(bodies) + "]" - ) - - elif isinstance(request_object, Request): - # Remove cyclic reference - target = request_object.target - body = Amf.get_representation(request_object.body, max_calls) - return "{body}".format(**locals()) - - elif isinstance(request_object, RemotingMessage): - # Remove random properties - operation = request_object.operation - body = Amf.get_representation(request_object.body, max_calls) - return "{body}".format( - **locals() - ) - - elif isinstance(request_object, dict): - return json.dumps(request_object, sort_keys=True) - - elif isinstance(request_object, list): - bodies = [] - for i in request_object: - bodies.append(Amf.get_representation(i, max_calls)) - return "[" + ",".join(bodies) + "]" - - elif isinstance(request_object, six.string_types): - return request_object - - elif request_object is None: - return "" - - elif isinstance(request_object, object) and hasattr(request_object, "__dict__"): - classname = request_object.__class__.__name__ - properties = request_object.__dict__ - bodies = dict() - for prop in properties: - bodies[prop] = Amf.get_representation( - getattr(request_object, prop), max_calls - ) - bodies = Amf.get_representation(bodies, max_calls) - - return "<{classname}>{bodies}".format(**locals()) - - else: - return repr(request_object) - - -def amf_parse(string): - try: - res = decode(BytesIO(string)) - return urlencode({"request": Amf.get_representation(res)}) - - except Exception as e: - import traceback - - traceback.print_exc() - print(e) - return None diff --git a/src/warc2zim/cdxj_indexer/bufferiter.py b/src/warc2zim/cdxj_indexer/bufferiter.py index 52bff534..9407e313 100644 --- a/src/warc2zim/cdxj_indexer/bufferiter.py +++ b/src/warc2zim/cdxj_indexer/bufferiter.py @@ -2,15 +2,14 @@ import shutil import tempfile -from cdxj_indexer.postquery import append_method_query_from_req_resp - +from warc2zim.cdxj_indexer.postquery import append_method_query_from_req_resp BUFF_SIZE = 1024 * 64 # ============================================================================ def buffering_record_iter( - record_iter, post_append=False, digest_reader=None, url_key_func=None + record_iter, digest_reader=None, url_key_func=None, *, post_append=False ): prev_record = None @@ -30,10 +29,8 @@ def buffering_record_iter( if digest_length != record.file_length: raise Exception( - "Digest block mismatch, expected {0}, got {1}".format( - record.file_length, - digest_length, - ) + f"Digest block mismatch, expected {record.file_length}, " + f"got {digest_length}" ) record.record_digest = record_digest @@ -50,7 +47,8 @@ def buffering_record_iter( join_req_resp(req, resp, post_append, url_key_func) yield prev_record - prev_record.buffered_stream.close() + if prev_record: + prev_record.buffered_stream.close() yield record record.buffered_stream.close() prev_record = None @@ -107,7 +105,7 @@ def join_req_resp(req, resp, post_append, url_key_func=None): method = req.http_headers.protocol if post_append and method.upper() in ("POST", "PUT"): url = req.rec_headers.get_header("WARC-Target-URI") - query, append_str = append_method_query_from_req_resp(req, resp) + query, append_str = append_method_query_from_req_resp(req) resp.method = method.upper() resp.requestBody = query resp.urlkey = url + append_str diff --git a/src/warc2zim/cdxj_indexer/main.py b/src/warc2zim/cdxj_indexer/main.py index 208a7cb3..8f9fab2d 100644 --- a/src/warc2zim/cdxj_indexer/main.py +++ b/src/warc2zim/cdxj_indexer/main.py @@ -1,553 +1,17 @@ -from __future__ import absolute_import -import json -import surt -import logging import os -import re -import sys -import zlib -import hashlib -import heapq -from argparse import ArgumentParser, RawTextHelpFormatter -from io import BytesIO -from copy import copy -from tempfile import NamedTemporaryFile - - -from warcio.indexer import Indexer -from warcio.timeutils import iso_date_to_timestamp -from warcio.warcwriter import BufferWARCWriter -from warcio.archiveiterator import ArchiveIterator -from warcio.utils import open_or_default - -from cdxj_indexer.bufferiter import buffering_record_iter, BUFF_SIZE - - -# ============================================================================ -class CDXJIndexer(Indexer): - field_names = { - "warc-target-uri": "url", - "http:status": "status", - "warc-payload-digest": "digest", - "req.http:referer": "referrer", - "req.http:method": "method", - "record-digest": "recordDigest", - } - - inv_field_names = {k: v for v, k in field_names.items()} - - DEFAULT_FIELDS = [ - "warc-target-uri", - "mime", - "http:status", - "warc-payload-digest", - "length", - "offset", - "filename", - ] - - DEFAULT_RECORDS = ["response", "revisit", "resource", "metadata"] - - ALLOWED_EXT = (".arc", ".arc.gz", ".warc", ".warc.gz") - - RE_SPACE = re.compile(r"[;\s]") - - DEFAULT_NUM_LINES = 300 - - def __init__( - self, - output, - inputs, - post_append=False, - sort=False, - compress=None, - lines=DEFAULT_NUM_LINES, - max_sort_buff_size=None, - data_out_name=None, - filename=None, - fields=None, - replace_fields=None, - records=None, - verify_http=False, - dir_root=None, - digest_records=False, - **kwargs - ): - - if isinstance(inputs, str) or hasattr(inputs, "read"): - inputs = [inputs] - - inputs = iter_file_or_dir(inputs) - - self.digest_records = digest_records - fields = self._parse_fields(fields, replace_fields) - - super(CDXJIndexer, self).__init__( - fields, inputs, output, verify_http=verify_http - ) - self.writer = None - - self.curr_filename = None - self.force_filename = filename - self.post_append = post_append - self.dir_root = dir_root - - self.num_lines = lines - self.max_sort_buff_size = max_sort_buff_size - self.sort = sort - self.compress = compress - self.data_out_name = data_out_name - - self.include_records = records - if self.include_records == "all": - self.include_records = None - elif self.include_records: - self.include_records = self.include_records.split(",") - else: - self.include_records = self.DEFAULT_RECORDS - - self.collect_records = self.post_append or any( - field.startswith("req.http:") for field in self.fields - ) - self.record_parse = True - - def _parse_fields(self, fields=None, replace_fields=None): - add_fields = replace_fields - if add_fields: - fields = [] - else: - add_fields = fields - fields = copy(self.DEFAULT_FIELDS) - - if self.digest_records and "record-digest" not in fields: - fields.append("record-digest") - - if add_fields: - add_fields = add_fields.split(",") - for field in add_fields: - fields.append(self.inv_field_names.get(field, field)) - - return fields - - def get_field(self, record, name, it, filename): - if name == "mime": - if record.rec_type == "revisit": - return "warc/revisit" - elif record.rec_type in ("response", "request"): - name = "http:content-type" - else: - name = "content-type" - - value = super(CDXJIndexer, self).get_field(record, name, it, filename) - if value: - value = self.RE_SPACE.split(value, 1)[0].strip() - - return value - - if name == "filename": - return self.curr_filename - - if self.collect_records: - if name == "offset": - return str(record.file_offset) - elif name == "length": - return str(record.file_length) - elif name == "record-digest": - return str(record.record_digest) - elif name.startswith("req.http:"): - value = self._get_req_field(name, record) - if value: - return value - - value = super(CDXJIndexer, self).get_field(record, name, it, filename) - - if name == "warc-payload-digest": - value = self._get_digest(record, name) - - return value - - def _get_req_field(self, name, record): - if hasattr(record, "req"): - req = record.req - elif record.rec_type == "request": - req = record - else: - return None - - if name == "req.http:method": - return req.http_headers.protocol - else: - return req.http_headers.get_header(name[9:]) - - def process_all(self): - data_out = None - - with open_or_default(self.output, "wt", sys.stdout) as fh: - if self.compress: - if isinstance(self.compress, str): - data_out = open(self.compress, "wb") - if os.path.splitext(self.compress)[1] == "": - self.compress += ".cdxj.gz" - - fh = CompressedWriter( - fh, - data_out=data_out, - data_out_name=self.compress, - num_lines=self.num_lines, - digest_records=self.digest_records, - ) - else: - fh = CompressedWriter( - fh, - data_out=self.compress, - data_out_name=self.data_out_name, - num_lines=self.num_lines, - digest_records=self.digest_records, - ) - - if self.sort: - fh = SortingWriter(fh, self.max_sort_buff_size) - - self.output = fh - - super().process_all() - - if self.sort or self.compress: - fh.flush() - if data_out: - data_out.close() - - def _resolve_rel_path(self, filename): - if not self.dir_root: - return os.path.basename(filename) - - path = os.path.relpath(filename, self.dir_root) - if os.path.sep != "/": # pragma: no cover - path = path.replace(os.path.sep, "/") - return path - - def process_one(self, input_, output, filename): - self.curr_filename = self.force_filename or self._resolve_rel_path(filename) - - it = self._create_record_iter(input_) - - self._write_header(output, filename) - - if self.collect_records: - digest_reader = input_ if self.digest_records else None - wrap_it = buffering_record_iter( - it, - post_append=self.post_append, - digest_reader=digest_reader, - url_key_func=self.get_url_key, - ) - else: - wrap_it = it - - for record in wrap_it: - if not self.include_records or self.filter_record(record): - self.process_index_entry(it, record, filename, output) - - def filter_record(self, record): - if not record.rec_type in self.include_records: - return False - - if ( - self.include_records == self.DEFAULT_RECORDS - and record.rec_type in ("resource", "metadata") - and record.rec_headers.get_header("Content-Type") - == "application/warc-fields" - ): - return False - - return True - - def _get_digest(self, record, name): - value = record.rec_headers.get(name) - if not value: - if not self.writer: - self.writer = BufferWARCWriter() - - self.writer.ensure_digest(record, block=False, payload=True) - value = record.rec_headers.get(name) - - return value - - def _write_line(self, out, index, record, filename): - url = index.get("url") - if not url: - url = record.rec_headers.get("WARC-Target-URI") - - dt = record.rec_headers.get("WARC-Date") - - ts = iso_date_to_timestamp(dt) - - if hasattr(record, "urlkey"): - urlkey = record.urlkey - else: - urlkey = self.get_url_key(url) - - if hasattr(record, "requestBody"): - index["requestBody"] = record.requestBody - if hasattr(record, "method"): - index["method"] = record.method - - self._do_write(urlkey, ts, index, out) - - def _do_write(self, urlkey, ts, index, out): - out.write(urlkey + " " + ts + " " + json.dumps(index) + "\n") - - def get_url_key(self, url): - try: - return surt.surt(url) - except: # pragma: no coverage - return url - - -# ============================================================================ -class CDXLegacyIndexer(CDXJIndexer): - def _do_write(self, urlkey, ts, index, out): - index["urlkey"] = urlkey - index["timestamp"] = ts - - line = " ".join(index.get(field, "-") for field in self.CDX_FIELDS) - out.write(line + "\n") - - def _write_header(self, out, filename): - out.write(self.CDX_HEADER + "\n") - - def get_field(self, record, name, it, filename): - value = super().get_field(record, name, it, filename) - - if name == "warc-payload-digest": - value = value.split(":")[-1] - - return value - - -# ============================================================================ -class CDX11Indexer(CDXLegacyIndexer): - CDX_HEADER = " CDX N b a m s k r M S V g" - - CDX_FIELDS = [ - "urlkey", - "timestamp", - "url", - "mime", - "status", - "digest", - "redirect", - "meta", - "length", - "offset", - "filename", - ] - - -# ============================================================================ -class CDX09Indexer(CDXLegacyIndexer): - CDX_HEADER = " CDX N b a m s k r V g" - - CDX_FIELDS = [ - "urlkey", - "timestamp", - "url", - "mime", - "status", - "digest", - "redirect", - "offset", - "filename", - ] - - -# ============================================================================ -class SortingWriter: - MAX_SORT_BUFF_SIZE = 1024 * 1024 * 32 - - def __init__(self, out, max_sort_buff_size=None): - self.out = out - self.sortedlist = [] - self.count = 0 - self.max_sort_buff_size = max_sort_buff_size or self.MAX_SORT_BUFF_SIZE - - self.tmp_files = [] - - def write(self, line): - self.sortedlist.append(line) - self.count += len(line) - - if self.count > self.max_sort_buff_size: - self.tmp_files.append(self.write_to_temp()) - self.sortedlist = [] - self.count = 0 - - def flush(self): - if not len(self.tmp_files): - self.sortedlist.sort() - self.write_to_file(self.sortedlist, self.out) - return - - if len(self.sortedlist) > 0: - self.tmp_files.append(self.write_to_temp()) - self.sortedlist = [] - self.count = 0 - - open_files = [open(name, "rt", encoding="utf-8") for name in self.tmp_files] - - self.write_to_file(heapq.merge(*open_files), self.out) - - for out, name in zip(open_files, self.tmp_files): - out.close() - os.remove(name) - - def write_to_temp(self): - self.sortedlist.sort() - with NamedTemporaryFile(mode="wt", delete=False) as out: - self.write_to_file(self.sortedlist, out) - - return out.name - - def write_to_file(self, iter_, out): - lastline = None - for line in iter_: - if lastline != line: - out.write(line) - lastline = line - - out.flush() - - -# ============================================================================ -class CompressedWriter: - def __init__( - self, - index_out, - data_out, - num_lines=CDXJIndexer.DEFAULT_NUM_LINES, - data_out_name="", - digest_records=False, - ): - self.index_out = index_out - self.data_out = data_out - self.data_out_name = data_out_name - self.digest_records = digest_records - - self.block = [] - self.offset = 0 - self.prefix = "" - self.num_lines = num_lines - - def write_header(self): - meta = json.dumps({"format": "cdxj-gzip-1.0", "filename": self.data_out_name}) - - self.index_out.write("!meta 0 {0}\n".format(meta)) - - def write(self, line): - if not len(self.block): - self.prefix = line.split("{", 1)[0].strip() - if not self.offset: - self.write_header() - - self.block.append(line) - - if len(self.block) == self.num_lines: - self.flush() - - def get_index_json(self, length, digest): - data = {"offset": self.offset, "length": length} - if digest: - data["digest"] = digest - - return json.dumps(data) + "\n" - - def flush(self): - comp = zlib.compressobj(wbits=16 + zlib.MAX_WBITS) - compressed = comp.compress("".join(self.block).encode("utf-8")) - compressed += comp.flush() - - length = len(compressed) - digest = ( - "sha256:" + hashlib.sha256(compressed).hexdigest() - if self.digest_records - else None - ) - line = self.prefix + " " + self.get_index_json(length, digest) - self.index_out.write(line) - self.data_out.write(compressed) - self.offset += length - self.block = [] - - -# ============================================================================ -def main(args=None): - parser = ArgumentParser( - description="cdx_indexer", formatter_class=RawTextHelpFormatter - ) - - parser.add_argument("inputs", nargs="+") - parser.add_argument("-o", "--output") - - group = parser.add_mutually_exclusive_group() - group.add_argument("-9", "--cdx09", action="store_true") - - group.add_argument("-11", "--cdx11", action="store_true") - - group.add_argument("-f", "--fields") - group.add_argument("-rf", "--replace-fields") - - parser.add_argument("--records") - - parser.add_argument("--dir-root") - - parser.add_argument("-p", "--post-append", action="store_true") - - parser.add_argument("-s", "--sort", action="store_true") - - parser.add_argument("-c", "--compress") - - parser.add_argument( - "-l", "--lines", type=int, default=CDXJIndexer.DEFAULT_NUM_LINES - ) - - parser.add_argument("-d", "--digest-records", action="store_true") - - cmd = parser.parse_args(args=args) - - write_cdx_index(cmd.output, cmd.inputs, vars(cmd)) - - -def write_cdx_index(output, inputs, opts): - if opts.get("cdx11"): - cls = CDX11Indexer - elif opts.get("cdx09"): - cls = CDX09Indexer - else: - cls = CDXJIndexer - - opts.pop("output", "") - opts.pop("inputs", "") - - indexer = cls(output, inputs, **opts) - indexer.process_all() - return indexer +ALLOWED_EXT = (".arc", ".arc.gz", ".warc", ".warc.gz") # ================================================================= -def iter_file_or_dir(inputs, recursive=True): +def iter_file_or_dir(inputs: list[str]): for input_ in inputs: if not isinstance(input_, str) or not os.path.isdir(input_): yield input_ continue - for root, dirs, files in os.walk(input_): + for root, _, files in os.walk(input_): for filename in files: - if filename.endswith(CDXJIndexer.ALLOWED_EXT): + if filename.endswith(ALLOWED_EXT): full_path = os.path.join(root, filename) yield full_path - - -# ============================================================================ -if __name__ == "__main__": # pragma: no cover - main() diff --git a/src/warc2zim/cdxj_indexer/postquery.py b/src/warc2zim/cdxj_indexer/postquery.py index 0d93759d..2142b14d 100644 --- a/src/warc2zim/cdxj_indexer/postquery.py +++ b/src/warc2zim/cdxj_indexer/postquery.py @@ -1,21 +1,16 @@ import base64 import json import sys - -from urllib.parse import unquote_plus, urlencode from io import BytesIO +from urllib.parse import unquote_plus, urlencode from multipart import MultipartParser -from warcio.utils import to_native_str - -from cdxj_indexer.amf import amf_parse - MAX_QUERY_LENGTH = 4096 # ============================================================================ -def append_method_query_from_req_resp(req, resp): +def append_method_query_from_req_resp(req): len_ = req.http_headers.get_header("Content-Length") content_type = req.http_headers.get_header("Content-Type") stream = req.buffered_stream @@ -31,8 +26,8 @@ def append_method_query(method, content_type, len_, stream, url): # if method == 'GET': # return '', '' - if method == "POST" or method == "PUT": - query = query_extract(content_type, len_, stream, url) + if method in ("POST", "PUT"): + query = query_extract(content_type, len_, stream) else: query = "" @@ -49,7 +44,7 @@ def append_method_query(method, content_type, len_, stream, url): # ============================================================================ -def query_extract(mime, length, stream, url): +def query_extract(mime, length, stream): """ Extract a url-encoded form POST/PUT from stream content length, return None @@ -82,15 +77,11 @@ def query_extract(mime, length, stream, url): query = "" def handle_binary(query_data): - query = base64.b64encode(query_data) - query = to_native_str(query) - query = "__wb_post_data=" + query - return query + return f"__wb_post_data={ base64.b64encode(query_data).decode()}" if mime.startswith("application/x-www-form-urlencoded"): try: - query = to_native_str(query_data.decode("utf-8")) - query = unquote_plus(query) + query = unquote_plus(query_data.decode("utf-8")) except UnicodeDecodeError: query = handle_binary(query_data) @@ -104,6 +95,8 @@ def handle_binary(query_data): else: values = [] for part in parser: + if part is None: + continue values.append((part.name, part.value)) query = urlencode(values, True) @@ -111,13 +104,13 @@ def handle_binary(query_data): elif mime.startswith("application/json"): try: query = json_parse(query_data) - except Exception as e: + except Exception: if query_data: try: sys.stderr.write( "Error parsing: " + query_data.decode("utf-8") + "\n" ) - except: + except Exception: # noqa: S110 # nosec B110 pass query = "" @@ -125,11 +118,14 @@ def handle_binary(query_data): elif mime.startswith("text/plain"): try: query = json_parse(query_data) - except Exception as e: + except Exception: query = handle_binary(query_data) - elif mime.startswith("application/x-amf"): - query = amf_parse(query_data) + # Remove AMF parsing, we do not really need it in warc2zim and AMF library is not + # maintained at all + # elif mime.startswith("application/x-amf"): + # query = amf_parse(query_data) + else: query = handle_binary(query_data) diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py index 2ff6d2c3..8b84b34a 100644 --- a/src/warc2zim/converter.py +++ b/src/warc2zim/converter.py @@ -34,7 +34,6 @@ # from zimscraperlib import getLogger from bs4 import BeautifulSoup -from cdxj_indexer import buffering_record_iter, iter_file_or_dir from dateutil import parser from jinja2 import Environment, PackageLoader from warcio import ArchiveIterator @@ -51,6 +50,7 @@ from zimscraperlib.zim import metadata from zimscraperlib.zim.creator import Creator +from warc2zim.cdxj_indexer import buffering_record_iter, iter_file_or_dir from warc2zim.constants import logger from warc2zim.icon_finder import Icon, get_sorted_icons, icons_in_html from warc2zim.items import StaticArticle, StaticFile, WARCPayloadItem diff --git a/tests/cdxj_indexer/data/bad.arc b/tests/cdxj_indexer/data/bad.arc deleted file mode 100644 index 9de41600..00000000 --- a/tests/cdxj_indexer/data/bad.arc +++ /dev/null @@ -1,11 +0,0 @@ -filedesc://bad.arc.gz 127.0.0.1 20140301000000 text/plain -1 -1 0 Bad Capture -URL IP-address Archive-date Content-type Archive-length - -http://example.com/ 93.184.216.119 201404010000000000 text/html -1 - -http://example.com/ 127.0.0.1 20140102000000 text/plain 1 - - -http://example.com/ 93.184.216.119 201404010000000000 text/html abc - diff --git a/tests/cdxj_indexer/data/cc.warc.gz b/tests/cdxj_indexer/data/cc.warc.gz deleted file mode 100644 index 6e63fac27b576a7593896e0315f5956f1c999d07..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6022 zcmV;17kTI(iwFP!000000Bw+6YuhjshVKpj54vo-9P1-d?Wj#5OG_KrMF=i|-YJso z7;MRr6_)MS&u$X7g(46~uMS6#p3Qu{z%(IowCO9zf3-#+Z}tb%g>&{SL!U81Qy?iY zmJ=d45tM$M8`efSuVMKt(Br{daX3_3WEtBMPB}~o=P=RKfFwqPNhKpZ8#6uCPjR$x zHW(Y=hp}bY4bP;1!xN2c+qzORRLl1#_*v={kWZnItSr7)=PSvW9YRt_2^%Z$07N4yiS|?VLv0u zbaaRBKC5i_&anOfohJF{C;|WgABzY800000007NAYg5}uwx5Tp{D($U8|;c(Z$H7L z2#`s3lfVL5W+q8dEvan_WJx2nFy4g!eowa~Tb2#B!I`~Pld;t5KJPxSmbg4PJ5)4T zeX(^3SNLKa(hiDf90q<&v%w?6U{F`JCRUqR*DqAHQ&&5h_GdQXjM_ou;P*!zw3kGF zCrMnVW7=w)SXLX?4adS}Q!{bfrVduAMRnEE8+N_7|6=Pf@EP?P{y_ac8*ns@R((30 z5=6q#b8W)hz*h#0g}Z1N_Zd9umPoxMk?s1uphQcf*K@3%-Nag51zL=D9TUfC;wFK6 z%g~HwOWn^v0A@RhB7r<80%^5$ORZ~l+*a##Y&sgnLdM;)X^(n)j&)#0PY$BKl1)7&V=o{MK63js zW*rm{h}Jp#{m=IwEzvbvnosiplwMKrU|21-5g- zQVlVB$3Qhh@5R=Ot#=m}r(jeXC{slz|9Y`?PNQ2Ib&%}^iPQ5)M6us@{X5X}TYPr# zM+X@#U2im74btivP0gy~ld~gX1wOU8+QnL~=F}L^39BJpNB>MbVVEd4N%Y&t3jy_z z*aM2s8DR5@aN`1+t1G+bfbP3a zL)YsqwQXwDu^XD^7%gfO%cg3JQj^*a7#0|=36FLWy$fAXefK=^IguEM#9C{H#9b5; zHs~t5$O(qT^}BLO{k!NL*u{zOjbCj2heYEJI-mZ{T{4}-w`mWs#&SOh`W_{r8*{}eb{xO%k)i92 zyBC1h^W0?k>XKSgiZ5nkkH!N^S&0tNqebkS5+P+3-3`4cwB>LR&Mp@^vlE6~5QX@k zz68=J#F^9-HlRZq7iTE$u&?yUEe9lret!np&Xk881RG*!;D#e``Y{#{Fi1e(Ns;e7 z3Aq3xb3uZq=$dj<26)27?+6owCp&0J9hY=>i0ADn4^p=YB9@|NF7xRAp;(I!`Qkk# z9V?RM_%RNF9LnrLPS|1-D)F;;VJwe06SH$A;xQWe;1;EnXsH-2qd0yA_5+VNab2lM zF?t!1zmi}Vy`|JyVmUpSqDY%Y%m+ag@hTM9+1vstu^qV~%a99IU6Wgqj)L|6^2;wH z*LQ-Ed^HN`Ft~QXw=>vtG3ui4l0{?w7n`Lf#kc(@frE7d82lcm@ZgHy2T5d8sqqQI^^bba}Hy2T9X{(hIsX}bsUp6jhbYF5pze$Vfb zQz6ywwXFx7}M$ep&e z-FW1J8CPy(8;FP?S!5Nprq-_I)K_221M2n%tXjK~xf0(1pdTia5j&5nX&73qQCE$2tpzz@jdwi!y;_+h-e9s1N_kY+^+5?HcE3F zwY!#VM-=wTu}8VLs7RSN04d0EWOqr;mL2e`QP-9i`;D3;*9Kj6 zcd+MX#=Sdu^{P6B^2Pl!`1Y;qRQPWZygY^1T{!ykWf@8>r4|&#Z(##1TsNg|G2H3% zc??2>&jYz#n8C>fmXA(8o`GJ+-E9r-B7S)YF`j2--HA^GPg9r(VqJ*o$B#;XcFhd9 zO7$L03(C&pTQG|cZVX8Xjb2yg$0`ya%d+qfP+~!71nb(?sx`7;E-%8e=haRHW-tbI zj+j)D0zZ3P<+x{`20r}<2p3FwsWrfen?qKs*CbO#5>U|{*~vh?cLK&NeYWB9 z>^V|PYac8Mp;@6VE3Z!=^1eDJe%vutwbp`vP55WPKgbWCK@lUCM?>DQ!;l`3#AEA; zEc6oC#<4PmBQ%=9n%MxF++0d%>a`vqEOkzJk$vpTVm<~k*}xlPK`R0wddbko#&F_hBKCqO}-;+6`@oy==wFc3@@B`mC%^DKoaBp&-V%wbVN3&Y@gV6c+8h8@%` z;PrykpyV(LSOC5hy+#us635;S>Y&_SN2AeB6}>_`SQDa{s5(ydTBxc{vN6nK1%Yj_{I&f*p+dF zq3j#y`|ejxm9&2j&k+l9uePTUSPg{a>r^`h9+4lL}JYaBQAO!kCui`>PdWdPjKu zigSLlnOGd_#3At~a?7N(e<6UPDQjRjFt0S7AC6#vDD7|Pv6Ax3kHED3sX8;dt%&%` zJg8JUh9CeOU;${5d8Qu7B&A2CGF4zYgi2HLdy#(SH}p=MOaPV=PvC*woZv8Wp|Twp z)tIk>ADHa9)5$JuD%do2tudKD8AsC@+&*v^#OzAAz3GF%x~4W0JF+l3AaPnLg5N+T zJjjWH5XyG2Uyym0vb=Ho=+w1Y5|xhz%ObbjJ`U5F%7o+`5NlA@zv4Oxv~ z5gt+O1YosO2&v6_?n9ikHqA+Ej*~xv1Ra7h)@NrNBp7nQ0#8#|%NT2Knz43~v9|~R zj@RI(7f~oKdqhCY-x%<5fguBDNdJwNfZDqv2%UXcCOFZ z`EeT6&}kI#`O9p6!cGk z-A;c~{?itbv|m8c(zRxWV;v{8Q7t~=q|}8@>c9#TwnW2(&so`}&O z^jLf>F_FZk>_NE0L*j!#G!ZT1#Nxd!i@r{}Ds5|8&$cw(>9y)=yGLxzXplNdo2I~H zt5I*)O@lzU>S^uW1O1)XKZbrZWKQHr}xq;j10-VI650bB5#)Y|QF3g$yjYh*5 z1w9~SnISSh6$d?FfSY4+ zDz&lFc4pze)M?5ZoEW?o5$}!@*H0w|7n4~r@PgBk%#rbH-caRlU8HBP5*JF^%q_J! zUH%EdL#@w6bFZ#=hu%so3qU8aw9id+%Gd1FkVHWFB}G;OfdLU8^ahYQbhtr~^7<&E zK%5Ji{+L_KUJMKU{{DzsC_TdEWdT3-6vK$|wuB99EjpP?Y@5b0P2r2C{eR)c8pHKC z@abf+{4k*fp5{g{Z;VL$e+0?njH1GIw&oKBLUJ<4ltf}j^ddSX3UN+_VTzMt7N7z3 z!pWlag(iM7j7CA^Igc_UUS90EchY|0;r^aG?2GeRX@$4udCVhG6pX~Hz$Gq4Kon1> z7>Yl$&2i>JMLO-Cq$vh(?(c!6~kCYKdT=Cae)2@6`;vi=XEysJSLTb57@hP!gx#ON#u($uhH?D;_ENn^ZR7nafjV@|S_We<1P$9$H0+ zbz$qHeh~0CtzZMxkT?l^y{Ev`t-UaZ0vmy&koHf-2PY^c0iZZ!TtjgHfmHoEsO9f+t;-ygW&)$Ipf~ z-8eHE>rwx-lU!rd-2V;AYlW+06Qyo7rru z|GUl1kuO!t^z?XgAiU-_HrpF+;B$bh)BVKbT(iC5BNPsBb*i6yYHPMPd}zf1u1@z; z&S={k?o)DrPqmM=j$5DaZj1-rjj@WmG1kr-r( z1zUQ`39Yf|w?&Nhs)Rmtj=4~tn?3^9R`4d}_2lpLHPUj zpSeN+njER-r=;s{Ky%Zl_2vq}eyxMfy18i|VXojK);@waTPwU>xYl`W_CDzU_rq6n z(=Nwc!R0*fq}AN?n{wtVnfana)sx2#Raf$tSe>0#rR-(ke%5R>4f8`IVCtt^3mp_i5 ze8Z{!Ey$-Ix#XC2Rnt{nZTv0hC*LYT#sL19e_vFf!%#J?{&PN}6@L%OqR!mkL7G9Q z@bAO;ulRd){BPL(A1KwN~GpFc1Lm2LFS9U}HV($QC6~ zl1rey1eZW>tCwSlEu+YU{(Y6!rKRjeJBXP*KcDs`-H<#xrx$b|vz5hYBN~jj!UM(x z8KDA*0LD5(WKATc@9T$?)jk+F9%S*fgh?*TV5ByTa@Q!JRtczZ+d!l03LI@U5ooy) z)h^HWeu{RA@M5PvULKiW_I(Y`^L5Y|2S1f}3>w^EujaP4&xs6^^Z)8w=Ztb%0Ii9F zinleOF#-gpU2)1qLw2ho<@6H5SJD5KRGPG?R0}mXNg`+kt+ob;RhALOwz_sdk`6;} zBfOoCvbe-J-(*>8KaR;A@P6I+(3g3ZXU;}_>4wq1-ef^Jd$7lR0_)~Kj)?&P0RQZ= A4FCWD diff --git a/tests/cdxj_indexer/data/example.arc b/tests/cdxj_indexer/data/example.arc deleted file mode 100644 index 2024e9c2..00000000 --- a/tests/cdxj_indexer/data/example.arc +++ /dev/null @@ -1,69 +0,0 @@ -filedesc://live-web-example.arc.gz 127.0.0.1 20140216050221 text/plain 75 -1 0 LiveWeb Capture -URL IP-address Archive-date Content-type Archive-length - -http://example.com/ 93.184.216.119 20140216050221 text/html 1591 -HTTP/1.1 200 OK -Accept-Ranges: bytes -Cache-Control: max-age=604800 -Content-Type: text/html -Date: Sun, 16 Feb 2014 05:02:20 GMT -Etag: "359670651" -Expires: Sun, 23 Feb 2014 05:02:20 GMT -Last-Modified: Fri, 09 Aug 2013 23:54:35 GMT -Server: ECS (sjc/4FCE) -X-Cache: HIT -x-ec-custom-error: 1 -Content-Length: 1270 - - - - - Example Domain - - - - - - - - -
-

Example Domain

-

This domain is established to be used for illustrative examples in documents. You may use this - domain in examples without prior coordination or asking for permission.

-

More information...

-
- - - diff --git a/tests/cdxj_indexer/data/example.warc.gz b/tests/cdxj_indexer/data/example.warc.gz deleted file mode 100644 index 4cfe512e2e35593e60353376a5a0572bab90f3d1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3829 zcmaLXhd&gK!v}D)va+*B*^w<=WJg_eHp$2qXB&+E?gW#_a(Hq9=0=twcdlo2u-PG)mb!)O{@_wKm z56j7wTeGTS>MO@7;QW@;h4e3X=3zhf?ww&%FOy1LZzToI(0kMz(0(M!vDW)U%EZFd zh3rl4o!~a}X(y`n=4(UTg?hWg0PIhT+$mM_CSPc!9Y}YFsWmF{s9rHxWwd z+Ymq4Rc)9(wBnPLU})^UX1YtCu>)=cKz--@MIG~tT-dc;)TTiNT|t&?RJJbT)RYyJ zlXm`SPSp1?gpvSPk1l&{-JRjHAx76Z+)XbR3u42`?P<+P;U9u;l{xqEB3u3T0^ylK zpE!t2r?&C0mHZpcvBMs=GxBnN#SCWWVcv_eOnOgU%=2zubdUNwv$d*)(cNIU(CG!i zBCY!@uRl2K;{!fFWZZyAv!8CQ#HBb#4o>8IQ| z=?F5HW1WzIfa_lc*XoYGniHo9kUiykd)qDH-auy+33jtD2<+*8bHxGyJn`L$jJSMBx6>*qEBuYFevo(l*J zMl8GyWiM}^7K3q{X*v?G?Z2%~b~P)_KZ|updbPXA#(wQxc*n4c@JvoPtH_N5cI{)6 zD*`sM#5FrphRYai0dJM7%{z^?ZUM;R^wd?q6dEX&T9^5=qS038DDX|d0k2pph4B%p zrz3eGD{r)>6*oU)!ApjZD0h=6_excF(_Y{ptD5A{^8N&1FI7|gbqqX|1Y{&$Eq?tj z2?$U+4W@ddl;tYB(oOW|T;6)lAbMkF(gl(_Ij<<4Z_w+Ng$=f{P2bzeQ)r03(9j=* z<*t_^fOJ%k=P=VGU?D~>8nw*gvNC-Dyw1pl);>V|NJh)-kw(H@z&Id^PV<3*#s^ho z3iQp(^K^XE$;Hsw&e>37IHW~22ALgS2$H+~D7}`5Qvzqm;Iiy1%Xf=d%zOA{n1+I5$C|bQJCDpalcmMp(TaPmi zqdMvo!$2&j42mRC6Q}HruSN0-y&xkSw4!6sKn`>g8vxW@D{tt_lk0TpqYUn7ge3^c z(e60Nzxe9MJL^iy{;VSdhAwnzF-SrtIaPa<>4=VF3n@|&Rb=5F;YICmL|%ME8ej(N zKAI8?@wD^xgNXXR`KF~)jz-5sNZQ_7NtZj=H z$gN=X8e*ysRE3wtc=(ym+MQOxovcMp@i*vJSzv(Mq)+!4!fS&OrCySv(HJMBm2)G8 zDsqLAcupa=ATta^Zn3#cD3I-hz7|jO&RbrvA?El_4Wn(mU_w;xRZ1VRnAp0oXL}6H zUcTMo^{E~&Bb_49a2X;Rs!KOHvh})iPrc5)9DO4au5j=D<94LeCrUn~2}H{{40Q5& z+eK?kj;re`?&C90gqN!=V06Zp(I)n$Vdkq7w-?IBu0;670_g2^Vkm1T8&e=StG+!}vFCGe2b3*PXA* zUvoO#A8z8yWLOkK%Lxh8zCn`FX47}^qS*nk z@2Ni)eAwki-y_Osn50?tGtk+LQB!HG8w`rQe_eQ@7KhE7ftXe7rf#>%A(57iG)ZYK zi8|vMAxj7E9aZ4@85jfVJmKwJGZ|;O$Na>#r!6$ zg<_mj4daHcMx;J~S_bRyR@$2Tnd7;eHajCfMR>x%_M0WIZq36-UwHRf=DLuVAM02= zKQaf7ZaL-&i5rYnL;B~^4nJGVVkqV!R}zCs#Vq_FJY{37%@{MtS!x!(d0gGmBZ2XU6(V)o8CYLT$Usa)5=s1y2=%+QJSnb_9!;!|7uPyY6)2Z4xy zh5|u3)M5MjmYSGL5gy`)7brNoi(h0Yk{c*2bzm5>Myi6PfWcwb^uP6>9 z3Lk8cOx%;c=FMgOy!PQIwvR@?EN?^meceei5&3Mt{k zg^;|Ia_uxbL_W*qc{4nUnKvlwCfUprQ<|<2j=?AfFSOLMd+{@*-^~3o0>lPqu>I#R zHIXg#?A4#_m}K!xRn7L&#Hgw${7lem`G{n29skLG@AMc>h8P9__M%}kK?_nQ=# z4OHwXk)Fi&K!EYH9yg7$*8Tf&tS3n(nY@K_deL%fQ=D=+S9|VKl|4(lzT%d#w=f#h z2>+y#Zj6e5GUWKY0)o>F@={ED8qp#kws9?PG?^nJlnE0zr;7qr)S1;!N!ez1xE$BS zIk!8X(e^+~2ACQ8MF*Fo(8Zd)`<{g=LHkgr4QzU0dgOwu_Df=-FArn&R`;(P$HCo)q3XQ`Ed~iE$u%!eZvF=Gr@%fsR?2a&kKu^bE+>=4!GC?*sLi$S{0SD|(2UjsT_!NFKJVf; zRI*fCO$W==(0NZEol#q0;Z*h>YVB?BLcz{eo%7`O|d%@gSH}NU-2B^39w9whM zVM$-!K8#S&xeYO=l&}xf+RqjTE+FNhw5lBiJ7H({Vxm+l^KV!e0h z8?%q;^*%aeKL{f`%x5fpK-K-Y3uqtrR7O1M%Ji!+zJjrlBs@RYS*V6(wpM-fvn086 zYNwMMo&c&{G&qW^)CG>BjL}r|$a%WOaJAX0GSd_ctK0Nf6)W1dALu)&q)dE-tJD^= zMsRHDq&Twk1JB~7ZZ$gFsnrg(DricJ%DYZa*+iM3q`dBvmd;T#T3_O_`Q7YJw||Ig zJjnZiY)I+$2mUlLOh#r|tl5hBSeJRxr!0}8I=Ha$l}5(t^aayZ#C=wNbT^L!DItCL zV9o9K$9B=jA)5*90v3BQ7PW!GMCd8uy@>J74q{n9XEW#GSQ4+};iBnDMFn30tNX+F zr?ToxPWq^3aCk^_BA8-$_!&J@Jf!M>9qIo_6@>gq^M_Q{|5hs5O~f>6rc7(_=h^>A zMfqE*KxyuE2R?|NVPAR(wdg#-w|lc_0Geh!lS=LtZU+8xwyJb-3X3%O!hKZm{e5;B zoK!-4dbCul!%9Jly5O2YaRHd(`e)?MJ~jm?@~$EC+miA}Y$+WPq-gj|R!nEgrpH2X zr}Y0zH8?XN|F=}pe@W%@k5t|_jW}_?rFzcATM2j!YD9jJzcj@*NU2AWe@g|!c%Te^OC@XUX`8zgfNzL% z?)^uqoTE&%W0c8=%z9ok2_tLavkB#UWSvyg>?W`*?vw*Mcs5DSiFG(4o9s$!44$6# zGAD`5?y;zEqoO7cqh^!=q2gP|P}EV(zSKhv#otoxN$PyU$%ihTuG=bPJlm0m^H+qp zhO`{$Pb`pUh2qX4?553UWdQ$KD)axAD(x?+CbM11f{~udQWiG6f_N=*s!?_?Paz4c4jnZH}DLk!|~|++`#eC2&T27Yh>Mt*O9lSP#_9?rd^e4I%XKv zy+rzt>@!|hQqSexsB&2wJ$6jngcwY0{i`Vlff-j%RTCl`Kmss4BE+!6kdqF8zXxtn zJZ(FsA&crJCAtN5M63bW^T$o{0?evXbc$)+l_R!rgM)WkB#9lRbVt*5I1i$D6-CSW zazp&x@EeF+wgvEKOnd9wn)Q8EFv2sQ;Naj0#IAv3%WpR3G;nC@?@<< zt-waqrMfch)zkDQ^Y8y@uuset4z2BWaWFr_Q*6iSI&qWGi|JvSe2hnb^9!hHf5~zI z001A02mk;80006Lm5|F$!!Qs<_elIheqa*6lGGMLY7-&KqoNOJ)1B=&X`qcuYA?#y z6GSSpMY9==?mcsL*W*PN;SgP4t@{mD=c5rpXFk0-xywyu$s5#U) z{IXujwy~kMhp5rEjXdQfC5XZ(4d0LT0LZ25p#ZU@N1Xxo)A&Hn;Sl_08O3W{(A z3=;z*t|<^DHReolLRb&z1Gt~>Po}NiP6U@aUgUXiWeIsx_kD=qw3B`GDI@oEnapN) obMm}=+oYTAV=*q4*(RP(iig2F*<1bbU+3?914vqWPL%-w012%d6aWAK diff --git a/tests/cdxj_indexer/data/post-test-more.warc b/tests/cdxj_indexer/data/post-test-more.warc deleted file mode 100644 index 34068952f00851d8536c0dd1c570fee1e690282d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3426 zcmY+{_dgU40|#)Od6X^WN@OOS$X*$7hj3?iNN0;9=j=Vg+53!SmQ!|Ci0q6cqq}p7 z>`ge7Jm2S+=kLB*&l%iyTZ#A%#+7LKHm!zP|ibva@(76aqUHIf1>x>1qFVT*YLa@ zmRA%O|H%tt(b}t8kb9ah$dvBJ^CAJ1N5`X0f;^2U3q|Ij^83&kF^=o2MYmU!*!IGd znR{UIcxFG_(99_Jq}8B)_xcLvx->dNN5S}z_^59TT7!&|zugS-e)tW0)6wfbbZ3hS zN=Tl-uCVm(FYUDR6LNhnoc9!q)d)4d>M5T)stL>TlsAs+sTAC5-o z;n$Mma()GbD;&@%{3P`1wtRIlW4+y6)?;pdRs1S#nJ1IzWAjxltykf3%_FUXa@-@V z0WZkgXq7U>7el6e4JtHV6|anH8{=+Jb4I)#CUy1HAorG)|JtAK-YetBDm9bo#35V~ zG#U(5H>I#H>rWq?$Sw#B@!1R&!N?3vnvxn&IBeD=!N=m`>pKMLfDvp*8l&Dt_wPc- z26W_zGEJ0%j^8Qq(o0c_Ek$eZ40Cx(rWY`RtZh+}O3Y}gQx&`G^aRo>_=ZPx-Uc(0 z^)js*raxvu(-fbCa^g;^WyaqDjx%VmT2xlc|JK(aN_dre>%()nn@wZST#iVD}<}&zqR(BbdP6xsZFV|MSS}BHvbgoQZzE;cjF`17(oIw;nNl^7MZkZB zjOmc-53kwnj9zRfQ+SQ$*)FW zHL!_*&O&h0Y`YlFX?k9aL$V51orZcNFdJtK6wX6~fm$k$J)bM>9M6GQ2N4G}ER9oQ zj<4r@3)E8ACO%vo$OZChs2lbUe!*O-_G>{M(-`tUtBQ^CxmKJQ43dzLf$D1|GKEe8qp6pp&JLM|+ zX*pkmx`XllLFrIq-lkz9oY@vF2KRgsgzB-UbG`}dbQxMAJ(c78^~yZ{VPJ_5&-mEF zmNW8>hp=U&Jl`fe=guo*X%LC`oU!+DB@Fud* zKq|AAzs4YL*zWz#TXydE4sUXWoLhFnkM!v*99)KFc_7GOzJtx$YmxF$2VGU=+2EDU>bJBlS(mY8fYc4C~?Il)q^U zcex$qYyIs7-t0Z+s}6PP4eYJtnO$XR$$>^pe~hXoy$=2og7mJ09F3{-25j9*>Oj$V zXQ^@~33;cvnCV~ci&Jb3S7@^vUzzO{W;{QgHVOx`to~eB-d>yJvr9gEJ{Q)r$d8h( zbt&5SP89LCA1tdroucfP63R6aD$m6dr7AOXe8!;=gTS;nGxPL8os~Cp@Dr+L!OYv} zV?mXnqFp0$z1@a+#w z7G0c0Rxbj7-9LD$9Tz4W}K1dD(!Dt zQPt1S%ompU_kAC7Oe>4Vc55EwyXP3wQB@LKz%l{EB7moe4!cvnu>`zM(sq5~lhBzh zE}w2HKD&*C`wVomo0z4A4YbsbUJ(W^2S?n^)=ssAkQd8C4-jShAF+3xA$HEng=#C) z;`**OV%vN_9$mRNzTL^RLAy3lEa4h~^y|O2x6x4WE%_xV4u>yen~t6nn=JhMW?DG0 z*`sZ!fp1Upa!aZ(!kZn=Bdm-Z$W!%wSu9ARu*Qnls{kQ0>X2ZJbfs5^1&X}@|M7nL z`O{=A8_K{W-=sO;`dDW$C1dosdNxn0p)ke6FF41;pLw$?&`0R3#hxICiBpfAUz+@j zK>SOpP}4`K0x)No-lB$8;F}fidXDb*`Vd!Y+v$lcA%pp(dZ33p9GuyJzX)X&HkU9D zu7B$r(nTT|=E#ol5-aQ1?Z9egg%SB0EgfH85i?}XSHaA{lbM-}CQls40+KDDP}@Ep z^kjDbu}Fu^sG=`G;I6{pX{xN)Tc(-?cV3RVukG&fPDlP0pGxpd)i4x7VEXz8kN5IT zp|12*9BFKV3>t;%LWgyha?c&C(TqS>%#?se%lb1lLf@C5#sm(3?AL%K)b2iLtOhlZ zyWobY6!O^2H@E^n`feDZ$PO~ibI8B1ss(_>u=W`yA(l=fyT21g(Y4=x>YdsDVvw#O zFL`y@h+1M#j@?zR^q$H=kxOcGl163789gGhqX~a3T-I)Okk@OuH^tfR5gN|)SJk`M zsl?w_u*T(DW%`{Rxc4ZNj$1w&>uY!m)|ufSnLmJExL3$|9dK#-{%%%`=-R-u=TaLK zHHRe2Yj)7e1K3p>&hU{9rlGcP{sxK^U~8RJN|2CC=M%sUrP`b1-0%}oNIa!QA z3gDI7NMK!^ZtcyCvF)6FYU zF=KkD910W9xOrg*v&(-u0x85%Zmf=(w0sGgdM7#QMc&{@DVAVJr?U1DaVSP;f1>Tr z^oduWjf(ZNok%;oG;t`B-k-(38JJ4)RTq_FAKlN;?W_{`+QSq=E*K@~KP zTyv_DkWXbq)wyd1{x(+5l-4CV*4S0Tx5B{X!*k5z(THVtj?x$WO^>!`%t6ve zuRXU@UiLSUiGfAFn8c@1rOK#~vS=h)DfPnt+<%x*oE!LXLWsAVj;mBJf>G3Izg*=# z-9C+=PT&}0Cw%4|wFG!1b&|8Z?}6bSyPnf<_+qTsftEz!pE7lJ&7)~u*e6zT(`cVl zqSos&ilmA_P^gNoNO=Mj{VhF&X}!`pmB;Fn_(Wx0ucIz-SBaOKKAcOs!9Mba)K5n15!D%`KRol6>2h~u5hf*%5zOxcCbiS!RBo|7p7Ufk= zr_*+f!cR96z2j*Aygnjj*bY8JU_m|pio`DPn^<@^hdYnQ_%ts0d{kI-j!I=oAkuCn zpifwB{`%#+MG&>xGOlA;Fw?A=MeedxPXR6OyDkrc+TaGuo#%(g9B*|4y-!;BYrY0~ z;9}1K1@K`iM*PC9LCd&y5i&W6 z^YWe?`M$2WwsnEeIy;MKYM~P%gP{W=7b*Yy-whUf%>r8C+Id1(_ zI)ObHT(aN;K8&kDB0eHQKyg7#4~E+!`Yx|bR7l18j@+7Xz(x?8;V1oz>W1GKo}Ctl zU1U7au=@z_!Z!AvH7Nyp1t5OAot>Zfa*G(ZSLQA2s1UHE<|<&$yC0b>#XTPO(Co7_ zuRooS2Lr?AAnRoN(58+asArc9G57I+EFvbt3a{ z#vz@^ii8W_KHqmgzJGeZ{)N};@qE0lKc=TIxSeBh@jTeo2_+>5mQ?^lWWm1!RfH*k zq4ElfQnE@gC75)uBg$FYJyhD)&(()2+~T1r5zKnVFMj@Z!yuNO+^y)0C`rkwZ^2@j zdtglkZqzDxrZ~Yf$~OhmCms3t@cjWNQ~!Q+VR%%@VC1#cP?PvFpZ2&Z6VGY z2u!8r3a@5L-Bd0|uMwj-^dt?<9kdAk4BtxM#Kb~-x7%ADr&C%lg9WlvJcaRb0|{3d z+1^z>?p|E5y^9d3i++4e9}|dNn^BmOfwv1L_YUjXv<{2`4#l%+a=5HPVDDR))UnNt zxoQ%}Dw23q#TUdWb#uY8(+h1}z5!7jG)UN1U=R#Dw*s z44V%!s^{)2P${EgUFl+;5_F#H!X<=0n7B*UkEd)ESe2s9O2d^h&1MpTwy*9WGu6w5 z&F3@nryqW#chx)IuC^ZHAw`8W1}u6mbEnZ5hm;Nv?kH$bWB-;L#4{f)X1(jp18K}0 z8kz;+&+%0=zC-h?=L|j&o+{dz7GJ@bw%zHZgTMc&+^aghvd4XZw16%I|cD>gT?YrHHN8t66 z#6IS`G^woyxH*BZeVaFl61uox2?UAzYBOL7;ywB>L6QkA`Q!Ri;+XwZ=C2D>ROjcH zsMI}LVV=}fRM-F8o?SBT`%Zv?AC`pIRF0Aol8DgrZ_~ntUMfj%8SUsLcE3PV*`*%G zY*xjXrm?p=Uom|d-Y^1tz53p`hv6N~VRuZY)?QdTfcw}BYY>LPmw&2=?v&Em(VxKi zdrRijL&arnqCtAp45C0w53Dxtsz|+1vQ>1C29tMtU@03j`x|QQdk(|9<3_)_UZGx; z#rYod={#w@#1#D%M8uRlVWd6HbbS#fdrdfuDGpOqDaLjg-yF|4!I}wOZtPW{8FA+S zJJU1CA2o>EZb8RBtmPHf>5otFezuMJzMGN$W2enT)+p^Ehf!qiGbyl_u;?`fPbKu+80lOH zQ4;Ctk4ulCnPswT?RGLe^+;%|$_ylLbPaQNFxH3rs=_bbgI$ilr(vhb$yTJ^I>x?I zD|bWwbVfOrFEXA-6qoA|{^W2TVzVoge9E%VZBofTIj0&GkQ*@Qsa?}L0{2<8?l0Oy z-1d$9)9s8-rHIwVf4Vg={uircWff(i3J{2_{J&d8MEtfY;^*x%tt71`)hFZ10*g?}c0+o;w$}X8zbls;Qha;(q^fB;Rv>tM8(tp@J2X4!elvUe22d3!Ee7VPIjnHh| za$M2}HLAs4v$Pt74|q;c<9~BylP%HC81&0bEc<0)*QU=vvHS<9sxiB{B@oLLzb;T1 z^g?TfhcdDM7368Ef1mEf-gjo zoRRsAu$_h&jBmA6tN9?PDSd>%Pf@?I4BHK1Rjg|Q@#|@6A@FxhZIsssZoa2%)7!M& z)bo}z^OMnKw#iXvw4gku(X<$C^hWG{52Xiuu5-EuDe!`t$~XBI*RH(PJZ&&1g2(b} z9Q)t+E(O2xYFwIhSnXy_d0aZr^!&q+q>hRoa}Z|J3N7aR!6J_<;Jf5UmJ6KwxAFTq z4ut&W-aLu(#x0K~iAf6nLA=9liZePIzjh^G-_g$H9Z>M7xU$34O8wFyH8BagcZr}z zt79o;QpoprW$;q^;kKiDr?{S@>RK|}rijUdu#&B9uC^0&)&7vr&~_5$oQ-l1qxN;HW17i1Nn`T4 z+u&w}x%3ZL3ZIK^q5g37A13XRVdq-_=8D_NIAZD9&}&t47&cf+b3_bNB( z1owqk0puf`xsC(;MgLWW8O3S$9X?HY8iS@}PU zu#%o?4H|Nv*keX;-lCCFWz_&-JxadgNnu$h>5EssnzYi`>HCE?;(}Q6_{X`(E>oXy zC)++7)0Y-Hbvwj(`GocU8)OH!xh~c|=#kI%?ItH;*WfvX@L53Eswyl)0N{O{V0Xynh=&3s^IA zK>!=&O$4wZv2+pf>d1BnWVWui7VOnL^Iqw)Oyg>+_Iyiq%)>MVL!4uRda%Y0^-0&A z8i+Y~4=j2?FlBBykW6V*O)Kf082qaQY2!BBeBhozZ}Fumd$9CZGq$;#bUfk=?kfTz zlCA>ADx6PVdZ@fKvbF!XZe$(}3jVNr^Hx^+(U3xwku4;TLf>d>R+|D$BQRJYkqm2G31Axnu{Fi7Y8VA23t3mx zNYSl$8RfHAG^1nqerT>PM&@pJ>!)^}#}YObr&~!9rdnjGv5d+tQ;53GS*(g|3lG1b z`-5RTlBoPidTPUKd<@3h=y9R_5Xhrn6>Z()K@o$2wL<^ZwaQ?rrkrf$OuTwN ziA7=x;I>m%qVy+Ge_JXNU!cuS8j2m6?{gW23%olRYz(jZT8#@RTHr!nq}c*&)% z{$HRQB*N+)alqbZQ}w4isO^T}q*dv}hYLKhxM$rChDl^|K81GltEeBmLQ$Q=p7-|)i^ZQ@A4GJ>Srw>rO8qeZxX_oy7 z{kqe!YP!QcRbDM*LaMB$(4P4uLfo}5CjZ8j**$adF^|GQLI;Xdm!7}B{W`bZqrts$ z_e3>#IO=PdYV>ZM*A)CLaiT)bp8J}o->f4J?(;LajS_)6%EjDoxN^cuo$$l%M3Aw^ zd-|no?vJEHEbt(L==bT#Hx$PS1sI_lx<^xd_eO>YD|C0qkzRo|p{6DfDjRwX`7cvpA(UpMjxY6i{RZJ&cwO3ep zm6J7LHEVM3dy}jQWH-7&l8u5*mu#W2>mRuilxn Re*NcMv23Ip-21K7{{a{yq$mIY diff --git a/tests/cdxj_indexer/test_indexer.py b/tests/cdxj_indexer/test_indexer.py deleted file mode 100644 index 488ee306..00000000 --- a/tests/cdxj_indexer/test_indexer.py +++ /dev/null @@ -1,373 +0,0 @@ -import os -import sys -import tempfile -from io import BytesIO - -try: - from StringIO import StringIO -except ImportError: # pragma: no cover - from io import StringIO - -import pkg_resources -from cdxj_indexer.main import CDXJIndexer, main, write_cdx_index - -TEST_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data") - - -# ============================================================================ -class TestIndexing(object): - def index_file(self, filename, **opts): - output = StringIO() - write_cdx_index(output, os.path.join(TEST_DIR, filename), opts) - return output.getvalue() - - def index_all(self, filenames, **opts): - output = StringIO() - # paths = [os.path.join(TEST_DIR, filename) for filename in os.listdir(TEST_DIR)] - paths = [os.path.join(TEST_DIR, filename) for filename in filenames] - write_cdx_index(output, paths, opts) - return output.getvalue() - - def index_file_cli(self, filename, capsys, extra_params=None): - params = [os.path.join(TEST_DIR, filename)] - if extra_params: - params += extra_params - - res = main(params) - - return capsys.readouterr().out - - def test_warc_cdxj(self): - res = self.index_file("example.warc.gz") - exp = """\ -com,example)/ 20170306040206 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "sha1:G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK", "length": "1242", "offset": "784", "filename": "example.warc.gz"} -com,example)/ 20170306040348 {"url": "http://example.com/", "mime": "warc/revisit", "status": "200", "digest": "sha1:G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK", "length": "585", "offset": "2635", "filename": "example.warc.gz"} -""" - assert res == exp - - def test_warc_cdxj_cli_main(self, capsys): - res = self.index_file_cli("example.warc.gz", capsys) - exp = """\ -com,example)/ 20170306040206 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "sha1:G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK", "length": "1242", "offset": "784", "filename": "example.warc.gz"} -com,example)/ 20170306040348 {"url": "http://example.com/", "mime": "warc/revisit", "status": "200", "digest": "sha1:G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK", "length": "585", "offset": "2635", "filename": "example.warc.gz"} -""" - assert res == exp - - def test_warc_cdxj_with_record_digests(self): - res = self.index_file("example.warc.gz", digest_records=True, post_append=True) - exp = """\ -com,example)/ 20170306040206 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "sha1:G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK", "length": "1242", "offset": "784", "filename": "example.warc.gz", "recordDigest": "sha256:1ebd93871e6de75fb72d5398452e4152cf3d655ae31368e7336c1b57870d244c"} -com,example)/ 20170306040348 {"url": "http://example.com/", "mime": "warc/revisit", "status": "200", "digest": "sha1:G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK", "length": "585", "offset": "2635", "filename": "example.warc.gz", "recordDigest": "sha256:f67dfffc2d78d025030574b08ad57211d2d2211ae49d566adf7868072a823f74"} -""" - assert res == exp - - def test_warc_cdxj_cli_main_with_record_digests(self, capsys): - res = self.index_file_cli("example.warc.gz", capsys, ["-d", "-p"]) - exp = """\ -com,example)/ 20170306040206 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "sha1:G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK", "length": "1242", "offset": "784", "filename": "example.warc.gz", "recordDigest": "sha256:1ebd93871e6de75fb72d5398452e4152cf3d655ae31368e7336c1b57870d244c"} -com,example)/ 20170306040348 {"url": "http://example.com/", "mime": "warc/revisit", "status": "200", "digest": "sha1:G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK", "length": "585", "offset": "2635", "filename": "example.warc.gz", "recordDigest": "sha256:f67dfffc2d78d025030574b08ad57211d2d2211ae49d566adf7868072a823f74"} -""" - assert res == exp - - def test_warc_cdxj_sorted(self): - res = self.index_file("cc.warc.gz", sort=True) - exp = """\ -org,commoncrawl)/ 20170722005011 {"url": "https://commoncrawl.org/", "mime": "text/html", "status": "200", "digest": "sha1:RXZILWL37W7MAZTH76FEVIHSF2DZ5HTM", "length": "5357", "offset": "377", "filename": "cc.warc.gz"} -""" - assert res == exp - - def test_warc_cdxj_dir_root(self): - res = self.index_file("example.warc.gz", dir_root="./") - exp = """\ -com,example)/ 20170306040206 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "sha1:G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK", "length": "1242", "offset": "784", "filename": "test/data/example.warc.gz"} -com,example)/ 20170306040348 {"url": "http://example.com/", "mime": "warc/revisit", "status": "200", "digest": "sha1:G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK", "length": "585", "offset": "2635", "filename": "test/data/example.warc.gz"} -""" - assert res == exp - - def test_warc_cdx_11(self): - res = self.index_file("example.warc.gz", cdx11=True) - exp = """\ - CDX N b a m s k r M S V g -com,example)/ 20170306040206 http://example.com/ text/html 200 G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK - - 1242 784 example.warc.gz -com,example)/ 20170306040348 http://example.com/ warc/revisit 200 G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK - - 585 2635 example.warc.gz -""" - assert res == exp - - def test_warc_cdx_9(self): - res = self.index_file("example.warc.gz", cdx09=True) - exp = """\ - CDX N b a m s k r V g -com,example)/ 20170306040206 http://example.com/ text/html 200 G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK - 784 example.warc.gz -com,example)/ 20170306040348 http://example.com/ warc/revisit 200 G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK - 2635 example.warc.gz -""" - assert res == exp - - def test_warc_cdx_11_avoid_dupe_line(self): - res = self.index_file("", cdx11=True, sort=True) - lines = res.split("\n") - assert lines[0] == " CDX N b a m s k r M S V g" - assert lines[1] != " CDX N b a m s k r M S V g" - - def test_index_multiple_files(self): - res = self.index_all(["example.warc.gz", "post-test.warc.gz"]) - assert len(res.strip().split("\n")) == 5 - - def test_warc_request_only(self): - res = self.index_file("example.warc.gz", records="request", fields="method") - exp = """\ -com,example)/ 20170306040206 {"url": "http://example.com/", "digest": "sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "609", "offset": "2026", "filename": "example.warc.gz", "method": "GET"} -com,example)/ 20170306040348 {"url": "http://example.com/", "digest": "sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "609", "offset": "3220", "filename": "example.warc.gz", "method": "GET"} -""" - assert res == exp - - def test_warc_all_cdxj(self): - res = self.index_file("example.warc.gz", records="all") - exp = """\ -- 20170306040353 {"mime": "application/warc-fields", "length": "353", "offset": "0", "filename": "example.warc.gz"} -- 20170306040353 {"mime": "application/warc-fields", "length": "431", "offset": "353", "filename": "example.warc.gz"} -com,example)/ 20170306040206 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "sha1:G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK", "length": "1242", "offset": "784", "filename": "example.warc.gz"} -com,example)/ 20170306040206 {"url": "http://example.com/", "digest": "sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "609", "offset": "2026", "filename": "example.warc.gz"} -com,example)/ 20170306040348 {"url": "http://example.com/", "mime": "warc/revisit", "status": "200", "digest": "sha1:G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK", "length": "585", "offset": "2635", "filename": "example.warc.gz"} -com,example)/ 20170306040348 {"url": "http://example.com/", "digest": "sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "609", "offset": "3220", "filename": "example.warc.gz"} -""" - assert res == exp - - res = self.index_file("example.warc.gz", records="all", post_append=True) - assert res == exp - - def test_arc_cdxj(self): - res = self.index_file("example.arc") - exp = """\ -com,example)/ 20140216050221 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1656", "offset": "151", "filename": "example.arc"} -""" - assert res == exp - - def test_arc_bad_edgecase(self): - res = self.index_file("bad.arc", cdx11=True, post_append=True) - exp = """\ - CDX N b a m s k r M S V g -com,example)/ 20140401000000 http://example.com/ - - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 67 134 bad.arc -com,example)/ 20140102000000 http://example.com/ - - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 59 202 bad.arc -com,example)/ 20140401000000 http://example.com/ - - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 68 262 bad.arc -""" - assert res == exp - - def test_warc_post_query_append(self): - res = self.index_file("post-test.warc.gz", post_append=True) - exp = """\ -org,httpbin)/post?__wb_method=post&foo=bar&test=abc 20140610000859 {"url": "http://httpbin.org/post", "mime": "application/json", "status": "200", "digest": "sha1:M532K5WS4GY2H4OVZO6HRPOP47A7KDWU", "length": "720", "offset": "0", "filename": "post-test.warc.gz", "requestBody": "foo=bar&test=abc", "method": "POST"} -org,httpbin)/post?__wb_method=post&a=1&b=[]&c=3 20140610001151 {"url": "http://httpbin.org/post", "mime": "application/json", "status": "200", "digest": "sha1:M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2", "length": "723", "offset": "1196", "filename": "post-test.warc.gz", "requestBody": "A=1&B=[]&C=3", "method": "POST"} -org,httpbin)/post?__wb_method=post&data=^&foo=bar 20140610001255 {"url": "http://httpbin.org/post?foo=bar", "mime": "application/json", "status": "200", "digest": "sha1:B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ", "length": "723", "offset": "2395", "filename": "post-test.warc.gz", "requestBody": "data=^", "method": "POST"} -""" - assert res == exp - - res = self.index_file("post-test.warc.gz") - exp = """\ -org,httpbin)/post 20140610000859 {"url": "http://httpbin.org/post", "mime": "application/json", "status": "200", "digest": "sha1:M532K5WS4GY2H4OVZO6HRPOP47A7KDWU", "length": "720", "offset": "0", "filename": "post-test.warc.gz"} -org,httpbin)/post 20140610001151 {"url": "http://httpbin.org/post", "mime": "application/json", "status": "200", "digest": "sha1:M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2", "length": "723", "offset": "1196", "filename": "post-test.warc.gz"} -org,httpbin)/post?foo=bar 20140610001255 {"url": "http://httpbin.org/post?foo=bar", "mime": "application/json", "status": "200", "digest": "sha1:B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ", "length": "723", "offset": "2395", "filename": "post-test.warc.gz"} -""" - assert res == exp - - def test_warc_post_query_append_multi_and_json(self): - res = self.index_file("post-test-more.warc", post_append=True) - exp = """\ -org,httpbin)/post?__wb_method=post&another=more^data&test=some+data 20200809195334 {"url": "https://httpbin.org/post", "mime": "application/json", "status": "200", "digest": "sha1:7AWVEIPQMCA4KTCNDXWSZ465FITB7LSK", "length": "688", "offset": "0", "filename": "post-test-more.warc", "requestBody": "test=some+data&another=more%5Edata", "method": "POST"} -org,httpbin)/post?__wb_method=post&a=json-data 20200809195334 {"url": "https://httpbin.org/post", "mime": "application/json", "status": "200", "digest": "sha1:BYOQWRSQFW3A5SNUBDSASHFLXGL4FNGB", "length": "655", "offset": "1227", "filename": "post-test-more.warc", "requestBody": "a=json-data", "method": "POST"} -org,httpbin)/post?__wb_method=post&__wb_post_data=c29tzwnodw5rlwvuy29kzwrkyxrh 20200810055049 {"url": "https://httpbin.org/post", "mime": "application/json", "status": "200", "digest": "sha1:34LEADQD3MOBQ42FCO2WA5TUSEL5QOKP", "length": "628", "offset": "2338", "filename": "post-test-more.warc", "requestBody": "__wb_post_data=c29tZWNodW5rLWVuY29kZWRkYXRh", "method": "POST"} -""" - assert res == exp - - def test_warc_cdxj_compressed_1(self): - # specify file directly - with tempfile.TemporaryFile() as temp_fh: - res = self.index_file( - "", - sort=True, - post_append=True, - compress=temp_fh, - data_out_name="comp.cdxj.gz", - lines=11, - ) - - exp = """\ -!meta 0 {"format": "cdxj-gzip-1.0", "filename": "%s"} -com,example)/ 20140102000000 {"offset": 0, "length": 819} -org,httpbin)/post?__wb_method=post&another=more^data&test=some+data 20200809195334 {"offset": 819, "length": 420} -""" - assert res == exp % "comp.cdxj.gz" - - # specify named temp file, extension auto-added - with tempfile.NamedTemporaryFile() as temp_fh: - res = self.index_file( - "", sort=True, post_append=True, compress=temp_fh.name, lines=11 - ) - name = temp_fh.name - - assert res == exp % (name + ".cdxj.gz") - - # specify named temp file, with extension suffix - with tempfile.NamedTemporaryFile(suffix=".cdxj.gz") as temp2_fh: - res = self.index_file( - "", sort=True, post_append=True, compress=temp2_fh.name, lines=11 - ) - name = temp2_fh.name - - assert res == exp % name - - def test_warc_cdxj_compressed_2_with_digest(self): - # specify file directly - with tempfile.TemporaryFile() as temp_fh: - res = self.index_file( - "", - sort=True, - post_append=True, - compress=temp_fh, - data_out_name="comp_2.cdxj.gz", - lines=11, - max_sort_buff_size=1000, - digest_records=True, - ) - - lines = res.strip().split("\n") - - assert len(lines) == 3 - assert ( - lines[0] - == '!meta 0 {"format": "cdxj-gzip-1.0", "filename": "comp_2.cdxj.gz"}' - ) - assert lines[1].startswith( - 'com,example)/ 20140102000000 {"offset": 0, "length": 1319, "digest": "sha256:' - ) - assert lines[2].startswith( - 'org,httpbin)/post?__wb_method=post&another=more^data&test=some+data 20200809195334 {"offset": 1319, "length": 570, "digest": "sha256:' - ) - - # specify named temp file, extension auto-added - with tempfile.NamedTemporaryFile() as temp_fh: - res2 = self.index_file( - "", - sort=True, - post_append=True, - compress=temp_fh.name, - lines=11, - digest_records=True, - ) - name = temp_fh.name - - assert res2 == res.replace("comp_2", name) - - # specify named temp file, with extension suffix - with tempfile.NamedTemporaryFile(suffix=".cdxj.gz") as temp2_fh: - res3 = self.index_file( - "", - sort=True, - post_append=True, - compress=temp2_fh.name, - lines=11, - digest_records=True, - ) - name = temp2_fh.name - - assert res3 == res.replace("comp_2.cdxj.gz", name) - - def test_warc_index_add_custom_fields(self): - res = self.index_file("example.warc.gz", fields="method,referrer,http:date") - - exp = """\ -com,example)/ 20170306040206 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "sha1:G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK", "length": "1242", "offset": "784", "filename": "example.warc.gz", "method": "GET", "referrer": "https://webrecorder.io/temp-MJFXHZ4S/temp/recording-session/record/http://example.com/", "http:date": "Mon, 06 Mar 2017 04:02:06 GMT"} -com,example)/ 20170306040348 {"url": "http://example.com/", "mime": "warc/revisit", "status": "200", "digest": "sha1:G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK", "length": "585", "offset": "2635", "filename": "example.warc.gz", "http:date": "Mon, 06 Mar 2017 04:03:48 GMT"} -""" - assert res == exp - - def test_warc_index_custom_fields_1(self): - res = self.index_file( - "example.warc.gz", - records="response,request,revisit", - replace_fields="warc-type", - ) - - exp = """\ -com,example)/ 20170306040206 {"warc-type": "response"} -com,example)/ 20170306040206 {"warc-type": "request"} -com,example)/ 20170306040348 {"warc-type": "revisit"} -com,example)/ 20170306040348 {"warc-type": "request"} -""" - assert res == exp - - def test_warc_index_custom_fields_2(self): - res = self.index_file( - "cc.warc.gz", records="all", replace_fields="method,mime,warc-type,date" - ) - - exp = """\ -org,commoncrawl)/ 20170722005011 {"method": "GET", "warc-type": "request"} -org,commoncrawl)/ 20170722005011 {"method": "GET", "mime": "text/html", "warc-type": "response"} -org,commoncrawl)/ 20170722005011 {"mime": "application/warc-fields", "warc-type": "metadata"} -""" - assert res == exp - - def test_cdxj_empty(self): - output = StringIO() - - empty = BytesIO() - - opts = {"filename": "empty.warc.gz"} - - write_cdx_index(output, empty, opts) - - assert output.getvalue() == "" - - def test_cdxj_middle_empty_records(self): - empty_gzip_record = b"\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x03\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00" - - new_warc = BytesIO() - - with open(os.path.join(TEST_DIR, "example.warc.gz"), "rb") as fh: - new_warc.write(empty_gzip_record) - new_warc.write(fh.read()) - new_warc.write(empty_gzip_record) - new_warc.write(empty_gzip_record) - fh.seek(0) - new_warc.write(fh.read()) - - new_warc.seek(0) - - output = StringIO() - opts = {"filename": "empty.warc.gz"} - - write_cdx_index(output, new_warc, opts) - - lines = output.getvalue().rstrip().split("\n") - - assert len(lines) == 4, lines - - def test_missing_http(self): - res = self.index_file("missing-http.warc.gz", post=True) - - exp = """\ -com,zoubanio,img1)/icon/u3927203-87.jpg 20170430113919 {"url": "https://img1.zoubanio.com/icon/u3927203-87.jpg", "digest": "sha1:FARQCVFYY7UZ5O5ZKSERUXHUUGEGL4IO", "length": "307", "offset": "0", "filename": "missing-http.warc.gz"} -""" - assert res == exp - - -class CustomIndexer(CDXJIndexer): - def process_index_entry(self, it, record, *args): - type_ = record.rec_headers.get("WARC-Type") - if type_ == "response" and record.http_headers.get("Content-Type").startswith( - "text/html" - ): - assert record.buffered_stream.read() != b"" - - -def test_custom_indexer(): - output = StringIO() - indexer = CustomIndexer( - output=output, - inputs=[os.path.join(TEST_DIR, "example.warc.gz")], - fields="referrer", - ) - - assert indexer.collect_records - - indexer.process_all() diff --git a/tests/cdxj_indexer/test_postappend.py b/tests/cdxj_indexer/test_postappend.py index 1b803ca7..6bcc35e8 100644 --- a/tests/cdxj_indexer/test_postappend.py +++ b/tests/cdxj_indexer/test_postappend.py @@ -1,10 +1,6 @@ from io import BytesIO -from pyamf import AMF3 -from pyamf.remoting import Request, Envelope, encode - -from cdxj_indexer.postquery import append_method_query -from cdxj_indexer.amf import amf_parse +from warc2zim.cdxj_indexer.postquery import append_method_query # ============================================================================ @@ -24,7 +20,7 @@ def append_query(self, url): # ============================================================================ -class TestPostQueryExtract(object): +class TestPostQueryExtract: @classmethod def setup_class(cls): cls.post_data = b"foo=bar&dir=%2Fbaz" @@ -205,16 +201,3 @@ def test_head(self): mq.append_query("http://example.com/") == "http://example.com/?__wb_method=HEAD" ) - - def test_amf_parse(self): - mq = MethodQueryCanonicalizer("POST", "application/x-amf", 0, BytesIO()) - - req = Request(target="t", body="") - ev_1 = Envelope(AMF3) - ev_1["/0"] = req - - req = Request(target="t", body="alt_content") - ev_2 = Envelope(AMF3) - ev_2["/0"] = req - - assert amf_parse(encode(ev_1).getvalue()) != amf_parse(encode(ev_2).getvalue())