Skip to content

Commit

Permalink
Remove unused code/tests + 'modernize'
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Dec 20, 2024
1 parent 0080cc3 commit 3cced66
Show file tree
Hide file tree
Showing 18 changed files with 44 additions and 1,131 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Upgrade to wombat 3.8.6 (#334)
- Fix wombat setup settings (especially `isSW`) (#293)
- Fork cdxj_indexer codebase (#428)

### Fixed

Expand Down
2 changes: 2 additions & 0 deletions docs/software_architecture.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ It provide two main features:

Apart from that, the scraper directly uses WarcRecord (returned by cdxj_indexer, implemented in warcio) to access metadata and such.

The useful methods of cdxj_indexer are currently forked in warc2zim; see https://github.com/openzim/warc2zim/pull/428 for details.

## zimscraperlib

[zimscraperlib Python library](https://pypi.org/project/zimscraperlib) is used for ZIM operations.
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@ dependencies = [
"jinja2==3.1.4", # also update version in build-system above and in build_js.sh
# to support possible brotli content in warcs, must be added separately
"brotlipy==0.7.0",
"cdxj_indexer==1.4.6",
"tinycss2==1.4.0",
"beautifulsoup4==4.12.3", # used to parse base href
"lxml==5.3.0", # used by beautifulsoup4 for parsing html
"python-dateutil==2.9.0.post0",
"multipart==1.2.1",
]
dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

Expand Down
12 changes: 9 additions & 3 deletions src/warc2zim/cdxj_indexer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
from cdxj_indexer.main import CDXJIndexer, iter_file_or_dir
from cdxj_indexer.postquery import append_method_query_from_req_resp
from cdxj_indexer.bufferiter import buffering_record_iter
from warc2zim.cdxj_indexer.bufferiter import buffering_record_iter
from warc2zim.cdxj_indexer.main import iter_file_or_dir
from warc2zim.cdxj_indexer.postquery import append_method_query_from_req_resp

__all__ = [
"append_method_query_from_req_resp",
"buffering_record_iter",
"iter_file_or_dir",
]
84 changes: 0 additions & 84 deletions src/warc2zim/cdxj_indexer/amf.py

This file was deleted.

16 changes: 7 additions & 9 deletions src/warc2zim/cdxj_indexer/bufferiter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,14 @@
import shutil
import tempfile

from cdxj_indexer.postquery import append_method_query_from_req_resp

from warc2zim.cdxj_indexer.postquery import append_method_query_from_req_resp

BUFF_SIZE = 1024 * 64


# ============================================================================
def buffering_record_iter(
record_iter, post_append=False, digest_reader=None, url_key_func=None
record_iter, digest_reader=None, url_key_func=None, *, post_append=False
):
prev_record = None

Expand All @@ -30,10 +29,8 @@ def buffering_record_iter(

if digest_length != record.file_length:
raise Exception(
"Digest block mismatch, expected {0}, got {1}".format(
record.file_length,
digest_length,
)
f"Digest block mismatch, expected {record.file_length}, "
f"got {digest_length}"
)

record.record_digest = record_digest
Expand All @@ -50,7 +47,8 @@ def buffering_record_iter(
join_req_resp(req, resp, post_append, url_key_func)

yield prev_record
prev_record.buffered_stream.close()
if prev_record:
prev_record.buffered_stream.close()
yield record
record.buffered_stream.close()
prev_record = None
Expand Down Expand Up @@ -107,7 +105,7 @@ def join_req_resp(req, resp, post_append, url_key_func=None):
method = req.http_headers.protocol
if post_append and method.upper() in ("POST", "PUT"):
url = req.rec_headers.get_header("WARC-Target-URI")
query, append_str = append_method_query_from_req_resp(req, resp)
query, append_str = append_method_query_from_req_resp(req)
resp.method = method.upper()
resp.requestBody = query
resp.urlkey = url + append_str
Expand Down
Loading

0 comments on commit 3cced66

Please sign in to comment.