Skip to content

Commit

Permalink
Merge branch 'r/1.5.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
bitdruid committed Aug 24, 2024
1 parent 388cef2 commit 78c6535
Show file tree
Hide file tree
Showing 9 changed files with 150 additions and 94 deletions.
16 changes: 10 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,11 @@ This tool allows you to download content from the Wayback Machine (archive.org).
```pip install .```
- in a virtual env or use `--break-system-packages`

## Usage infos
## Usage infos - important notes

- Linux recommended: On Windows machines, the path length is limited. This can only be overcome by editing the registry. Files that exceed the path length will not be downloaded.
- If you query an explicit file (e.g. a query-string `?query=this` or `login.html`), the `--explicit`-argument is recommended as a wildcard query may lead to an empty result.
- The tool will inform you if your query returns a very large number of snapshots, which could exhaust your system memory and lead to a crash. Consider splitting your query into smaller jobs by specifying a range, e.g. `--start 2023 --end 2024` or `--range 1`.

## Arguments

Expand Down Expand Up @@ -97,10 +98,13 @@ Specifies number of retry attempts for failed downloads.
- **`--delay`** `<seconds>`:<br>
Specifies delay between download requests in seconds. Default is no delay (0).

- **`--limit`** `<count>`:<br>
Limits the number of snapshots to query from the CDX server. If an existing CDX file is injected (with `--cdxinject` or `--auto`), the limit will have no effect.

<!-- - **`--convert-links`**:<br>
If set, all links in the downloaded files will be converted to local links. This is useful for offline browsing. The links are converted to the local path structure. Show output with `--verbosity trace`. -->

**CDX Query Handling:**
**CDX Query Result Handling:**
- **`--cdxbackup`** `<path>`:<br>
Path defaults to output-dir. Saves the result of CDX query as a file. Useful for later downloading snapshots and overcoming refused connections by CDX server due to too many queries. Named as `waybackup_<sanitized_url>.cdx`.

Expand All @@ -111,10 +115,6 @@ Injects a CDX query file to download snapshots. Ensure the query matches the pre
- **`--auto`**:<br>
If set, csv, skip and cdxbackup/cdxinject are handled automatically. Keep the files and folders as they are. Otherwise they will not be recognized when restarting a download.

### Debug

- `--debug`: If set, full traceback will be printed in case of an error. The full exception will be written into `waybackup_error.log`.

### Examples

Download latest snapshot of all files:<br>
Expand Down Expand Up @@ -216,6 +216,10 @@ For list queries:

The csv contains the json response in a table format.

### Debugging

Exceptions will be written into `waybackup_error.log` (each run overwrites the file).

## Contributing

I'm always happy to receive feature requests that improve the usability of this tool.
Expand Down
4 changes: 3 additions & 1 deletion pywaybackup/Arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ def __init__(self):

parser = argparse.ArgumentParser(description='Download from wayback machine (archive.org)')
parser.add_argument('-a', '--about', action='version', version='%(prog)s ' + __version__ + ' by @bitdruid -> https://github.com/bitdruid')
parser.add_argument('-d', '--debug', action='store_true', help='Debug mode (Always full traceback and creates an error.log')

required = parser.add_argument_group('required (one exclusive)')
required.add_argument('-u', '--url', type=str, metavar="", help='url (with subdir/subdomain) to download')
Expand All @@ -40,6 +39,7 @@ def __init__(self):
special.add_argument('--workers', type=int, default=1, metavar="", help='number of workers (simultaneous downloads)')
# special.add_argument('--convert-links', action='store_true', help='Convert all links in the files to local paths. Requires -c/--current')
special.add_argument('--delay', type=int, default=0, metavar="", help='delay between each download in seconds')
special.add_argument('--limit', type=int, nargs='?', const=True, metavar='int', help='limit the number of snapshots to download')

cdx = parser.add_argument_group('cdx (one exclusive)')
exclusive_cdx = cdx.add_mutually_exclusive_group()
Expand Down Expand Up @@ -84,6 +84,8 @@ def init(cls):
if cls.current:
cls.mode = "current"

cls.cdxbackup = cls.output if cls.cdxbackup is None else cls.cdxbackup

if cls.auto:
cls.skip = cls.output
cls.csv = cls.output
Expand Down
55 changes: 26 additions & 29 deletions pywaybackup/Exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,14 @@
class Exception:

new_debug = True
debug = False
output = None
command = None

@classmethod
def init(cls, debug=False, output=None, command=None):
def init(cls, output=None, command=None):
sys.excepthook = cls.exception_handler # set custom exception handler (uncaught exceptions)
cls.output = output
cls.command = command
cls.debug = True if debug else False

@classmethod
def exception(cls, message: str, e: Exception, tb=None):
Expand Down Expand Up @@ -53,33 +51,32 @@ def exception(cls, message: str, e: Exception, tb=None):
"-------------------------"
)
print(exception_message)
if cls.debug:
debug_file = os.path.join(cls.output, "waybackup_error.log")
print(f"Exception log: {debug_file}")
print("-------------------------")
print(f"Full traceback:\n{original_tb}")
if cls.new_debug: # new run, overwrite file
cls.new_debug = False
f = open(debug_file, "w")
f.write("-------------------------\n")
f.write(f"Version: {__version__}\n")
f.write("-------------------------\n")
f.write(f"Command: {cls.command}\n")
f.write("-------------------------\n\n")
else: # current run, append to file
f = open(debug_file, "a")
f.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n")
f.write(exception_message + "\n")
f.write("!-- Local Variables:\n")
for var_name, value in local_vars.items():
if var_name in ["status_message", "headers"]:
continue
value = cls.relativate_path(str(value))
value = value[:666] + " ... " if len(value) > 666 else value
f.write(f" -- {var_name} = {value}\n")
debug_file = os.path.join(cls.output, "waybackup_error.log")
print(f"Exception log: {debug_file}")
# print("-------------------------")
# print(f"Full traceback:\n{original_tb}")
if cls.new_debug: # new run, overwrite file
cls.new_debug = False
f = open(debug_file, "w")
f.write("-------------------------\n")
f.write(original_tb + "\n")
f.close()
f.write(f"Version: {__version__}\n")
f.write("-------------------------\n")
f.write(f"Command: {cls.command}\n")
f.write("-------------------------\n\n")
else: # current run, append to file
f = open(debug_file, "a")
f.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n")
f.write(exception_message + "\n")
f.write("!-- Local Variables:\n")
for var_name, value in local_vars.items():
if var_name in ["status_message", "headers"]:
continue
value = cls.relativate_path(str(value))
value = value[:666] + " ... " if len(value) > 666 else value
f.write(f" -- {var_name} = {value}\n")
f.write("-------------------------\n")
f.write(original_tb + "\n")
f.close()

@classmethod
def relativate_path(cls, input: str) -> str:
Expand Down
19 changes: 16 additions & 3 deletions pywaybackup/SnapshotCollection.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from pywaybackup.helper import url_split
import json
import os

class SnapshotCollection:
Expand All @@ -7,15 +8,27 @@ class SnapshotCollection:
MODE_CURRENT = 0

@classmethod
def create_list(cls, cdxResult, mode):
def create_list(cls, cdxfile, mode):
"""
Create the snapshot collection list from a cdx result.
- mode `full`: All snapshots are included.
- mode `current`: Only the latest snapshot of each file is included.
"""
# creates a list of dictionaries for each snapshot entry
cls.SNAPSHOT_COLLECTION = sorted([{"timestamp": snapshot[0], "digest": snapshot[1], "mimetype": snapshot[2], "status": snapshot[3], "url": snapshot[4]} for snapshot in cdxResult[1:]], key=lambda k: k['timestamp'], reverse=True)
with open(cdxfile, "r") as f:
first_line = True
for line in f:
if first_line:
first_line = False
continue
line = line.strip()
if line.endswith("]]"): line = line.rsplit("]", 1)[0]
if line.endswith(","): line = line.rsplit(",", 1)[0]
else: continue # drop incomplete line, maybe cdx response was cut off
line = json.loads(line)
line = {"timestamp": line[0], "digest": line[1], "mimetype": line[2], "status": line[3], "url": line[4]}
cls.SNAPSHOT_COLLECTION.append(line)
cls.SNAPSHOT_COLLECTION = sorted(cls.SNAPSHOT_COLLECTION, key=lambda k: k['timestamp'], reverse=True)
if mode == "current":
cls.MODE_CURRENT = 1
cdxResult_list_filtered = []
Expand Down
3 changes: 2 additions & 1 deletion pywaybackup/Verbosity.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,13 @@ def __init__(self):
self.message = {}

def __str__(self):
return self.message
return str(self.message)

def store(self, status: str = "", type: str = "", message: str = "", level: str = "info"):
if level not in self.message:
self.message[level] = []
self.message[level].append(super().generate_logline(status, type, message))
#super().write(message=f"Stored message: {status} -> {type}: {message}")

def clear(self):
self.message = {}
Expand Down
2 changes: 1 addition & 1 deletion pywaybackup/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.4.0"
__version__ = "1.5.0"
Loading

0 comments on commit 78c6535

Please sign in to comment.