#!/usr/bin/env bash
usage()
{
    cat << EOF
Send a request to the Internet Archive's Wayback Machine to save a URL.
Multiple URLs (one per line) can be provided, either from a file given as the
first argument or from standard input,
e.g. piping: 'cat urls.txt | archiveurl'
You may use this script together with "geturls"[1] to automatically archive all
links inside a web page,
e.g. 'geturls https://www.openbookpublishers.com/htmlreader/978-1-78374-323-0/main.html | archiveurl'
You may specify a list of URLs that should not be archived, either in the
default file './exceptions.txt' or in a file set through an environment
variable: 'export EXCEPTION_FILE=/path'.
[1] https://github.com/OpenBookPublishers/geturls
(c) Javier Arias, Open Book Publishers, July 2018
Use of this software is governed by the terms of the MIT license -- see LICENSE
OPTIONS:
   -h   Show this message
EOF
}
# Wayback Machine save endpoint; the target URL is appended to this base
BASE="http://web.archive.org/save/"
# Browser-like user agent sent with every request
AGENT="Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:40.0) Gecko/20100101 Firefox/40.0"
# Exception list defaults to exceptions.txt in the same directory as this
# script, overridable via the EXCEPTION_FILE environment variable
DEFAULT_EXCEPT="$(dirname "$0")/exceptions.txt"
EXCEPT=${EXCEPTION_FILE:-$DEFAULT_EXCEPT}
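
# The exceptions file is matched with 'grep -Fx' below: one full URL per
# line, compared literally against each input line. Hypothetical example
# contents:
#   https://example.com/do-not-archive
#   https://example.com/private/report.pdf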
while getopts "h" OPTION
do
    case $OPTION in
        h)
            usage
            exit 0
            ;;
        ?)
            usage
            exit 1
            ;;
    esac
done
# Discard parsed options so a file argument, if any, is left in $1
shift $((OPTIND-1))
while IFS= read -r url; do
    # Skip URLs listed in the exceptions file (exact, fixed-string match)
    if [ -f "$EXCEPT" ] && grep -Fxq "$url" "$EXCEPT"; then
        continue
    fi
    toarchive="${BASE}${url}"
    # Discard the response body and capture only the HTTP status code
    code=$(curl --silent \
                --location \
                --write-out "%{http_code}" \
                --user-agent "${AGENT}" \
                --output /dev/null \
                "${toarchive}")
    if [[ "$code" == 2* ]]; then
        echo "Archived (${code}): ${url}"
    else
        >&2 echo "Failed (${code}): ${url}"
        if [[ "$code" == 429 ]]; then
            # Too Many Requests - pause before sending the next request
            sleep 5
        fi
    fi
done < "${1:-/dev/stdin}"
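
# Example invocations (hypothetical file names), reading URLs from standard
# input or from a file passed as the first argument:
#   printf 'https://example.com\n' | ./archiveurl
#   ./archiveurl urls.txt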