scrape_archive_org.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
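"""Download Archive.org metadata and details pages for a list of identifiers.

For each identifier in the first column of the input CSV, the script fetches
the item's ``<id>_meta.xml`` into the meta directory and the
``/details/<id>`` HTML page into the html directory, optionally
gzip-compressing both. Files that already exist on disk are skipped, and
failed downloads are retried with a two-minute back-off (up to five attempts).
The size of the thread pool can be set via the MAX_WORKERS environment
variable.
"""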
import concurrent.futures
import gzip
import logging
import optparse
import os
import sys
import time

import pandas as pd
import requests

# Make sure the log directory exists before attaching the file handler.
os.makedirs("logs", exist_ok=True)
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(message)s',
                    handlers=[logging.FileHandler("logs/scrape_archive_org.log"),
                              logging.StreamHandler()])
__version__ = 'r5 (2022/10/28)'
META_DIR = 'data/meta/'
HTML_DIR = 'data/html/'
MAX_WORKERS = int(os.environ.get("MAX_WORKERS", 3))
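
# Example invocation (the CSV file name below is hypothetical; the
# Archive.org identifier is expected in the first column):
#   MAX_WORKERS=5 python scrape_archive_org.py --compress identifiers.csv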
def parse_command_line(argv):
    """Command line options parser for the script."""
    usage = "Usage: %prog [options] <CSV input file>"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("--meta", action="store",
                      type="string", dest="meta", default=META_DIR,
                      help="Meta files directory (default: '{:s}')".format(META_DIR))
    parser.add_option("--html", action="store",
                      type="string", dest="html", default=HTML_DIR,
                      help="HTML files directory (default: '{:s}')".format(HTML_DIR))
    parser.add_option("-s", "--skip", action="store",
                      type="int", dest="skip", default=0,
                      help="Skip rows (default: 0)")
    parser.add_option("-c", "--compress", action="store_true",
                      dest="compress", default=False,
                      help="Compress downloaded files (default: No)")
    return parser.parse_args(argv)
def download_file(options, url, local_filename):
    """Stream `url` to `local_filename`, gzip-compressing it if requested."""
    # NOTE the stream=True parameter: the body is fetched in chunks.
    logging.info("Downloading...[{:s}]".format(url))
    r = requests.get(url, stream=True)
    if options.compress:
        f = gzip.open(local_filename, 'wb')
    else:
        f = open(local_filename, 'wb')
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:  # filter out keep-alive new chunks
            f.write(chunk)
            f.flush()
    f.close()
def handle_download(_id, retry=0):
    """Download the meta XML and details HTML for one identifier, retrying on errors.

    Relies on the module-level ``options`` parsed in ``__main__``.
    """
    # Rows from the CSV arrive as one-element lists; retries pass the bare string.
    if isinstance(_id, (list, tuple)):
        _id = _id[0]
    try:
        file_name = os.path.join(options.meta, _id + "_meta.xml")
        if options.compress:
            file_name += ".gz"
        if not os.path.isfile(file_name):
            # Follow the redirect to the item's download page to get its base URL.
            rq = requests.get('http://archive.org/download/' + _id)
            if rq.status_code == 200:
                base_url = rq.url if rq.url.endswith('/') else rq.url + '/'
                url = base_url + _id + "_meta.xml"
                if not os.path.isfile(file_name):
                    download_file(options, url, file_name)
        url = 'http://archive.org/details/' + _id
        file_name = os.path.join(options.html, _id + ".html")
        if options.compress:
            file_name += ".gz"
        if not os.path.isfile(file_name):
            download_file(options, url, file_name)
    except Exception:
        if retry > 4:
            logging.error(f'id: {_id}. Stopped retrying')
        else:
            retry += 1
            wait_time = 120
            logging.warning(f'id: {_id}. Waiting {wait_time} secs and retrying...')
            time.sleep(wait_time)
            handle_download(_id, retry=retry)
def parallel_download(identifiers):
    """Run handle_download over all identifiers in a thread pool."""
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        for r in executor.map(handle_download, identifiers):
            if r:
                logging.warning(r)
if __name__ == "__main__":
    logging.info("{:s} - {:s}\n".format(os.path.basename(sys.argv[0]), __version__))
    logging.info(f'Max workers set to {MAX_WORKERS}')
    (options, args) = parse_command_line(sys.argv)
    if len(args) < 2:
        logging.info("Usage: {:s} [options] <CSV input file>".format(os.path.basename(sys.argv[0])))
        sys.exit(-1)
    # Create the output directories (including parents) if they do not exist yet.
    os.makedirs(options.meta, exist_ok=True)
    os.makedirs(options.html, exist_ok=True)
    # CSV to list: each row becomes a list whose first element is the identifier.
    df = pd.read_csv(args[1])
    identifiers = [list(row) for row in df.values]
    # Consider skip option
    if options.skip:
        identifiers = identifiers[options.skip:]
    # Download
    if os.environ.get("ARCHIVE_TEST"):
        # Testing: process sequentially in the main thread.
        for id_ in identifiers:
            handle_download(id_)
    else:
        # Multithread: keep retrying until every details page has been fetched.
        total = len(identifiers)
        logging.info(f'{total} total identifiers to process...')
        downloaded = len(os.listdir(options.html))
        logging.info(f'{downloaded} total identifiers downloaded...')
        while downloaded < total:
            try:
                parallel_download(identifiers)
            except Exception as e:
                logging.warning(f'Restarting: {e}')
                time.sleep(120)
            # Re-count the downloaded HTML files so the loop can terminate.
            downloaded = len(os.listdir(options.html))
    logging.info("All done")