-
Notifications
You must be signed in to change notification settings - Fork 3
/
iiif_downloader.py
208 lines (183 loc) · 11.6 KB
/
iiif_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
__author__ = 'Ernesto Coto'
__copyright__ = 'Jan 2020'
import argparse
import csv
import json
import os
import re
import string
import requests
from PIL import Image
# Constants
IMAGE_MAX_WIDTH = "full"
VERIFY_SSL_CERTIFICATE = False
# Functions
def download_iiif_content(document_url, images_base_path, metadata_file_path, image_max_width, verify_ssl_certificate):
    """
    Downloads the images and metadata from a public JSON IIIF document.
    Arguments:
        document_url: URL pointing to the document (sc:Manifest, sc:Sequence or sc:Canvas).
        images_base_path: Base path folder where to store the downloaded images.
        metadata_file_path: Path to a CSV file where to store the downloaded metadata,
                            or None to skip metadata collection.
        image_max_width: Max image width of downloaded files, as a string: "full"
                         or a number of pixels (e.g. "800").
        verify_ssl_certificate: Whether to verify SSL certificates on HTTP requests.
    """
    print ('=======')
    print ('Processing document at %s' % document_url)
    # characters allowed in the destination folder name: [a-zA-Z0-9_]
    pattern = re.compile('[^a-zA-Z0-9_]')
    string_accepted = pattern.sub('', string.printable)
    doc_label = doc_attribution = doc_description = None
    images_counter = 0
    images_metadata = []
    # Pre-compute the width values once. BUGFIX: the original appended a comma to
    # image_max_width inside the image loop, which later made the numeric width
    # comparison and float() conversion fail in the local-scaling branch.
    if image_max_width.isdigit():
        max_width_px = int(image_max_width)   # numeric limit for local downscaling
        size_param = image_max_width + ','    # IIIF Image API size syntax "w,"
    else:
        max_width_px = None                   # e.g. "full": no local downscaling
        size_param = image_max_width
    try:
        response = requests.get(document_url, allow_redirects=True, verify=verify_ssl_certificate)
        document = response.json()
        if document['@type'] not in [ "sc:Manifest", "sc:Sequence", "sc:Canvas"]:
            raise Exception("Only documents of type sc:Manifest, sc:Sequence, sc:Canvas are supported")
        if 'label' in document:
            doc_label = document['label']
        if 'attribution' in document:
            doc_attribution = document['attribution']
        if 'description' in document:
            doc_description = document['description']
        destination_folder_name = ''.join(filter(lambda char: char in string_accepted, document['@id']))
        destination_folder_path = os.path.join(images_base_path, destination_folder_name)
        if os.path.exists(destination_folder_path):
            raise Exception("An image folder for this document already exists. Aborting !")
        os.mkdir(destination_folder_path)
        # Normalize the three supported document types to the manifest layout:
        # {'sequences': [{'canvases': [{'images': [...]}]}]}
        iterable = {}
        if document['@type'] == "sc:Manifest":
            iterable = document
        if document['@type'] == "sc:Sequence":
            iterable['sequences'] = [{'canvases': document['canvases']}]
        if document['@type'] == "sc:Canvas":
            iterable['sequences'] = [{'canvases': [{'images': document['images']}]}]
        for sequence in iterable['sequences']:
            for canvas in sequence['canvases']:
                canvas_label = canvas.get('label')
                for image in canvas['images']:
                    destination_file_path = None
                    image_url = None
                    try:
                        if 'resource' in image and ( ('format' in image['resource'] and 'image' in image['resource']['format']) or
                             ('@type' in image['resource'] and image['resource']['@type']=='dctypes:Image' ) ) :
                            resource = image['resource']
                            scale_image = False
                            if 'service' in resource:
                                # An IIIF Image API service is available: let the server scale.
                                # check the context for the API version
                                if '@context' in resource['service'] and '/1/' in resource['service']['@context']:
                                    # attempt to retrieve files named 'native' if API v1.1 is used
                                    image_url = resource['service']['@id'] + '/full/' + str(size_param) + '/0/native'
                                else:
                                    # attempt to retrieve files named 'default' otherwise
                                    image_url = resource['service']['@id'] + '/full/' + str(size_param) + '/0/default'
                                # avoid an (ocasionally) incorrect double // when building the URL
                                image_url = image_url.replace('//full','/full')
                                # check if image can be downloaded without specifyng the format...
                                # BUGFIX: honor verify_ssl_certificate (was hard-coded verify=True)
                                head_response = requests.head(image_url, allow_redirects=True, verify=verify_ssl_certificate)
                                if head_response.status_code != 200:
                                    # ... try get the format otherwise
                                    response = requests.get(resource['service']['@id'] + '/info.json', allow_redirects=True, verify=verify_ssl_certificate)
                                    service_document = response.json()
                                    if len(service_document['profile']) > 1:
                                        service_profiles = service_document['profile'][1:] # 0 is always a compliance URL
                                        if 'formats' in service_profiles[0]:
                                            image_format = service_profiles[0]['formats'][0] # just use the first format
                                            image_url = image_url + '.' + image_format
                                        else:
                                            # no formats advertised: fall back to the static resource
                                            image_url = resource['@id']
                                            scale_image = True
                                    else:
                                        # no usable profile: fall back to the static resource
                                        image_url = resource['@id']
                                        scale_image = True
                            else:
                                # no Image API service: download the static resource, scale locally
                                image_url = resource['@id']
                                scale_image = True
                            print ('Downloading %s' % image_url)
                            destination_file_path = os.path.join(destination_folder_path, str(images_counter).zfill(4) )
                            r = requests.get(image_url, allow_redirects=True, verify=verify_ssl_certificate)
                            with open(destination_file_path, 'wb') as newimg:
                                newimg.write(r.content)
                            img = Image.open(destination_file_path)
                            if scale_image and max_width_px is not None:
                                # BUGFIX: compare against the integer width (the original
                                # compared against the raw string, raising TypeError and
                                # silently skipping every image that reached this branch)
                                imW, imH = img.size
                                if imW > max_width_px: # make sure we are downscaling
                                    scale = float(max_width_px)/imW
                                    img.thumbnail((int(imW*scale), int(imH*scale)), resample=Image.BICUBIC)
                            img.convert('RGB').save(destination_file_path + '.jpg', 'JPEG') # always store jpg
                            os.remove(destination_file_path)
                            img_metadata = { 'filename': os.path.join(destination_folder_name, str(images_counter).zfill(4) + '.jpg') }
                            if metadata_file_path:
                                # save more metadata of current image
                                img_metadata['file_attributes'] = { }
                                if canvas_label:
                                    img_metadata['file_attributes']['caption'] = canvas_label
                                if doc_label:
                                    img_metadata['file_attributes']['document_label'] = doc_label
                                if doc_attribution:
                                    img_metadata['file_attributes']['document_attribution'] = doc_attribution
                                if doc_description:
                                    img_metadata['file_attributes']['document_description'] = doc_description
                            images_metadata.append(img_metadata)
                            images_counter = images_counter + 1
                    except Exception as e:
                        # best-effort: remove the partial file and continue with the next image
                        if destination_file_path and os.path.exists(destination_file_path):
                            os.remove(destination_file_path)
                        print ('Exception while accessing image at url %s, skipping. Problem: %s' % (image_url, str(e)))
        # Save metadata to CSV, if required
        if metadata_file_path:
            print ('Saving metadata to %s' % metadata_file_path)
            # write the header only when creating the file for the first time
            write_header = not os.path.exists(metadata_file_path)
            with open(metadata_file_path, 'a') as metadata_file_handler:
                if write_header:
                    metadata_file_handler.write('#filename,file_attributes\n')
                csv_writer = csv.writer(metadata_file_handler, lineterminator='\n', delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                for item in images_metadata:
                    csv_writer.writerow( [item['filename'], json.dumps(item['file_attributes']) ] )
        # Always keep a running list of every downloaded file
        downloaded_images_file = os.path.join(images_base_path, 'downloaded_images.txt')
        print ('Saving list of downloaded files to %s' % downloaded_images_file)
        with open(downloaded_images_file, 'a' ) as new_files_list:
            for item in images_metadata:
                new_files_list.write(item['filename'] + '\n')
    except Exception as e:
        # top-level guard: report and return; partial results are left on disk
        print (e)
    print ('=======')
def main():
    """Parse the command line, then download the referenced IIIF content.

    A sc:Collection document is expanded into its member manifests, each of
    which is downloaded in turn; sc:Manifest, sc:Sequence and sc:Canvas
    documents are downloaded directly.
    """
    arg_parser = argparse.ArgumentParser(description='IIIF Data Downloader')
    arg_parser.add_argument('iif_document_url', metavar='iif_document_url', type=str, help='URL to IIIF document')
    arg_parser.add_argument('images_base_path', metavar='images_base_path', type=str, help='Base folder to store downloaded images')
    arg_parser.add_argument('-m', dest='metadata_file_path', type=str, default=None, help='Path to the CSV file where to store the downloaded metadata. If equal to "None" no metadata will be downloaded. Default: "None"')
    arg_parser.add_argument('-w', dest='image_max_width', type=str, default=IMAGE_MAX_WIDTH, help='Maximum width (in pixels) of downloaded images. Default: %s' % IMAGE_MAX_WIDTH)
    arg_parser.add_argument('-c', dest='verify_ssl_certificate', default=VERIFY_SSL_CERTIFICATE, action='store_true', help='Enables SSL certificate verification when accessing the manifest and images. Default: %s' % VERIFY_SSL_CERTIFICATE)
    parsed = arg_parser.parse_args()
    # make sure the output locations exist before any download starts
    if not os.path.exists(parsed.images_base_path):
        os.makedirs(parsed.images_base_path)
    if parsed.metadata_file_path and not os.path.exists(os.path.dirname(parsed.metadata_file_path)):
        os.makedirs(os.path.dirname(parsed.metadata_file_path))
    # fetch the top-level document to decide how to dispatch it
    reply = requests.get(parsed.iif_document_url, allow_redirects=True, verify=parsed.verify_ssl_certificate)
    top_document = reply.json()
    doc_type = top_document['@type']
    if doc_type == "sc:Collection":
        # a collection: download every manifest it references
        for manifest in top_document['manifests']:
            download_iiif_content(manifest['@id'], parsed.images_base_path, parsed.metadata_file_path, parsed.image_max_width, parsed.verify_ssl_certificate)
    elif doc_type in ("sc:Manifest", "sc:Sequence", "sc:Canvas"):
        download_iiif_content(parsed.iif_document_url, parsed.images_base_path, parsed.metadata_file_path, parsed.image_max_width, parsed.verify_ssl_certificate)
if __name__== "__main__":
    main()