diff --git a/README.md b/README.md index 6debe2c..c360cfd 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,6 @@ In the website, navigate to the archive you want to download. For example, for t Then, copy the link to the first page, and call the script with that link as argument: - ./antenati.py http://dl.antenati.san.beniculturali.it/v/Archivio+di+Stato+di+Firenze/Stato+civile+della+restaurazione/Montalcino+provincia+di+Siena/Nati/1832/179/005178080_00303.jpg.html + ./antenati.py http://dl.antenati.san.beniculturali.it/v/Archivio+di+Stato+di+Firenze/Stato+civile+della+restaurazione+1816-1860/Montalcino+provincia+di+Siena/Nati/1832/179/005178080_00303.jpg.html The results will be placed in a folder named *Montalcino_provincia_di_Siena_Nati_1832*. \ No newline at end of file diff --git a/antenati.py b/antenati.py index c28261b..4c199b0 100755 --- a/antenati.py +++ b/antenati.py @@ -1,6 +1,13 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 +""" +antenati.py: a tool to download data from the Portale Antenati +""" + +__author__ = "Giovanni Cerretani" +__copyright__ = "Copyright (c) 2018, MIT License" + import urllib3 -import HTMLParser +import html.parser import sys import os import re @@ -16,17 +23,17 @@ def __init__ (self, pool, url, filename): self.filename = filename self.start() def run(self): - print('Downloading ' + self.filename) + print('Downloading ', self.filename) r = self.pool.request_encode_url('GET', self.url) f = open(self.filename, 'wb') f.write(r.data) f.close() - print('Done ' + self.filename) + print('Done ', self.filename) -class ImageHTMLParser(HTMLParser.HTMLParser): +class ImageHTMLParser(html.parser.HTMLParser): def __init__(self, pool): - HTMLParser.HTMLParser.__init__(self) + html.parser.HTMLParser.__init__(self) self.pool = pool self.filename = None self.threads = [] @@ -42,9 +49,9 @@ def handle_starttag(self, tag, attrs): self.threads.append(t) -class UrlHTMLParser(HTMLParser.HTMLParser): +class UrlHTMLParser(html.parser.HTMLParser): def __init__(self): - HTMLParser.HTMLParser.__init__(self) + html.parser.HTMLParser.__init__(self) self.next = None def set_next(self, next): self.next = next @@ -84,14 +91,14 @@ def main(): while not stop: stop = True - r = connection_pool.request_encode_url('GET', url_parser.get_next()) + r = connection_pool.request('GET', url_parser.get_next()) splitting = re.split('[_/?.]', url_parser.get_next()) html_element = splitting.index('html') file_name_elements = splitting[html_element - 3 : html_element - 1] local_filename = '_'.join(file_name_elements) img_parser.set_filename(local_filename) - for line in r.data.split('\n'): + for line in r.data.decode('utf-8').split('\n'): if 'zoomAntenati1' in line: img_parser.feed(line) if 'successivo' in line: