Skip to content

Commit

Permalink
upgraded to python3 mostly using 2to3 tool
Browse files Browse the repository at this point in the history
  • Loading branch information
gcerretani committed Oct 12, 2019
1 parent f6b27ea commit eaf0795
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 11 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,6 @@ In the website, navigate to the archive you want to download. For example, for t

Then, copy the link to the first page, and call the script with that link as an argument:

./antenati.py http://dl.antenati.san.beniculturali.it/v/Archivio+di+Stato+di+Firenze/Stato+civile+della+restaurazione/Montalcino+provincia+di+Siena/Nati/1832/179/005178080_00303.jpg.html
./antenati.py http://dl.antenati.san.beniculturali.it/v/Archivio+di+Stato+di+Firenze/Stato+civile+della+restaurazione+1816-1860/Montalcino+provincia+di+Siena/Nati/1832/179/005178080_00303.jpg.html

The results will be placed in a folder named *Montalcino_provincia_di_Siena_Nati_1832*.
27 changes: 17 additions & 10 deletions antenati.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
#!/usr/bin/env python
#!/usr/bin/env python3
"""
antenati.py: a tool to download data from the Portale Antenati
"""

__author__ = "Giovanni Cerretani"
__copyright__ = "Copyright (c) 2018, MIT License"

import urllib3
import HTMLParser
import html.parser
import sys
import os
import re
Expand All @@ -16,17 +23,17 @@ def __init__ (self, pool, url, filename):
self.filename = filename
self.start()
def run(self):
print('Downloading ' + self.filename)
print('Downloading ', self.filename)
r = self.pool.request_encode_url('GET', self.url)
f = open(self.filename, 'wb')
f.write(r.data)
f.close()
print('Done ' + self.filename)
print('Done ', self.filename)


class ImageHTMLParser(HTMLParser.HTMLParser):
class ImageHTMLParser(html.parser.HTMLParser):
def __init__(self, pool):
HTMLParser.HTMLParser.__init__(self)
html.parser.HTMLParser.__init__(self)
self.pool = pool
self.filename = None
self.threads = []
Expand All @@ -42,9 +49,9 @@ def handle_starttag(self, tag, attrs):
self.threads.append(t)


class UrlHTMLParser(HTMLParser.HTMLParser):
class UrlHTMLParser(html.parser.HTMLParser):
def __init__(self):
HTMLParser.HTMLParser.__init__(self)
html.parser.HTMLParser.__init__(self)
self.next = None
def set_next(self, next):
self.next = next
Expand Down Expand Up @@ -84,14 +91,14 @@ def main():

while not stop:
stop = True
r = connection_pool.request_encode_url('GET', url_parser.get_next())
r = connection_pool.request('GET', url_parser.get_next())
splitting = re.split('[_/?.]', url_parser.get_next())
html_element = splitting.index('html')
file_name_elements = splitting[html_element - 3 : html_element - 1]
local_filename = '_'.join(file_name_elements)
img_parser.set_filename(local_filename)

for line in r.data.split('\n'):
for line in r.data.decode('utf-8').split('\n'):
if 'zoomAntenati1' in line:
img_parser.feed(line)
if 'successivo' in line:
Expand Down

0 comments on commit eaf0795

Please sign in to comment.