-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
executable file
·109 lines (92 loc) · 3.81 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# This file is part of liberit.
# liberit is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# liberit is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with liberit. If not, see <http://www.gnu.org/licenses/>.
# (C) 2012 Stefan Marsiske <[email protected]>
import re, sys
#import lxml.html
from urlparse import urljoin
base="http://treaties.un.org/Pages/ParticipationStatus.aspx"
sigre=re.compile(r'Signature',re.I)
datere=re.compile(r'(Ratification|Accession|Succession|Acceptance)')
countryre=re.compile(r'([^0-9]*)')
from liberit.utils import fetch, jdump, unws, getFrag
def toText(node):
if node is None: return ''
return ''.join([x.strip() for x in node.xpath(".//text()") if x.strip()]).replace(u"\u00A0",' ').strip()
def convertRow(cells, fields):
res={}
for name, i in fields:
tmp=toText(cells[i])
if name=='Country':
tmp=countryre.search(tmp).group(1)
if tmp:
if type(tmp)==type(tuple()):
res['url']=tmp[1]
res[name]=unws(tmp[0])
else:
res[name]=unws(tmp)
return res
def toObj(header):
res=[]
#print ' | '.join([''.join(x.xpath('.//text()')) for x in header.xpath('.//td')])
fields={ 'Country': 0}
for i, field in list(enumerate([''.join(x.xpath('.//text()')) for x in header.xpath('.//td')]))[1:]:
if sigre.search(field):
fields['Signature']=i
elif datere.search(field):
fields['Ratification/Accession/Succession/Acceptance']=i
rows=header.xpath('./following-sibling::tr')
for row in rows:
items=row.xpath('td')
value=convertRow(items, fields.items())
if value:
res.append(value)
return res
def scrape():
for chap in getFrag(base, '//table[@id="ctl00_ContentPlaceHolder1_dgChapterList"]//tr'):
chapter=''.join(chap.xpath('.//span//text()'))
url=urljoin(base,chap.xpath('.//a')[0].get('href'))
print >>sys.stderr, "chapter", chapter, url
for trty in getFrag(url, '//table[@id="ctl00_ContentPlaceHolder1_dgSubChapterList"]//tr'):
treaty=trty.xpath('.//a/text()')[0].split(' ')[0]
tmp=treaty.split(u"\u00A0",1)
treaty=tmp[0].replace(u"\u00A0",' ').strip()
if len(tmp)>1:
city=tmp[1].replace(u"\u00A0",' ').strip()
else:
city=None
url=urljoin(base,trty.xpath('.//a')[0].get('href'))
print >>sys.stderr, 'treaty', treaty
print >>sys.stderr, url
i=0
header=getFrag(url,'//table//tr[@class="tableHdr"]//*[starts-with(.,"Participant")]/ancestor::tr[1]')
if len(header)==0:
continue
pdf=header[0].xpath('//img[@title="View PDF"]/..')[0].get('href')
if pdf:
pdf=urljoin(base,pdf)
for obj in toObj(header[0]):
obj['Chapter']=chapter
obj['Treaty']=treaty
if city:
obj['City']=city
if pdf:
obj['PDF']=pdf
#scraperwiki.sqlite.save(unique_keys=['Treaty', "Country"], data=obj)
yield obj
i+=1
print >>sys.stderr, i
print '['
for obj in scrape():
print jdump(obj).encode('utf8'),','
print ']'