-
Notifications
You must be signed in to change notification settings - Fork 0
/
dc_scrap.py
104 lines (93 loc) · 4.89 KB
/
dc_scrap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# -*- coding: utf-8 -*-
import scrapy
import urlparse
import urllib2
from xml.dom.minidom import parseString
import re
# Canyon grade ("cotation") strings appear in two orders on the site:
# aquatic-first like 'a3v4III' or vertical-first like 'v4a3III'.
# Groups: aquatic digits, vertical digits, trailing roman numerals
# (the commitment grade).
cot_pat_av = re.compile(r'a(\d+)v(\d+)([IV]+)')
cot_pat_va = re.compile(r'v(\d+)a(\d+)([IV]+)')
# Roman-numeral commitment grade -> integer (site uses I..VII).
romain2arab = {'I':1,'II':2,'III':3,'IV':4,'V':5,'VI':6,'VII':7}
class BlogSpider(scrapy.Spider):
name = 'canyonspider'
start_urls = ['http://www.descente-canyon.com/canyoning/lieux/11/21/France.html',
'http://www.descente-canyon.com/canyoning/lieux/11/22/Italie.html']
def parse(self, response):
print 'parse %s'%response.url
for href in response.css('#table-departement > tbody > tr > td a::attr(href)'):
full_url = urlparse.urljoin(response.url, href.extract())
if full_url.startswith('http://www.descente-canyon.com/boutique/') or full_url.startswith('http://www.descente-canyon.com/canyoning/topoguide/'):
continue
yield scrapy.Request(full_url, callback=self.parse_province)
def parse_province(self, response):
print 'parse_province %s'%response.url
name = response.css('#content > h4').extract_first().split(' > ')[-1]
assert(name.endswith('</h4>'))
name = name[:-5]
for href in response.css('#table-1 > tbody > tr > td a::attr(href)'):
full_url = urlparse.urljoin(response.url, href.extract())
if full_url.startswith('http://www.descente-canyon.com/boutique/') or full_url.startswith('http://www.descente-canyon.com/canyoning/topoguide/') or full_url.startswith('http://www.descente-canyon.com/canyoning/canyon-new-geo/'):
continue
cid = href.extract().split('/')[3]
yield scrapy.Request(full_url, callback=self.parse_canyon, meta={'cid':cid,'province_name':name})
def parse_canyon(self, response):
print 'parse_canyon %s'%response.url
location = {}
title = response.css('h1::text').extract_first()
try:
interet = float(filter(lambda x:not(x.startswith("Attention")),response.css('.fichetechnique strong::text').extract())[0])
except:
interet = -1
valeurs = []
for valeur in response.css('table.fichetechnique td.valeur'):
for a in valeur.css('a::text'):
valeurs.append(a.extract())
for t in valeur.css('td::text'):
valeurs.append(t.extract())
if len(valeurs)!=10:
print 'error valeurs ',response.url
altdep,cotation,aller,deniv,longcorde,descente,longueur,retour,cascade,navette=valeurs
geodoc = parseString(urllib2.urlopen('http://www.descente-canyon.com/canyoning/localized-point-search?t=xml2&idc='+response.meta['cid'][1:]).read())
for marker in geodoc.getElementsByTagName('marker'):
lat = marker.getAttributeNode('lat').nodeValue
lon = marker.getAttributeNode('lng').nodeValue
label = marker.getAttributeNode('label').nodeValue
if label.startswith('parking'):
location={'lat':float(lat),'lon':float(lon),'nature':'parking'}
break
if label.startswith('depart'):
location={'lat':float(lat),'lon':float(lon),'nature':'depart'}
if label.startswith('arrivee'):
location={'lat':float(lat),'lon':float(lon),'nature':'arrivee'}
doc = {'src_url':response.url,'localisation':location,'title':title,'interet':interet,'aller':aller,'descente':descente,'ret':retour}
doc.update(response.meta)
if longueur!='??':
assert(longueur.endswith('m'))
doc['lg'] = int(longueur[:-1])
if longcorde not in ('??','...'):
assert(longcorde.endswith('m'))
doc['corde'] = int(longcorde[:-1])
if cascade not in ('??','...'):
assert(cascade.endswith('m'))
doc['cascade'] = int(cascade[:-1])
if deniv not in ('??','...'):
assert(deniv.endswith('m'))
doc['deniv'] = int(deniv[:-1])
if altdep not in ('??','...'):
assert(altdep.endswith('m'))
doc['alt'] = int(altdep[:-1])
if cotation not in ('??','...'):
if cotation.startswith('a'):
a,v,e = cot_pat_av.findall(cotation)[0]
else:
v,a,e = cot_pat_va.findall(cotation)[0]
doc['cot'] = {'a':a,'v':v,'e':romain2arab[e]}
if navette not in ('??','...') and not navette.endswith('ant'): #TODO: néant utf8
assert(navette.endswith('km'))
doc['navette'] = float(navette[:-2])
commune = response.css('a[title="Voir les canyons de cette commune"]').xpath('text()').extract_first()
if commune!=None:
doc['commune'] = commune
bassin = response.css('a[title="Voir les canyons qui alimentent ce bassin"]').xpath('text()').extract_first()
if bassin!=None:
doc['bassin'] = bassin
return doc