-
Notifications
You must be signed in to change notification settings - Fork 1
/
bf.py
56 lines (46 loc) · 1.68 KB
/
bf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from feeder.spiders import FeederSpider
from urllib.parse import urlencode
from scrapy import Request
from scrapy.utils.response import open_in_browser, body_or_str
from re import search, sub
from feeder.items import Source
import arrow
from re import sub
from json import loads, load
from pprint import pprint
from pdb import set_trace
class BF(FeederSpider):
    """Spider that harvests PDF documents from the digital library of the
    Biotechnical Faculty, University of Ljubljana (bf.uni-lj.si)."""

    name = "bf"
    base_url = 'http://www.digitalna-knjiznica.bf.uni-lj.si'
    # BUG FIX: Scrapy's allowed_domains does not support wildcards — the
    # original '*.bf.uni-lj.si' never matches, so OffsiteMiddleware would
    # drop every request.  A bare domain matches itself and all subdomains.
    allowed_domains = ['bf.uni-lj.si']
    mode = 'refresh'

    @property
    def categories(self):
        """Category slugs to crawl.

        `self.over_categories` (presumably set by FeederSpider / CLI args —
        TODO confirm) overrides the built-in default list when truthy.
        """
        return self.over_categories if self.over_categories else [
            'biologija', 'gozdarstvo', 'agronomija', 'zootehnika',
            'krajinska-arhitektura', 'lesarstvo', 'mikrobiologija',
            'zivilstvo', 'biotehnologija'
        ]

    def start_requests(self):
        """Yield one Request per category index page (<base_url>/<cat>.htm)."""
        return (Request("%s/%s.htm" % (self.base_url, category))
                for category in self.categories)

    def parse(self, response):
        """Default callback — delegate to the index-page parser."""
        return self.parse_index(response)

    def parse_index(self, response):
        """Extract all PDF links from an index page and emit a Source item
        for each URL that has not been scraped before (deduplicated via
        `exists_by`)."""
        urls = {self.prepare_url(url) for url in response
                .css("a[href$=\".pdf\"]")
                .xpath("@href")
                .extract()}
        return [Source(
            domain='bf.uni-lj.si',
            scraped_at=arrow.utcnow(),
            scraped_url=url,
            file_urls=[url]
        ) for url in urls if not self.exists_by({'scraped_url': url})]

    def prepare_url(self, baseUrl):
        """Return an absolute URL for *baseUrl*.

        Relative links are resolved against `base_url`; absolute links are
        returned unchanged (stripped of surrounding whitespace).
        """
        url = baseUrl.strip()
        # BUG FIX: the original tested only 'http://', so an absolute
        # 'https://...' link would be wrongly prefixed with base_url.
        if not url.startswith(('http://', 'https://')):
            if url.startswith('/'):
                url = self.base_url + url
            else:
                url = self.base_url + '/' + url
        return url