forked from zhanghangorg/sqlinj-ant
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmyspider.py
27 lines (22 loc) · 869 Bytes
/
myspider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import scrapy
class BlogSpider(scrapy.Spider):
    """Crawl outward from youzu.com, following links and scraping list entries.

    For every downloaded page the spider schedules a request for each
    HTTP(S) link found on it and emits one item per ``<ul><li>`` element.
    """

    name = 'youzuspider'
    start_urls = ['http://www.youzu.com']

    def parse(self, response):
        """Yield follow-up requests and scraped items for one page.

        Args:
            response: The downloaded page handed to this callback by Scrapy.

        Yields:
            ``scrapy.Request`` objects (with this method as their callback)
            for every valid link on the page, followed by one dict per
            ``//ul/li`` node with the keys ``title``, ``link`` and ``desc``.
            Each value is the list of strings matched by the corresponding
            XPath, so it may be empty.
        """
        for href in response.xpath('//a/@href').getall():
            # hrefs are frequently relative; a Request needs an absolute URL.
            url = response.urljoin(href.strip())
            # Check whether the URL is valid before scheduling it.
            if self._is_valid_url(url):
                yield scrapy.Request(url, callback=self.parse)

        for site in response.xpath('//ul/li'):
            # Plain dicts are used as items because no Item class is
            # defined or imported in this module.
            yield {
                'title': site.xpath('a/text()').getall(),
                'link': site.xpath('a/@href').getall(),
                'desc': site.xpath('text()').getall(),
            }

    @staticmethod
    def _is_valid_url(url):
        """Return True when *url* is an absolute HTTP or HTTPS URL.

        Links such as ``javascript:`` or ``mailto:`` cannot be downloaded
        and would make request construction fail, so they are rejected.
        """
        return url.startswith(('http://', 'https://'))