-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path: woodenstreet_spider.py
130 lines (113 loc) · 6.03 KB
/
woodenstreet_spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import scrapy
import csv
import json
import re
from html.parser import HTMLParser
def remove_non_ascii(text):
    """Return *text* with every non-ASCII character (ord >= 128) removed."""
    return text.encode('ascii', errors='ignore').decode('ascii')
class MLStripper(HTMLParser):
    """HTMLParser subclass that keeps only the character data it is fed.

    Feed it HTML via ``feed()``; ``get_data()`` returns the concatenated
    text with all tags removed and character references resolved.
    """

    def __init__(self):
        # Properly initialise the base parser. The original skipped
        # HTMLParser.__init__ and relied on reset() plus manual attribute
        # assignment, which is fragile across stdlib versions.
        super().__init__(convert_charrefs=True)
        self.strict = False  # kept for backward compatibility with old callers
        self.fed = []        # accumulated text fragments

    def handle_data(self, d):
        """Collect one run of character data."""
        self.fed.append(d)

    def get_data(self):
        """Return all collected text as a single string."""
        return ''.join(self.fed)
def strip_tags(html):
    """Return *html* with all markup removed, keeping only the text."""
    stripper = MLStripper()
    stripper.feed(html)
    return stripper.get_data()
class HCVID195_www_woodenstreet_com_Spider(scrapy.Spider):
    """Scrapy spider for www.woodenstreet.com furniture listings.

    Crawls six top-level furniture sections, follows every sub-category,
    paginates through the product listings, and records each product in
    two places: a CSV file (appended row by row as products are parsed)
    and an in-memory list that is dumped to JSON when the spider closes.
    """

    name = "HCVID195.www.woodenstreet.com"

    # Accumulated across all parseProduct() calls; flushed in closed().
    jsonData = []

    # Top-level sections whose sub-categories are crawled.
    _CATEGORY_PAGES = [
        'https://www.woodenstreet.com/dining-furniture',
        'https://www.woodenstreet.com/kids-furniture',
        'https://www.woodenstreet.com/living-room-furniture',
        'https://www.woodenstreet.com/storage-furniture',
        'https://www.woodenstreet.com/study-room-furniture',
        'https://www.woodenstreet.com/bedroom-furniture',
    ]

    def start_requests(self):
        """Yield one request per top-level category page."""
        for link in self._CATEGORY_PAGES:
            yield scrapy.Request(url=link, callback=self.parseCat)

    def parseCat(self, response):
        """Follow every sub-category link found on a category page."""
        for cat in response.css('div.allcategory a::attr(href)').extract():
            request = scrapy.Request(
                url='https://www.woodenstreet.com/' + cat + '?page=1',
                callback=self.parsePages)
            request.meta['page'] = 1
            yield request

    def parsePages(self, response):
        """Yield one request per product on a listing page, then the next page.

        Pagination stops naturally when a page no longer contains a
        ``div.product-list`` element.
        """
        if response.css('div.product-list').extract_first():
            for article in response.css('div.product-list article'):
                prod_link = article.css('a::attr(href)').extract_first()
                if not prod_link:
                    # Skip articles without a product link.
                    continue
                request = scrapy.Request(url=prod_link, callback=self.parseProduct)
                request.meta['prod_link'] = prod_link
                yield request
            # Request the next page of the same listing.
            page = response.meta['page'] + 1
            nextpage = response.request.url.split('?')[0] + '?page=' + str(page)
            next_request = scrapy.Request(url=nextpage, callback=self.parsePages)
            next_request.meta['page'] = page
            yield next_request

    def closed(self, reason):
        """Dump everything collected in jsonData to the JSON output file."""
        toWrite = {'data': HCVID195_www_woodenstreet_com_Spider.jsonData}
        with open('HCVID195.www.woodenstreet.com.json', 'w', encoding='utf-8') as outfile:
            json.dump(toWrite, outfile, ensure_ascii=False)

    def parseProduct(self, response):
        """Extract one product's fields and append them to the CSV and jsonData.

        Every selector result is guarded against ``None`` so that a missing
        element degrades to an empty field instead of killing the item (the
        original raised TypeError/AttributeError when e.g. the heading or
        price selector matched nothing).
        """
        productLink = response.meta['prod_link']
        productImage = response.css('li img[itemprop="image"]::attr(src)').extract_first()
        # Guard: extract_first() returns None when the heading is absent.
        productName = strip_tags(
            response.css('h1.heading.hemedium').extract_first() or '')
        # Breadcrumb entries after the first one, pipe-joined.
        productCategories = response.css('div.breadcrumbs li a::text').extract()
        productCategory = '|'.join(productCategories[1:]).strip('|')
        productDesc = response.css('article#detail').extract_first()
        # Info paragraphs after the first one, pipe-joined.
        info = response.css('div.text p').extract()
        inform = '|'.join(info[1:]).strip('|')
        if not productDesc:
            productDesc = strip_tags(inform)
        else:
            productDesc = strip_tags(productDesc) + '|' + strip_tags(inform)
        # Remove the site's newsletter boilerplate from descriptions.
        productDesc = productDesc.replace(
            'Never miss our special offers, events or promotions.', '')
        productSku = response.css('div.text p span:last-child::text').extract_first()
        # Prefer the coupon price block; fall back to the regular price.
        coupon_price = response.css('p.coupon_our_price::text').extract_first()
        if coupon_price:
            productPrice = coupon_price.replace('Our Price Rs', '').strip()
        else:
            regular_price = response.css(
                'p.retprice span#price_container::text').extract_first() or ''
            productPrice = regular_price.replace('Rs', '').strip()
        productCategory = productCategory.replace('"', '``')
        # Normalise whitespace: newlines become pipe separators, tabs and
        # carriage returns are dropped, and runs of spaces/pipes collapse.
        productDesc = re.sub(r'\n', '|', productDesc)
        productDesc = re.sub(r'\t', '', productDesc)
        productDesc = re.sub(r'\r', '', productDesc)
        productDesc = re.sub(r'[ ]+', ' ', productDesc)
        productDesc = re.sub(r'(\| )+', '|', productDesc)
        productDesc = re.sub(r'\|+', '|', productDesc)
        productDesc = productDesc.strip('|')
        productDesc = productDesc.replace('"', '``')
        productName = productName.replace('"', '``')
        productDesc = remove_non_ascii(productDesc)
        productCategory = remove_non_ascii(productCategory)
        productName = remove_non_ascii(productName)
        # NOTE(review): values are wrapped in literal double quotes on top of
        # csv's own quoting, and no header row is ever written. Kept as-is
        # for backward compatibility with existing consumers of the CSV.
        with open('HCVID195.www.woodenstreet.com.csv', 'a', newline='') as csvfile:
            fieldnames = ['Title', 'Category', 'URL', 'IMG_SRC', 'Merchant',
                          'Price', 'SKU', 'Description']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writerow({
                'Title': '"' + str(productName) + '"',
                'IMG_SRC': '"' + str(productImage) + '"',
                'Description': '"' + str(productDesc) + '"',
                'Price': '"' + str(productPrice) + '"',
                'URL': '"' + str(productLink) + '"',
                'Category': '"' + str(productCategory) + '"',
                'SKU': '"' + str(productSku) + '"',
                'Merchant': '"NA"',
            })
        HCVID195_www_woodenstreet_com_Spider.jsonData.append({
            'Title': productName,
            'Description': productDesc,
            'IMG_SRC': productImage,
            'Price': productPrice,
            'SKU': productSku,
            'Category': productCategory,
            'URL': productLink,
            'Merchant': 'NA',
        })