-
Notifications
You must be signed in to change notification settings - Fork 1
/
getreview.py
146 lines (127 loc) · 6.47 KB
/
getreview.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
from lxml import html
from json import dump,loads
from requests import get
import json
from re import sub
from dateutil import parser as dateparser
from time import sleep
import geturls as url
import Constants
def _collapse_whitespace(fragments):
    """Join text fragments and squeeze every whitespace run to one space."""
    return ' '.join(' '.join(fragments).split())


def _parse_review(review):
    """Turn one review element into a dict of cleaned display fields.

    ``review`` is an element returned by the page-level review XPath; every
    lookup below is relative to it.
    """
    XPATH_RATING = './/i[@data-hook="review-star-rating"]//text()'
    XPATH_REVIEW_HEADER = './/a[@data-hook="review-title"]//text()'
    XPATH_REVIEW_POSTED_DATE = './/span[@data-hook="review-date"]//text()'
    XPATH_REVIEW_TEXT_1 = './/div[@data-hook="review-collapsed"]//text()'
    XPATH_REVIEW_TEXT_2 = './/div//span[@data-action="columnbalancing-showfullreview"]/@data-columnbalancing-showfullreview'
    XPATH_REVIEW_COMMENTS = './/span[@data-hook="review-comment"]//text()'
    XPATH_AUTHOR = './/span[contains(@class,"profile-name")]//text()'
    XPATH_REVIEW_TEXT_3 = './/div[contains(@id,"dpReviews")]/div/text()'

    raw_review_text1 = review.xpath(XPATH_REVIEW_TEXT_1)
    raw_review_text2 = review.xpath(XPATH_REVIEW_TEXT_2)
    raw_review_text3 = review.xpath(XPATH_REVIEW_TEXT_3)

    author = _collapse_whitespace(review.xpath(XPATH_AUTHOR))
    review_rating = ''.join(review.xpath(XPATH_RATING)).replace('out of 5 stars', '')
    review_header = _collapse_whitespace(review.xpath(XPATH_REVIEW_HEADER))

    # The date is best-effort: anything dateutil cannot parse becomes None
    # instead of aborting the whole scrape.
    raw_posted_date = ''.join(review.xpath(XPATH_REVIEW_POSTED_DATE))
    try:
        review_posted_date = dateparser.parse(raw_posted_date).strftime('%d %b %Y')
    except (ValueError, OverflowError, TypeError):
        review_posted_date = None

    review_text = _collapse_whitespace(raw_review_text1)
    # Grabbing hidden comments if present: the attribute holds JSON whose
    # 'rest' entry is the remainder of the review as an HTML fragment.
    if raw_review_text2:
        hidden_payload = loads(raw_review_text2[0])
        # `sub` is the name imported from `re`; the module itself is not
        # imported, so `re.sub` would raise NameError here.
        hidden_text = sub(r'<.*?>', '', hidden_payload['rest'])
        full_review_text = review_text + hidden_text
    else:
        full_review_text = review_text
    # Fall back to the alternative layout when the collapsed body is absent.
    if not raw_review_text1:
        full_review_text = _collapse_whitespace(raw_review_text3)

    # Strip the letters so only the numeric part of the comment label remains.
    review_comments = ''.join(review.xpath(XPATH_REVIEW_COMMENTS))
    review_comments = sub('[A-Za-z]', '', review_comments).strip()

    return {
        'review_comment_count': review_comments,
        'review_text': full_review_text,
        'review_posted_date': review_posted_date,
        'review_header': review_header,
        'review_rating': review_rating,
        'review_author': author
    }


def ParseReviews(asin):
    """Fetch an Amazon India product page and scrape its rating/review data.

    ``asin`` may be a bare ASIN, in which case the product URL is built under
    ``http://www.amazon.in/dp/``, or an already complete ``http(s)`` URL,
    which is requested unchanged.

    The page is requested up to five times. Returns:

    * ``{"url": ..., "error": "page not found"}`` on HTTP 404;
    * a dict with the keys ``ratings``, ``reviews``, ``url``, ``name``,
      ``price`` and ``overall rating`` on the first HTTP 200 response;
    * ``{"error": "failed to process the page", "url": ...}`` when no attempt
      returned 200.
    """
    # Accept a full URL as well as a bare ASIN so that the address that is
    # fetched and the address that is reported are always the same.
    if asin.startswith(('http://', 'https://')):
        amazon_url = asin
    else:
        amazon_url = 'http://www.amazon.in/dp/' + asin
    # Add some recent user agent to prevent amazon from blocking the request
    # Find some chrome user agent strings here https://udger.com/resources/ua-list/browser-detail?browser=Chrome
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}

    XPATH_REVIEW_SECTION_1 = '//div[contains(@id,"reviews-summary")]'
    XPATH_REVIEW_SECTION_2 = '//div[@data-hook="review"]'
    XPATH_AGGREGATE_RATING = '//table[@id="histogramTable"]//tr'
    XPATH_PRODUCT_NAME = '//h1//span[@id="productTitle"]//text()'
    XPATH_PRODUCT_PRICE = '//span[@id="priceblock_ourprice"]/text()'
    XPATH_OVERALL = '//*[@id="reviewsMedley"]/div/div[1]/div[1]/div/div/div[2]/div/span/span/a/span/text()'

    for _ in range(5):
        # NOTE(review): verify=False turns off TLS certificate verification;
        # kept as in the original, confirm that this is intended.
        response = get(amazon_url, headers=headers, verify=False, timeout=30)
        if response.status_code == 404:
            return {"url": amazon_url, "error": "page not found"}
        if response.status_code != 200:
            continue

        # Removing the null bytes from the response.
        cleaned_response = response.text.replace('\x00', '')
        parser = html.fromstring(cleaned_response)

        # Overall score: the matched text with the " out of 5 " and "stars"
        # parts removed, or 0 when the node is missing.
        overall = parser.xpath(XPATH_OVERALL)
        try:
            overall = overall[0]
            overall = overall.replace(' out of 5 ', '')
            overall = overall.replace('stars', '')
        except IndexError:
            overall = 0

        product_price = ''.join(parser.xpath(XPATH_PRODUCT_PRICE)).replace(',', '')
        product_name = ''.join(parser.xpath(XPATH_PRODUCT_NAME)).strip()

        reviews = parser.xpath(XPATH_REVIEW_SECTION_1)
        if not reviews:
            reviews = parser.xpath(XPATH_REVIEW_SECTION_2)

        # Grabbing the rating section in product page
        ratings_dict = {}
        for rating_row in parser.xpath(XPATH_AGGREGATE_RATING):
            extracted_rating = rating_row.xpath('./td//a//text()')
            # A usable row carries at least a label and a value; shorter rows
            # are skipped instead of raising IndexError.
            if len(extracted_rating) < 2:
                continue
            rating_key = extracted_rating[0]
            rating_value = extracted_rating[1]
            if rating_key:
                ratings_dict[rating_key] = rating_value

        # Parsing individual reviews
        reviews_list = [_parse_review(review) for review in reviews]

        return {
            'ratings': ratings_dict,
            'reviews': reviews_list,
            'url': amazon_url,
            'name': product_name,
            'price': product_price,
            'overall rating': overall
        }
    return {"error": "failed to process the page", "url": amazon_url}
def ReadAsin(phoneName):
    """Scrape the first amazon.in result for ``phoneName`` and save it as JSON.

    The candidate URLs come from ``url.geturllist(phoneName, "amazon.in")``.
    When that list is empty an empty string is returned and nothing is
    written. Otherwise the first URL is passed to ``ParseReviews``, the
    result is wrapped in a one-element list, dumped to
    ``Constants.base_file + '\\amazonReviews\\data.json'`` and returned.
    """
    urllist = url.geturllist(phoneName, "amazon.in")
    print(urllist)
    if not urllist:
        print("No amazon url...........................................")
        return ""
    print("GOT AMAZON URL..............................................")
    extracted_data = [ParseReviews(urllist[0])]
    # The context manager closes the file on every exit path; the original
    # close() call was placed after `return` and could never run.
    with open(Constants.base_file+'\\amazonReviews\\data.json', 'w') as f:
        dump(extracted_data, f, indent=4)
    return extracted_data