alibaba_scrapper.py
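"""Scrape Alibaba seller pages: collect chosen product attributes and the main
product image link for every item, and return the result as a pandas DataFrame."""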
import re

import bs4
import pandas as pd
import requests
from bs4 import BeautifulSoup


def init_attribute_holders(attributes_for_selection, attribute_values):
    # create an empty value list for every attribute that will be collected
    for attribute in attributes_for_selection:
        attribute_values[attribute] = []


def update_attributes(soup_element, attributes_for_selection, attribute_values):
    # each detail row stores the attribute name in a <span> 'title' attribute
    # and the attribute value in a <div> 'title' attribute
    soup_attribute_name = soup_element.find('span')['title']
    if soup_attribute_name in attributes_for_selection:
        soup_value = soup_element.find('div')['title']
        attribute_values[soup_attribute_name].append(soup_value)
    return soup_attribute_name


def fill_in_not_updated_attributes(not_updated_attributes, attribute_values):
    # pad attributes that were missing on this page with None so that
    # all value lists keep the same length
    for attribute in not_updated_attributes:
        attribute_values[attribute].append(None)


def scrap_item(link, attributes_for_selection, attribute_values):
    page = requests.get(link)
    # check the page status; on success it should be 200
    if page.status_code != 200:
        return
    soup = BeautifulSoup(page.content, 'html.parser')
    # this selector gets the tag where the main image is located
    main_image_element = soup.select_one('img#J-dcv-image-trigger')
    # check that the image element exists (some pages have a video instead)
    if main_image_element is None:
        return
    attribute_values["image_link"].append(main_image_element["data-src"])
    # the following element contains the list of product details
    details = soup.select_one('div.do-entry-list')
    not_updated_attributes = set(attributes_for_selection)
    if details is not None:
        for detail in details:
            # the details list contains bare strings too; this condition skips them
            if isinstance(detail, bs4.element.Tag):
                updated_attribute = update_attributes(detail, attributes_for_selection, attribute_values)
                if updated_attribute in not_updated_attributes:
                    not_updated_attributes.remove(updated_attribute)
    # values that are absent on this page are appended as None so that all
    # value lists keep the same length for the postprocessing step
    fill_in_not_updated_attributes(not_updated_attributes, attribute_values)


def get_next_page_link(soup, current_page_number, seller_main_page):
    pagination_list = soup.select_one('div.next-pagination-list')
    # sellers with a single page of items have no pagination block
    if pagination_list is None:
        return None
    displayed_pages = pagination_list.select('a.next-pagination-item')
    for page in displayed_pages:
        # the next page number must be higher than the current one;
        # since page numbers are shown in ascending order, this check selects exactly the next page
        if int(page.text) > current_page_number:
            return seller_main_page + page["href"]
    # no higher page number was found, so the current page is the last one
    return None


def scrap_items(soup, attributes_for_selection, attribute_values, seller_main_page):
    product_list = soup.select_one('div.component-product-list')
    items = product_list.select('div.product-info')
    item_links = []
    for item in items:
        item_links.append(seller_main_page + item.select_one('a.title-link')['href'])
    for item_link in item_links:
        scrap_item(item_link, attributes_for_selection, attribute_values)


def scrap_items_page(item_list_link, seller_main_page, current_page_number, attributes_for_selection, attribute_values):
    page = requests.get(item_list_link)
    # check the page status; on success it should be 200
    if page.status_code != 200:
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    scrap_items(soup, attributes_for_selection, attribute_values, seller_main_page)
    next_page_link = get_next_page_link(soup, current_page_number, seller_main_page)
    return next_page_link


def scrap_init_seller_page(init_link, attributes_for_selection, attribute_values):
    page = requests.get(init_link)
    # extract the scheme and host from the seller link to resolve relative item links
    seller_main_page = re.search(r"https://[\w.]+", init_link).group(0)
    # check the page status; on success it should be 200
    if page.status_code != 200:
        return
    # the initial page is the first page with items
    next_seller_page = init_link
    # page numbering starts at 1
    current_page_number = 1
    while next_seller_page is not None:
        next_seller_page = scrap_items_page(next_seller_page, seller_main_page, current_page_number,
                                            attributes_for_selection, attribute_values)
        current_page_number += 1


def scrap_sellers(seller_links, tags_for_selection):
    # the image link is stored under its own key so that it is not scraped as a text attribute
    all_attributes = tags_for_selection + ["image_link"]
    attribute_values = {}
    init_attribute_holders(all_attributes, attribute_values)
    # tags_for_selection doesn't include the image; the image is processed separately
    for link in seller_links:
        scrap_init_seller_page(link, tags_for_selection, attribute_values)
    return pd.DataFrame.from_dict(attribute_values)
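

# Minimal usage sketch. The seller URL and attribute names below are hypothetical
# placeholders, not taken from a real store; actual attribute names depend on the
# product category shown in the detail list.
if __name__ == "__main__":
    seller_links = [
        "https://example-seller.trustpass.alibaba.com/productlist.html",  # hypothetical URL
    ]
    tags_for_selection = ["Brand Name", "Model Number", "Place of Origin"]  # assumed examples
    products = scrap_sellers(seller_links, tags_for_selection)
    products.to_csv("alibaba_products.csv", index=False)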