Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Blibli #15

Open
wants to merge 3 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions marketplace_scraper/scrapper/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,3 +228,71 @@ def get_store_follower(self):
store_follower_div = self.soup.find_all('span',{'class':'zw2E3N'})
store_follower = store_follower_div[5].text.strip() if len(store_follower_div) > 0 else None
return store_follower

class blibliUtilities:
    """Scraping helpers for Blibli search-result and product-detail pages.

    The per-item getters take the Selenium ``driver`` plus the list of
    ``product__item`` elements found on the search page and return one text
    value per item. ``get_spec`` / ``get_picture_detail`` operate on a driver
    already positioned on a product-detail page.
    """

    def __init__(self, driver):
        # driver: Selenium WebDriver positioned on a Blibli page
        self.driver = driver

    def get_linkproduct(self, driver, items):
        """Return the product URL (anchor href) for every search-result item."""
        # BUG FIX: original lacked `self` (instance calls raised TypeError) and
        # iterated the undefined name `Product_info_list` instead of `items`.
        listLinkofproduct = []
        for findlink in items:
            link = findlink.find_element_by_tag_name('a')
            listLinkofproduct.append(link.get_property('href'))
        return listLinkofproduct

    def get_nameproduct(self, driver, items):
        """Return the display name for every search-result item."""
        listNameofproduct = []
        for findname in items:
            # BUG FIX: `.//` makes the XPath relative to the item; the original
            # `//...` searched the whole document, returning the first product's
            # name for every item.
            name = findname.find_element_by_xpath('.//div[@class="product__content"]/div').text
            listNameofproduct.append(name)
        return listNameofproduct

    def get_ratingproduct(self, driver, items):
        """Return the star-rating text for every search-result item."""
        listRatingofproduct = []
        for findrating in items:
            rating = findrating.find_element_by_xpath('.//span[@class="product__body__rating__stars__rating"]').text
            listRatingofproduct.append(rating)
        # BUG FIX: original returned `listLinkofproduct`, an undefined name here.
        return listRatingofproduct

    def get_soldproduct(self, driver, items):
        """Return the sold-count text for every search-result item."""
        listSoldofproduct = []
        for findsold in items:
            sold = findsold.find_element_by_xpath('.//span[@class="product__body__rating__sold__count"]').text
            listSoldofproduct.append(sold)
        return listSoldofproduct

    def get_priceproduct(self, driver, items):
        """Return the displayed price text for every search-result item."""
        listPriceofproduct = []
        for findprice in items:
            price = findprice.find_element_by_xpath('.//strong[@class="product__body__price__display"]').text
            listPriceofproduct.append(price)
        return listPriceofproduct

    def get_locationproduct(self, driver, items):
        """Return the seller-location text for every search-result item."""
        listLocationofproduct = []
        for findlocation in items:
            location = findlocation.find_element_by_xpath('.//span[@class="product__body__location__text"]').text
            listLocationofproduct.append(location)
        return listLocationofproduct

    def get_picture_search(self, driver, items):
        """Return the thumbnail image src for every search-result item."""
        listPictureofproduct = []
        for findpic in items:
            picture = findpic.find_element_by_xpath('.//div[@class="product__itemImage product__image__grid-view"]/div/div/img').get_attribute("src")
            # BUG FIX: original appended to `listLocationofproduct` (undefined
            # here) and then returned the still-empty picture list.
            listPictureofproduct.append(picture)
        return listPictureofproduct

    @staticmethod
    def get_spec(driver):
        """Return the product-features list items (one text entry per <li>)."""
        # staticmethod: callable as blibliUtilities.get_spec(driver) or on an
        # instance, matching how the handlers invoke it.
        prod_specs = driver.find_elements_by_xpath('//div[@class="product-features"]/div/ul/li')
        spec = []
        for sp in prod_specs:
            spec.append(sp.text)
        return spec

    @staticmethod
    def get_picture_detail(driver):
        """Extract the main product image URL from the thumbnail's inline style.

        Raises IndexError if the style attribute contains no URL.
        """
        picture = driver.find_element_by_xpath('//div[@class="thumbnail-area"]')
        text = picture.get_attribute('style')
        # NOTE(review): regex kept byte-identical; its char class looks like it
        # was meant for a raw string - verify it matches all Blibli CDN URLs.
        link_regex = re.compile("((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)", re.DOTALL)
        link = re.findall(link_regex, text)
        return link[0][0]
124 changes: 122 additions & 2 deletions marketplace_scraper/scrapper/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,128 @@ def scrolling(driver):
time.sleep(1)
time.sleep(2)

def _blibli_handler(driver, **query):
pass
def _blibli_handler_searchpage(driver, **query):
    """Scrape the Blibli search-result page for the given query.

    Builds the search URL from ``query``, loads it, waits for the product
    grid to render, then collects one column of data per product attribute
    and returns them as a pandas DataFrame.
    """
    blibli_utilities = blibliUtilities(driver)  # scraping helpers bound to this driver
    link_factory = linkFactory(query)
    print('[INFO] creating url from user query ...')
    url_ref = link_factory._blibli_link_factory()
    print('[INFO] url_ref:', url_ref)
    driver.get(f'{url_ref}')
    print('[INFO] In search page ...')

    # Block until at least one product card is visible, then grab them all.
    WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, '//div[@class="product__item"]')))
    product_cards = driver.find_elements_by_class_name('product__item')

    # One helper call per column of the result table.
    links = blibli_utilities.get_linkproduct(driver, product_cards)
    print("[INFO] recorded product num: {}".format(len(links)))
    names = blibli_utilities.get_nameproduct(driver, product_cards)
    ratings = blibli_utilities.get_ratingproduct(driver, product_cards)
    sold_counts = blibli_utilities.get_soldproduct(driver, product_cards)
    prices = blibli_utilities.get_priceproduct(driver, product_cards)
    locations = blibli_utilities.get_locationproduct(driver, product_cards)
    pictures = blibli_utilities.get_picture_search(driver, product_cards)

    rows = zip(names, ratings, sold_counts, prices, locations, links, pictures)
    return pd.DataFrame(rows, columns=["Name", "Rating", "Sold", "Price", "Location", "Link", "Picture"])

def _safe_xpath_text(driver, xpath):
    """Return ``driver.find_element_by_xpath(xpath).text``, or None when the element is absent."""
    try:
        return driver.find_element_by_xpath(xpath).text
    except NoSuchElementException:
        return None


def _blibli_handler_detailed(driver, **query):
    """Visit every product from a Blibli search and scrape its detail and store pages.

    Builds the search URL from ``query``, collects all product links from the
    result page, then for each product: loads the page, scrapes product and
    seller fields (fields missing from the page become None), clicks through
    to the store page for follower/rating data, and finally returns one row
    per product as a pandas DataFrame.
    """
    DataProduct = []
    blibli_utilities = blibliUtilities(driver)  # initiate blibli utilities class
    defined_query = query
    link_factory = linkFactory(defined_query)

    print('[INFO] creating url from user query ...')
    url_ref = link_factory._blibli_link_factory()
    print('[INFO] url_ref:', url_ref)
    driver.get(f'{url_ref}')
    print('[INFO] In search page ...')

    # waiting for element to appear
    WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, '//div[@class="product__item"]')))
    Product_info_list = driver.find_elements_by_class_name('product__item')
    # saving each product
    productslink = blibli_utilities.get_linkproduct(driver, Product_info_list)
    print("[INFO] recorded product num: {}".format(len(productslink)))

    # start scraping
    for link in tqdm(productslink):
        driver.get(link)
        scrolling(driver)

        # scrape from product page; each field is None when its element is missing
        name = _safe_xpath_text(driver, '//div[@id="product-info"]/div[2]/div[1]')
        rating = _safe_xpath_text(driver, '//div[@class="review-summary__average"]/div[1]')
        review = _safe_xpath_text(driver, '//*[@id="product-review"]/div[2]/div[1]/div[1]/div[1]/div[1]/span')
        sold = _safe_xpath_text(driver, '//*[@id="product-info"]/div[2]/div[2]/span[2]')
        price = _safe_xpath_text(driver, '//*[@class="final-price"]/span')
        category = _safe_xpath_text(driver, '//*[@class="category"]/a')
        sname = _safe_xpath_text(driver, '//div[@class="seller__name"]/a')
        sresponse = _safe_xpath_text(driver, '//div[@class="seller-rating__active-response"]/div[1]/span')
        slocation = _safe_xpath_text(driver, '//div[@class="seller-info__location"]/div[2]/span')

        spec = picture = None
        try:
            # BUG FIX: get_spec / get_picture_detail live on blibliUtilities, not
            # at module level - the original bare calls raised NameError. Calling
            # through the class passes `driver` as the sole argument.
            spec = blibliUtilities.get_spec(driver)
        except NoSuchElementException:
            pass
        try:
            picture = blibliUtilities.get_picture_detail(driver)
        except NoSuchElementException:
            pass

        # scrape from shop (using clicks)
        driver.find_element_by_tag_name('body').send_keys(Keys.CONTROL + Keys.HOME)
        try:
            # BUG FIX: the pop-up is not always shown; the original unguarded
            # click aborted the whole loop with NoSuchElementException.
            driver.find_element_by_xpath('//button[@class="ticker__close b-order-right"]').click()  # close pop up
        except NoSuchElementException:
            pass
        driver.find_element_by_tag_name('body').send_keys(Keys.CONTROL + Keys.HOME)  # twice for load more element
        # NOTE(review): this click still assumes the seller link exists; a product
        # without it aborts the loop - confirm whether that can happen on Blibli.
        driver.find_element_by_xpath('//div[@class="seller__name"]/a').click()
        WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, '//p[@class="followers-count"]')))
        followers = _safe_xpath_text(driver, '//p[@class="followers-count"]')
        srating = _safe_xpath_text(driver, '//span[@class="rating-merchant-value no-rating"]/div/span')

        # one result row per product; key order fixes the DataFrame column order
        DataProduct.append({
            'Product name': name, 'Product Rating': rating, 'Total Review': review,
            'Total Sold': sold, 'Price': price, 'Product Category': category,
            'Product Specs': spec, 'Product Picture': picture, 'Store Name': sname,
            'Store Response Duration': sresponse, 'Store Location': slocation,
            'Product Link': link, 'Followers': followers, 'Store Rating': srating
        })
    return pd.DataFrame(DataProduct)

def _bukalapak_handler(driver, **query):

Expand Down