# scrap_a_category.py
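"""Scrape every book of one category on books.toscrape.com and export the
collected data (title, prices, stock, rating, etc.) to a dated CSV file."""
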
import re
import csv
from datetime import date
from tools import tools
url = "https://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html"
books = []

def extract_book_data(soup, url):
    """
    Extract all the product data from one parsed book page.

    :param soup: BeautifulSoup object of the book's product page
    :param url: URL of that product page
    """
    print("Scraping a book")
    # The product information table maps labels (th) to values (td).
    items = {}
    trs = soup.find_all('tr')
    for tr in trs:
        th = tr.find('th')
        item_name = th.string
        td = tr.find('td')
        item_value = td.string
        items[item_name] = item_value
    product_page_url = url
    universal_product_code = items['UPC']
    title = soup.find('h1').string
    price_including_tax = items['Price (incl. tax)']
    price_excluding_tax = items['Price (excl. tax)']
    number_available = tools.transform_stock(items['Availability'])
    # The description, when present, is the <p> that follows the
    # "product_description" heading div.
    if soup.find('div', id="product_description"):
        product_description_div = soup.find('div', id="product_description")
        product_description = product_description_div.find_next_sibling('p').string
    else:
        product_description = ""
    # The category is the last link of the breadcrumb trail.
    category_list = soup.find('ul', class_='breadcrumb')
    category = category_list.find_all('a')[-1].string
    # The rating is encoded as the last CSS class of the star-rating tag,
    # e.g. "star-rating Three" -> "Three".
    rate = soup.find('p', class_=re.compile(r'star-rating'))
    rate_classes = rate.get('class')
    review_rating = rate_classes[-1]
    image_url = soup.find('img').get('src')
    book_data_dict = {
        'product_page_url': product_page_url,
        'universal_product_code': universal_product_code,
        'title': title,
        'price_including_tax': price_including_tax,
        'price_excluding_tax': price_excluding_tax,
        'number_available': number_available,
        'product_description': product_description,
        'category': category,
        'review_rating': review_rating,
        'image_url': image_url
    }
    print("Finished scraping " + book_data_dict['title'] + ".")
    books.append(book_data_dict)

books_links = []

def scrap_links_in_page(url_link):
    """Collect the product page link of every book listed on one catalogue page."""
    soup = tools.scrap(url_link)
    articles = soup.find_all('article', class_='product_pod')
    for article in articles:
        title_link = article.find('h3')
        link = title_link.find('a').get('href')
        # Listing links are relative; rebuild an absolute URL from the page URL.
        complete_link = tools.transform_url(url_link, link)
        books_links.append(complete_link)

scrap_links_in_page(url)
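
# Walk the category's pagination: as long as tools.detect_pages() finds a
# following page, swap the current page name (e.g. "index.html") for the one
# behind the "next" button and collect the book links it lists.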
new_extend_link = 'index.html'
new_link = url
while tools.detect_pages(new_link):
    soup = tools.scrap(new_link)
    print("Looking for the next link")
    li = soup.find('li', class_="next")
    print("Link found")
    a = li.find('a').get('href')
    print("Link: " + str(a))
    print("Extension: " + new_extend_link)
    new_link = new_link.replace(new_extend_link, a)
    new_extend_link = a
    print("New link extension: " + new_extend_link)
    print("New link: " + new_link)
    scrap_links_in_page(new_link)
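
# Scrape each collected book page and extract its data.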
for book in books_links:
    extract_book_data(tools.scrap(book), book)

# CSV export: one row per book, named "<Category>_<YYYY-MM-DD>_data.csv".
print("Generating a CSV file")
header = books[0].keys()
today = str(date.today())
with open(books[0]['category'].replace(" ", "_") + "_" + today + "_data.csv", "w", newline='', encoding='utf-8-sig') as file_csv:
    writer = csv.writer(file_csv, delimiter=",")
    writer.writerow(header)
    for book in books:
        line = book.values()
        writer.writerow(line)
print("Your CSV file is ready")