# scrap_a_category.py
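"""Scrape every book of one category on books.toscrape.com and export the
collected data (title, prices, stock, rating, etc.) to a dated CSV file."""
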
import re
import csv
from datetime import date
from tools import tools
url = "https://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html"
books = []

def extract_book_data(soup, url):
    """
    Extract all the product data from one parsed book page.

    :param soup: BeautifulSoup object of the book's product page
    :param url: URL of that product page
    """
    print("Scraping a book")
    # The product information table maps labels (th) to values (td).
    items = {}
    trs = soup.find_all('tr')
    for tr in trs:
        th = tr.find('th')
        item_name = th.string
        td = tr.find('td')
        item_value = td.string
        items[item_name] = item_value
    product_page_url = url
    universal_product_code = items['UPC']
    title = soup.find('h1').string
    price_including_tax = items['Price (incl. tax)']
    price_excluding_tax = items['Price (excl. tax)']
    number_available = tools.transform_stock(items['Availability'])
    # The description, when present, is the <p> that follows the
    # "product_description" heading div.
    if soup.find('div', id="product_description"):
        product_description_div = soup.find('div', id="product_description")
        product_description = product_description_div.find_next_sibling('p').string
    else:
        product_description = ""
    # The category is the last link of the breadcrumb trail.
    category_list = soup.find('ul', class_='breadcrumb')
    category = category_list.find_all('a')[-1].string
    # The rating is encoded as the last CSS class of the star-rating tag,
    # e.g. "star-rating Three" -> "Three".
    rate = soup.find('p', class_=re.compile(r'star-rating'))
    rate_classes = rate.get('class')
    review_rating = rate_classes[-1]
    image_url = soup.find('img').get('src')
    book_data_dict = {
        'product_page_url': product_page_url,
        'universal_product_code': universal_product_code,
        'title': title,
        'price_including_tax': price_including_tax,
        'price_excluding_tax': price_excluding_tax,
        'number_available': number_available,
        'product_description': product_description,
        'category': category,
        'review_rating': review_rating,
        'image_url': image_url
    }
    print("Finished scraping " + book_data_dict['title'] + ".")
    books.append(book_data_dict)

books_links = []

def scrap_links_in_page(url_link):
    """Collect the product page link of every book listed on one catalogue page."""
    soup = tools.scrap(url_link)
    articles = soup.find_all('article', class_='product_pod')
    for article in articles:
        title_link = article.find('h3')
        link = title_link.find('a').get('href')
        # Listing links are relative; rebuild an absolute URL from the page URL.
        complete_link = tools.transform_url(url_link, link)
        books_links.append(complete_link)

scrap_links_in_page(url)
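
# Walk the category's pagination: as long as tools.detect_pages() finds a
# following page, swap the current page name (e.g. "index.html") for the one
# behind the "next" button and collect the book links it lists.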
new_extend_link = 'index.html'
new_link = url
while tools.detect_pages(new_link):
    soup = tools.scrap(new_link)
    print("Looking for the next link")
    li = soup.find('li', class_="next")
    print("Link found")
    a = li.find('a').get('href')
    print("Link: " + str(a))
    print("Extension: " + new_extend_link)
    new_link = new_link.replace(new_extend_link, a)
    new_extend_link = a
    print("New link extension: " + new_extend_link)
    print("New link: " + new_link)
    scrap_links_in_page(new_link)
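
# Scrape each collected book page and extract its data.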
for book in books_links:
    extract_book_data(tools.scrap(book), book)

# CSV export: one row per book, named "<Category>_<YYYY-MM-DD>_data.csv".
print("Generating a CSV file")
header = books[0].keys()
today = str(date.today())
with open(books[0]['category'].replace(" ", "_") + "_" + today + "_data.csv", "w", newline='', encoding='utf-8-sig') as file_csv:
    writer = csv.writer(file_csv, delimiter=",")
    writer.writerow(header)
    for book in books:
        line = book.values()
        writer.writerow(line)
print("Your CSV file is ready")