vegetables.py

import csv
import datetime
import os
import time
import logging

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# Set up logging
log_folder = "logs/vegetables"
os.makedirs(log_folder, exist_ok=True)
log_filename = os.path.join(log_folder, f"scraping_log_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.txt")
logging.basicConfig(filename=log_filename, level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
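
# Note: as configured, log messages go to the file only. A minimal sketch of how
# console echoing could be added (optional, not part of the script's behavior):
#
#     logging.getLogger().addHandler(logging.StreamHandler())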

# Firefox and geckodriver configuration
service = Service("/usr/local/bin/geckodriver")
options = webdriver.FirefoxOptions()
options.add_argument("--headless")
driver = webdriver.Firefox(service=service, options=options)

# List to store results
data_list = []

# Function to check if there is a "next" button (i.e. a next page)
def has_next_page():
    try:
        # find_element() raises if the "next" button is absent
        driver.find_element(By.CSS_SELECTOR, "a.bucket-pagination__icon.bucket-pagination__icon--next")
        return True
    except NoSuchElementException:
        # The button does not exist, so we are on the last page
        return False
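
# An equivalent sketch without exception handling (hypothetical alternative, not
# used by the script): find_elements() returns an empty list instead of raising.
def has_next_page_alt():
    # Same CSS selector as has_next_page() above
    return bool(driver.find_elements(By.CSS_SELECTOR, "a.bucket-pagination__icon.bucket-pagination__icon--next"))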

# Function to scrape a single page; returns True when products were processed
def scrape_page(page_url):
    driver.get(page_url)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)
    try:
        # Wait for product elements to load (at least one product tile on the page)
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CLASS_NAME, "product-grid__item"))
        )
    except TimeoutException:
        logging.error(f"Timed out waiting for page {page_url} to load.")
        return False

    # Get the products on the page
    products = driver.find_elements(By.CLASS_NAME, "product-grid__item")
    if not products:
        logging.warning(f"No products found on page {page_url}. Ending scraping.")
        return False
    logging.info(f"Found {len(products)} products on page {page_url}.")

    for product in products:
        try:
            # Fallback values are Polish: "Brak nazwy" = "no name", "Brak masy" = "no weight",
            # "Brak promocji" = "no promotion", "Brak daty promocji" = "no promotion date".
            # find_element() raises NoSuchElementException rather than returning a falsy
            # value, so each lookup is wrapped in its own try/except.
            try:
                # Use XPath to locate the name element precisely
                name = product.find_element(
                    By.XPATH, './/div[contains(@class, "product-tile__name") and @itemprop="name"]'
                ).text.strip()
            except NoSuchElementException:
                name = "Brak nazwy"
            try:
                packaging_details = product.find_element(
                    By.XPATH, './/div[contains(@class, "packaging-details")]'
                ).text.strip()
            except NoSuchElementException:
                packaging_details = "Brak masy"
            logging.debug(f"Checking product: {name} - {packaging_details}")
            try:
                # Check for a promotion label
                promotion = product.find_element(By.CLASS_NAME, "price-tile__label").text.strip()
            except NoSuchElementException:
                promotion = "Brak promocji"
            try:
                # Check for a promotion date
                promo_dates = product.find_element(By.CLASS_NAME, "product-details__info-item").text.strip()
            except NoSuchElementException:
                promo_dates = "Brak daty promocji"
            try:
                # Get the product URL
                product_url = product.find_element(By.CLASS_NAME, "js-product-link").get_attribute("href").strip()
            except NoSuchElementException:
                product_url = "Brak URL"
            # Add data to the results list (Polish column names: Nazwa = name,
            # Masa = weight, Promocja = promotion, Data promocji = promotion date,
            # Data pobrania = scrape date)
            data_list.append({
                "Nazwa": name,
                "Masa": packaging_details,
                "Promocja": promotion,
                "Data promocji": promo_dates,
                "Data pobrania": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "URL": product_url
            })
        except Exception as e:
            logging.error(f"Problem retrieving product data: {e}")
    return True
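
# The repeated try/except lookups above could be factored into a small helper;
# a minimal sketch (hypothetical, not wired into scrape_page):
def text_or_default(element, by, value, default):
    # Return the stripped text of a child element, or `default` if it is missing
    try:
        return element.find_element(by, value).text.strip()
    except NoSuchElementException:
        return default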

# Starting URL
base_url = "https://zakupy.biedronka.pl/warzywa"

# Start scraping from page 1
page_number = 1
while True:
    logging.info(f"Scraping page {page_number}...")
    page_url = f"{base_url}/?page={page_number}"
    # Scrape the page
    if not scrape_page(page_url):
        break
    # Check if there is a "next" button for the next page
    if not has_next_page():
        logging.info("Last page reached.")
        break
    # Increase the page number
    page_number += 1

# Close the browser
driver.quit()
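
# If scrape_page() raises unexpectedly, driver.quit() above never runs. A sketch
# of a safer shape (behavior otherwise unchanged):
#
#     try:
#         while True:
#             ...  # paginated scraping as above
#     finally:
#         driver.quit()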

# Create folder for results
output_folder = "results/vegetables"
os.makedirs(output_folder, exist_ok=True)

# Create a unique, timestamped filename for the CSV file
current_timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
csv_filename = os.path.join(output_folder, f"vegetables_{current_timestamp}.csv")

# Save to CSV file
with open(csv_filename, mode="w", encoding="utf-8", newline="") as file:
    writer = csv.DictWriter(file, fieldnames=["Nazwa", "Masa", "Promocja", "Data promocji", "Data pobrania", "URL"])
    writer.writeheader()
    writer.writerows(data_list)

logging.info(f"Results saved in file: {csv_filename}")
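
# A quick way to inspect the saved results (sketch, standard library only):
#
#     with open(csv_filename, encoding="utf-8", newline="") as f:
#         for row in csv.DictReader(f):
#             print(row["Nazwa"], row["Promocja"])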