-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathweb_scraping.py
153 lines (123 loc) · 5.17 KB
/
web_scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import os
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.microsoft import EdgeChromiumDriverManager
# import chromedriver_autoinstaller
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.edge.options import Options as EdgeOptions
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.edge.service import Service as EdgeService
# from selenium.webdriver.common.by import By
from utils import write_to_file
# load environment variables
from dotenv import load_dotenv
# Load variables from a .env file into the process environment; this must run
# before the os.getenv() calls below so that they can see those values.
load_dotenv()
# Operating-system identifier. get_driver_exact_location() recognises the
# values 'Windows_NT', 'Linux' and 'MacOS'.
OS_NAME = os.getenv('OS')
# TODO: what happens if .env variable is missing?
# NOTE: os.getenv() returns None for an unset variable, so OS_NAME and CHIPSET
# may both be None; the code below only compares them against string literals.
# Mac chipset identifier; only the value 'M1' is treated specially.
CHIPSET = os.getenv('CHIPSET')
# print(CHIPSET)
'''
The Selenium package is used to automate web browser interaction with python.
We will use it to open and scroll the webpage in an automated way.
Followed this article:
https://ankitmodi.github.io/intro-to-web-scraping-using-python-on-goodreads/
Make sure the correct chrome driver is installed https://chromedriver.chromium.org/
'''
# Browser window dimensions as "width,height", passed through --window-size.
WINDOW_SIZE = "1200,1200"
def get_driver_exact_location(browser_name=None):
    """Return the path of the bundled web-driver binary for this machine.

    The lookup is keyed on the module-level OS_NAME and CHIPSET values:

    * OS_NAME empty or unset: the CHROME_DRIVER_PATH environment variable
      is returned as-is (possibly None).
    * 'Windows_NT': the Edge driver when browser_name is 'edge', otherwise
      the Chrome driver.
    * 'Linux': always the Chrome driver.
    * 'MacOS': the M1 Edge driver whenever CHIPSET is 'M1' (whatever
      browser_name is); otherwise Edge or Chrome according to browser_name.
    * Any other OS_NAME: a message is printed and None is returned.

    Resolved paths live under the 'web_drivers' directory next to this file.
    """
    if not OS_NAME:
        # no OS name found
        return os.getenv('CHROME_DRIVER_PATH')

    wants_edge = browser_name == 'edge'

    if OS_NAME == 'Windows_NT':
        relative = (
            'edge_drivers/edgedriver_win64/msedgedriver.exe'
            if wants_edge
            else 'chrome_drivers/chromedriver_win32/chromedriver.exe'
        )
    elif OS_NAME == 'Linux':
        relative = 'chrome_drivers/chromedriver_linux64/chromedriver'
    elif OS_NAME == 'MacOS':
        if CHIPSET == 'M1':
            relative = 'edge_drivers/edgedriver_mac64_m1/msedgedriver'
        elif wants_edge:
            relative = 'edge_drivers/edgedriver_mac64/msedgedriver'
        else:
            relative = 'chrome_drivers/chromedriver_mac64/chromedriver'
    else:
        print(f'OS {OS_NAME} not supported')
        return None

    module_dir = os.path.dirname(__file__)
    return os.path.join(module_dir, f'web_drivers/{relative}')
def build_driver(link:str=None, browser:str='', headless:bool=True):
    """Create a Selenium WebDriver and optionally navigate to ``link``.

    Browser selection:

    * ``browser == 'edge'`` or OS_NAME is ``'Windows_NT'``: Edge, with the
      driver binary obtained through EdgeChromiumDriverManager.
    * ``browser == 'chrome'``: Chrome, using the bundled driver binary located
      by get_driver_exact_location(); if no path is resolved the driver is
      obtained through ChromeDriverManager instead.
    * anything else: Chrome with a fixed set of flags, driver obtained through
      ChromeDriverManager.

    Args:
        link: URL to open once the driver is running; skipped when falsy.
        browser: 'edge', 'chrome', or empty for the default Chrome setup.
        headless: run without a visible window. Honoured by the 'edge' and
            'chrome' branches only.

    Returns:
        The started WebDriver instance.

    Raises:
        Exception: whatever driver creation raised, re-raised after a
            diagnostic message has been printed.
    """
    try:
        if browser == 'edge' or OS_NAME == 'Windows_NT':
            # docs: https://learn.microsoft.com/en-us/microsoft-edge/webdriver-chromium/?tabs=c-sharp
            edge_options = EdgeOptions()
            edge_args = [
                "inprivate",
                "--window-size=%s" % WINDOW_SIZE,
                "--log-level=3",
            ]
            if headless:
                edge_args.append("--headless")
            for arg in edge_args:
                edge_options.add_argument(arg)
            driver = webdriver.Edge(
                service=EdgeService(EdgeChromiumDriverManager().install()),
                options=edge_options,
            )
        elif browser == 'chrome':
            # Only this branch uses the bundled binary, so look it up here.
            driver_path = get_driver_exact_location(browser)
            options = webdriver.ChromeOptions()
            options.add_argument('--ignore-certificate-errors')
            options.add_argument('--ignore-ssl-errors')
            if headless:
                options.add_argument("--headless")
            options.add_argument("--window-size=%s" % WINDOW_SIZE)
            options.add_argument('--log-level=1')
            # Selenium 4 takes the driver location through a Service object;
            # the old executable_path= / chrome_options= keywords are gone.
            if driver_path:
                service = Service(executable_path=driver_path)
            else:
                service = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=service, options=options)
        else:
            # NOTE(review): this branch always runs headless and ignores the
            # `headless` argument; kept as-is in case callers rely on it.
            chrome_options = Options()
            chrome_args = [
                "--headless",
                "--disable-gpu",
                "--window-size=%s" % WINDOW_SIZE,
                "--ignore-certificate-errors",
                "--disable-extensions",
                "--no-sandbox",
                "--disable-dev-shm-usage",
            ]
            for arg in chrome_args:
                chrome_options.add_argument(arg)
            driver = webdriver.Chrome(
                service=Service(ChromeDriverManager().install()),
                options=chrome_options,
            )
    except Exception as e:
        print(f'{browser} driver not found', e)
        # Bare raise keeps the original traceback intact.
        raise

    if link:
        driver.get(link)
    return driver
def get_html_using_selenium(url=None, driver=None, infinite_scroll=False,
                            scroll_pause=3, max_scrolls=None):
    """Return the page source of ``url`` as rendered by a Selenium driver.

    Args:
        url: page to open; when falsy the driver's current page is used.
        driver: an existing WebDriver; when None one is created with
            build_driver().
        infinite_scroll: when true, keep scrolling to the bottom of the page
            until its height stops growing, so lazily loaded content is
            included in the returned source.
        scroll_pause: seconds to wait after each scroll for new content.
        max_scrolls: upper bound on follow-up scrolls; None means no limit.

    Returns:
        The HTML source of the page after any scrolling.

    The driver is always quit before returning, including when an error is
    raised and including when the driver was supplied by the caller.
    """
    if driver is None:
        driver = build_driver()

    # Scrolls to the bottom and reports the resulting document height.
    scroll_script = (
        "window.scrollTo(0, document.body.scrollHeight);"
        "var lenOfPage=document.body.scrollHeight;return lenOfPage;"
    )

    try:
        if url:
            print(f'Opening url: {url}')
            driver.get(url)

        if infinite_scroll:
            page_height = driver.execute_script(scroll_script)
            scrolls = 0
            while max_scrolls is None or scrolls < max_scrolls:
                previous_height = page_height
                time.sleep(scroll_pause)
                page_height = driver.execute_script(scroll_script)
                scrolls += 1
                if page_height == previous_height:
                    # Height unchanged: nothing more was loaded.
                    break

        # Page is fully scrolled now. Next step is to extract the source code from it.
        return driver.page_source
    finally:
        # Release the browser even if navigation or scripting failed.
        driver.quit()