Skip to content
This repository has been archived by the owner on Jan 25, 2024. It is now read-only.

Commit

Permalink
♻️ refact(driver): Remove conflicting driver quit method from scraper (
Browse files Browse the repository at this point in the history
  • Loading branch information
vvatelot authored Sep 11, 2023
1 parent ae54b4f commit 1e3cd50
Show file tree
Hide file tree
Showing 5 changed files with 148 additions and 123 deletions.
14 changes: 7 additions & 7 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
# Build image
FROM python:3.11-slim AS requirements-stage

ARG CHROME_VERSION_MAIN=107
ARG CHROME_VERSION_MAIN=108
ENV CHROME_VERSION_MAIN=${CHROME_VERSION_MAIN}
ENV CHROMEDRIVER_PATH=/usr/bin/chromedriver

WORKDIR /tmp

Expand All @@ -30,6 +29,7 @@ ENV CHROMEDRIVER_PATH=/usr/bin/chromedriver

WORKDIR /code
ENV PYTHONPATH "/code"
ENV CHROME_EXECUTABLE_PATH "/opt/chrome/chrome"

RUN apt update && apt install -y ca-certificates fonts-liberation \
libappindicator3-1 libasound2 libatk-bridge2.0-0 \
Expand All @@ -43,13 +43,13 @@ RUN apt update && apt install -y ca-certificates fonts-liberation \

# Copy requirements.txt, chromedriver, chrome from requirements-stage
COPY --from=requirements-stage /tmp/ /tmp/
COPY --from=requirements-stage /tmp/chromedriver /usr/bin/chromedriver
COPY --from=requirements-stage /tmp/chromedriver ${CHROMEDRIVER_PATH}
COPY --from=requirements-stage /tmp/chrome /opt/chrome


# Install google chrome and make chromedriver executable
RUN chmod +x /usr/bin/chromedriver
RUN chmod +x ${CHROMEDRIVER_PATH}

COPY ./ /code/
# Clean up
RUN rm -rf /tmp/dist /var/lib/{apt,dpkg,cache,log}/

RUN pip install -r /tmp/requirements.txt
COPY ./ /code/
151 changes: 83 additions & 68 deletions ecoindex_scraper/scrap.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import asyncio
from datetime import datetime
from json import loads
from os import chmod, remove
from shutil import copyfile
from time import sleep
Expand All @@ -12,6 +12,7 @@
from genericpath import exists
from pydantic.networks import HttpUrl
from selenium.common.exceptions import JavascriptException, NoSuchElementException
from selenium.webdriver import DesiredCapabilities

from ecoindex_scraper.utils import convert_screenshot_to_webp, set_screenshot_rights

Expand Down Expand Up @@ -43,12 +44,17 @@ def __init__(
self.page_load_timeout = page_load_timeout

self.chrome_options = uc.ChromeOptions()
self.chrome_options.headless = True
self.chrome_options.add_argument(f"--window-size={self.window_size}")
self.chrome_options.add_argument("--no-sandbox")
self.chrome_options.add_argument("--disable-dev-shm-usage")
self.chrome_options.add_argument("--ignore-certificate-errors")
self.chrome_options.add_argument("--headless=new")

self.capbs = DesiredCapabilities.CHROME.copy()

self.capbs["goog:loggingPrefs"] = {"performance": "ALL"} # type: ignore

self.all_requests = {}
self.page_response = False

Expand All @@ -63,86 +69,53 @@ def __del__(self):
if self.driver_executable_path and exists(self.driver_executable_path):
remove(self.driver_executable_path)

if hasattr(self, "driver"):
self.driver.quit()

def _handle_network_response_received(self, eventdata):
if eventdata["params"]["response"]["url"].startswith("http"):
self.all_requests[eventdata["params"]["requestId"]] = {
"url": eventdata["params"]["response"]["url"],
"size": 0,
"type": eventdata["params"]["type"],
}

if not self.page_response:
self.page_response = True
asyncio.run(self.check_page_response(eventdata["params"]["response"]))

def _handle_network_data_received(self, eventdata):
if eventdata["params"]["requestId"] in self.all_requests:
self.all_requests[eventdata["params"]["requestId"]]["size"] += eventdata[
"params"
]["encodedDataLength"]

def _handle_network_loading_finished(self, eventdata):
if eventdata["params"]["requestId"] in self.all_requests:
self.all_requests[eventdata["params"]["requestId"]]["size"] = eventdata[
"params"
]["encodedDataLength"]

def init_chromedriver(self):
self.driver = uc.Chrome(
options=self.chrome_options,
version_main=self.chrome_version_main,
driver_executable_path=self.driver_executable_path,
browser_executable_path=self.chrome_executable_path,
enable_cdp_events=True,
)

self.driver.add_cdp_listener(
"Network.dataReceived", self._handle_network_data_received
)
self.driver.add_cdp_listener(
"Network.responseReceived", self._handle_network_response_received
)
self.driver.add_cdp_listener(
"Network.loadingFinished", self._handle_network_loading_finished
)
try:
self.driver = uc.Chrome(
options=self.chrome_options,
desired_capabilities=self.capbs,
version_main=self.chrome_version_main,
driver_executable_path=self.driver_executable_path,
browser_executable_path=self.chrome_executable_path,
)

if self.page_load_timeout is not None:
self.driver.set_page_load_timeout(float(self.page_load_timeout))
if self.page_load_timeout is not None:
self.driver.set_page_load_timeout(float(self.page_load_timeout))

return self
return self
except Exception as e:
self.__del__()
raise e

async def get_page_analysis(
self,
) -> Result:
try:
page_metrics, page_type = await self.scrap_page()
ecoindex = await get_ecoindex(
dom=page_metrics.nodes,
size=page_metrics.size,
requests=page_metrics.requests,
)

return Result(
score=ecoindex.score,
ges=ecoindex.ges,
water=ecoindex.water,
grade=ecoindex.grade,
url=self.url,
date=datetime.now(),
width=self.window_size.width,
height=self.window_size.height,
nodes=page_metrics.nodes,
size=page_metrics.size,
requests=page_metrics.requests,
page_type=page_type,
)
except Exception as e:
self.__del__()
raise e

ecoindex = await get_ecoindex(
dom=page_metrics.nodes,
size=page_metrics.size,
requests=page_metrics.requests,
)
return Result(
score=ecoindex.score,
ges=ecoindex.ges,
water=ecoindex.water,
grade=ecoindex.grade,
url=self.url,
date=datetime.now(),
width=self.window_size.width,
height=self.window_size.height,
nodes=page_metrics.nodes,
size=page_metrics.size,
requests=page_metrics.requests,
page_type=page_type,
)

async def scrap_page(self) -> Tuple[PageMetrics, PageType | None]:
self.driver.set_script_timeout(10)
self.driver.get(self.url)
Expand Down Expand Up @@ -182,6 +155,7 @@ async def scroll_to_bottom(self) -> None:
async def get_page_metrics(self) -> PageMetrics:
nodes = self.driver.find_elements("xpath", "//*")
nb_svg_children = await self.get_svg_children_count()
await self.get_all_requests()

downloaded_data = [request["size"] for request in self.all_requests.values()]

Expand All @@ -191,6 +165,47 @@ async def get_page_metrics(self) -> PageMetrics:
requests=len(self.all_requests),
)

async def get_all_requests(self) -> None:
    """Rebuild ``self.all_requests`` from Chrome's performance log.

    Replays the CDP Network events recorded by the driver:
    ``responseReceived`` registers each HTTP(S) request (size 0),
    ``dataReceived`` accumulates chunk sizes, and ``loadingFinished``
    overwrites the size with the final encoded length. The first response
    seen also triggers the one-time page-response check.
    """
    for log_entry in self.driver.get_log("performance"):
        payload = loads(log_entry["message"])["message"]
        if log_entry["level"] != "INFO":
            continue

        method = payload["method"]
        params = payload["params"]

        if method == "Network.responseReceived":
            response = params["response"]
            # Skip non-network schemes (data:, blob:, ...).
            if not response["url"].startswith("http"):
                continue
            self.all_requests[params["requestId"]] = {
                "url": response["url"],
                "size": 0,
                "type": params["type"],
            }
            # Only the very first response is checked at page level.
            if not self.page_response:
                self.page_response = True
                await self.check_page_response(response)
        elif method == "Network.dataReceived":
            if params["requestId"] in self.all_requests:
                self.all_requests[params["requestId"]]["size"] += params[
                    "encodedDataLength"
                ]
        elif method == "Network.loadingFinished":
            if params["requestId"] in self.all_requests:
                self.all_requests[params["requestId"]]["size"] = params[
                    "encodedDataLength"
                ]

@staticmethod
async def check_page_response(response: Dict) -> None:
if response["mimeType"] != "text/html":
Expand Down
Loading

0 comments on commit 1e3cd50

Please sign in to comment.