From 88eadb8790f457be71ac2e5e78c41537ab8cb23d Mon Sep 17 00:00:00 2001 From: Vincent Vatelot Date: Fri, 22 Mar 2024 09:35:06 +0100 Subject: [PATCH] fix(scraper): issue on video size, reproduce the bug (#67) * fix(scraper): issue on video size, reproduce the bug * fix(scrapper): remove content-length from har analysis * fix(scrapper): update scrapper get_size test * refact(scraper): Make some cleanup * refact(scraper): Add csv files to gitignore --------- Co-authored-by: PaulPHPE <93597135+PaulPHPE@users.noreply.github.com> --- .gitignore | 3 +- components/ecoindex/scraper/scrap.py | 11 +- development/scraper_test.py | 103 ++++++++++++++++-- poetry.lock | 57 +++++++++- pyproject.toml | 2 + .../ecoindex/scraper/test_scraper.py | 5 +- 6 files changed, 160 insertions(+), 21 deletions(-) diff --git a/.gitignore b/.gitignore index 2accab8..c9015ff 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ dist *.webp *.sqlite3 .coverage -coverage.xml \ No newline at end of file +coverage.xml +*.csv \ No newline at end of file diff --git a/components/ecoindex/scraper/scrap.py b/components/ecoindex/scraper/scrap.py index b9b523b..ecbf231 100644 --- a/components/ecoindex/scraper/scrap.py +++ b/components/ecoindex/scraper/scrap.py @@ -139,12 +139,6 @@ async def get_nodes_count(self) -> int: def get_request_size(self, entry) -> int: if entry["response"]["_transferSize"] != -1: return entry["response"]["_transferSize"] - headers = entry["response"]["headers"] - content_length_header = list( - filter(lambda header: (header["name"].lower() == "content-length"), headers) - ) - if len(content_length_header) > 0 and entry["response"]["status"] == 206: - return int(content_length_header[0]["value"]) else: return len(json.dumps(entry["response"]).encode("utf-8")) @@ -156,7 +150,10 @@ async def check_page_response(self, response) -> None: message=response.status_text, ) headers = response.headers - content_type = next((value for key, value in headers.items() if key.lower() == 'content-type'), None) + content_type = next( + (value for key, value in headers.items() if key.lower() == "content-type"), + None, + ) if content_type and "text/html" not in content_type: raise TypeError( { diff --git a/development/scraper_test.py b/development/scraper_test.py index 68620e7..d0ba1b6 100644 --- a/development/scraper_test.py +++ b/development/scraper_test.py @@ -1,7 +1,10 @@ import asyncio from concurrent.futures import ThreadPoolExecutor, as_completed +import pandas as pd from ecoindex.scraper import EcoindexScraper +from haralyzer import HarParser +from slugify import slugify async def get_page_analysis(url: str): @@ -10,21 +13,23 @@ async def get_page_analysis(url: str): await scraper.get_page_analysis(), await scraper.get_all_requests(), await scraper.get_requests_by_category(), + scraper.har_temp_file_path, ) def run_page_analysis(url: str, index: int): - analysis, requests, aggregation = asyncio.run(get_page_analysis(url)) + analysis, requests, aggregation, har_file_path = asyncio.run(get_page_analysis(url)) - return index, analysis, requests, aggregation + return index, analysis, requests, aggregation, har_file_path with ThreadPoolExecutor(max_workers=8) as executor: future_to_analysis = {} - url = "https://www.ecoindex.fr" + urls = ["https://www.graphic-sud.com/", "https://federiconavarrete.com/"] + i = 0 - for i in range(1): + for url in urls: print(f"Starting ecoindex {i} analysis") future_to_analysis[ executor.submit( @@ -33,12 +38,94 @@ def run_page_analysis(url: str, index: int): i, ) ] = url + i += 1 for 
future in as_completed(future_to_analysis): try: - index, analysis, requests, aggregation = future.result() - print(f"Ecoindex {index}: {analysis}") - print(f"Requests: {requests}") - print(f"Aggregation: {aggregation}") + index, analysis, requests, aggregation, har_file_path = future.result() + + har_parser = HarParser.from_file(har_file_path) + for page in har_parser.pages: + haralyzer_data = [ + { + "type": "audio", + "count": len(page.audio_files), + "size": page.audio_size_trans, + }, + { + "type": "css", + "count": len(page.css_files), + "size": page.css_size_trans, + }, + { + "type": "javascript", + "count": len(page.js_files), + "size": page.js_size_trans, + }, + {"type": "page", "count": 1, "size": page.page_size_trans}, + { + "type": "image", + "count": len(page.image_files), + "size": page.image_size_trans, + }, + { + "type": "video", + "count": len(page.video_files), + "size": page.video_size_trans, + }, + { + "type": "other", + "count": len(page.text_files), + "size": page.text_size_trans, + }, + {"type": "html", "count": len(page.html_files), "size": None}, + { + "type": "total", + "count": len(page.entries), + "size": page.page_size_trans, + }, + ] + + df_haralyzer = pd.DataFrame( + haralyzer_data, columns=["type", "count", "size"] + ) + df_haralyzer["size"] = df_haralyzer["size"] / 1000 + + flatten_aggregation = [ + { + "type": type, + "count": item["total_count"], + "size": item["total_size"], + } + for type, item in aggregation.model_dump().items() + ] + flatten_aggregation.append( + { + "type": "total", + "count": analysis.requests, + "size": analysis.size * 1000, + } + ) + + df = pd.DataFrame(flatten_aggregation, columns=["type", "count", "size"]) + df["size"] = df["size"] / 1000 + + joinned_df = pd.merge( + df, + df_haralyzer, + on="type", + how="left", + suffixes=("_ecoindex", "_haralyzer"), + ) + + print() + print(page.url) + print(har_file_path) + print(df) + print(joinned_df) + print() + + joinned_df.to_csv(f"joinned_ecoindex_{slugify(page.url)}.csv", index=False) + except Exception as e: print(e) diff --git a/poetry.lock b/poetry.lock index 97817be..ecdf3f5 100644 --- a/poetry.lock +++ b/poetry.lock @@ -203,6 +203,17 @@ d = ["aiohttp (>=3.7.4)", "aiohttp (>=3.7.4,!=3.9.0)"] jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] uvloop = ["uvloop (>=0.15.2)"] +[[package]] +name = "cached-property" +version = "1.5.2" +description = "A decorator for caching properties in classes." 
+optional = false +python-versions = "*" +files = [ + {file = "cached-property-1.5.2.tar.gz", hash = "sha256:9fa5755838eecbb2d234c3aa390bd80fbd3ac6b6869109bfc1b499f7bd89a130"}, + {file = "cached_property-1.5.2-py2.py3-none-any.whl", hash = "sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0"}, +] + [[package]] name = "caio" version = "0.9.13" @@ -965,6 +976,21 @@ files = [ {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, ] +[[package]] +name = "haralyzer" +version = "2.4.0" +description = "A python framework for getting useful stuff out of HAR files" +optional = false +python-versions = ">=3.6" +files = [ + {file = "haralyzer-2.4.0-py3-none-any.whl", hash = "sha256:b66d2bf873fc70d0288def5db8885ee005024f088cf745ef918beadafd2d7df2"}, + {file = "haralyzer-2.4.0.tar.gz", hash = "sha256:1154162a328a5226bc6d1d9626be19536ae049dd44b0a160081054f4808326a5"}, +] + +[package.dependencies] +cached-property = "*" +python-dateutil = "*" + [[package]] name = "hiredis" version = "2.3.2" @@ -2358,6 +2384,23 @@ files = [ [package.extras] cli = ["click (>=5.0)"] +[[package]] +name = "python-slugify" +version = "8.0.4" +description = "A Python slugify application that also handles Unicode" +optional = false +python-versions = ">=3.7" +files = [ + {file = "python-slugify-8.0.4.tar.gz", hash = "sha256:59202371d1d05b54a9e7720c5e038f928f45daaffe41dd10822f3907b937c856"}, + {file = "python_slugify-8.0.4-py2.py3-none-any.whl", hash = "sha256:276540b79961052b66b7d116620b36518847f52d5fd9e3a70164fc8c50faa6b8"}, +] + +[package.dependencies] +text-unidecode = ">=1.3" + +[package.extras] +unidecode = ["Unidecode (>=1.1.1)"] + [[package]] name = "pytz" version = "2024.1" @@ -2394,7 +2437,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2750,6 +2792,17 @@ anyio = ">=3.4.0,<5" [package.extras] full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.7)", "pyyaml"] +[[package]] +name = "text-unidecode" +version = "1.3" +description = "The most basic Text::Unidecode port" +optional = false +python-versions = "*" +files = [ + {file = "text-unidecode-1.3.tar.gz", hash = "sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93"}, + {file = "text_unidecode-1.3-py2.py3-none-any.whl", hash = "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8"}, +] + [[package]] name = "tldextract" version = "5.1.1" @@ -3102,4 +3155,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" 
python-versions = ">=3.10,<3.13" -content-hash = "6a440d66a795a430dddcd9e96efe308ca2d55eaa9701e984bcf50203f12a480c" +content-hash = "1c3150961bed96d58e22f2445338ef1d7c645fb6acbf17b921a06ce5281989d2" diff --git a/pyproject.toml b/pyproject.toml index d82ac80..bb284ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,8 @@ python = ">=3.10,<3.13" redis = { extras = ["hiredis"], version = "^5.0.1" } requests = "^2.31.0" tomli = "^2.0.1" +haralyzer = "^2.4.0" +python-slugify = "^8.0.4" [tool.poetry.group.scraper.dependencies] pillow = "^10.1.0" diff --git a/test/components/ecoindex/scraper/test_scraper.py b/test/components/ecoindex/scraper/test_scraper.py index b7fcc09..488939d 100644 --- a/test/components/ecoindex/scraper/test_scraper.py +++ b/test/components/ecoindex/scraper/test_scraper.py @@ -121,10 +121,9 @@ def test_get_request_size(): assert scraper.get_request_size(mock_stripped_har_entry[1]) == len( json.dumps(mock_stripped_har_entry[1]["response"]).encode("utf-8") ) - assert scraper.get_request_size(mock_stripped_har_entry[1]) == len( - json.dumps(mock_stripped_har_entry[1]["response"]).encode("utf-8") + assert scraper.get_request_size(mock_stripped_har_entry[2]) == len( + json.dumps(mock_stripped_har_entry[2]["response"]).encode("utf-8") ) - assert scraper.get_request_size(mock_stripped_har_entry[2]) == 7347 async def test_check_page_response():
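For reference, a minimal standalone sketch of get_request_size() as it reads after this patch (components/ecoindex/scraper/scrap.py). The function body mirrors the post-patch hunk; the HAR entry in the usage example is illustrative only, not taken from a real capture:

import json

def get_request_size(entry: dict) -> int:
    # Prefer the transfer size reported by the browser when it is known.
    if entry["response"]["_transferSize"] != -1:
        return entry["response"]["_transferSize"]
    else:
        # Fall back to the size of the serialized response object rather than
        # trusting a Content-Length header on 206 (partial content) responses.
        return len(json.dumps(entry["response"]).encode("utf-8"))

# Illustrative streamed-video entry where the browser reports no transfer size:
entry = {"response": {"_transferSize": -1, "status": 206, "content": {}}}
print(get_request_size(entry))  # length of the JSON-serialized response

With the Content-Length branch removed, a 206 response whose _transferSize is unknown is now sized from its serialized HAR entry, the same fallback applied to every other response.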