fix(scraper): issue on video size, reproduce the bug (#67)
* fix(scraper): issue on video size, reproduce the bug

* fix(scrapper): remove content-length from har analysis

* fix(scrapper): update scrapper get_size test

* refact(scraper): Make some cleanup

* refact(scraper): Add csv files to gitignore

---------

Co-authored-by: PaulPHPE <[email protected]>
vvatelot and PaulPHPE authored Mar 22, 2024
1 parent 39e9446 commit 88eadb8
Showing 6 changed files with 160 additions and 21 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -3,4 +3,5 @@ dist
*.webp
*.sqlite3
.coverage
coverage.xml
coverage.xml
*.csv
11 changes: 4 additions & 7 deletions components/ecoindex/scraper/scrap.py
@@ -139,12 +139,6 @@ async def get_nodes_count(self) -> int:
def get_request_size(self, entry) -> int:
if entry["response"]["_transferSize"] != -1:
return entry["response"]["_transferSize"]
headers = entry["response"]["headers"]
content_length_header = list(
filter(lambda header: (header["name"].lower() == "content-length"), headers)
)
if len(content_length_header) > 0 and entry["response"]["status"] == 206:
return int(content_length_header[0]["value"])
else:
return len(json.dumps(entry["response"]).encode("utf-8"))
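For clarity, here is a minimal standalone sketch of what the helper looks like once the Content-Length branch above is removed (in the repository this is a method on the scraper class; here it is written as a plain function over a HAR entry dict):

```python
import json


def get_request_size(entry: dict) -> int:
    # Prefer the transfer size recorded by the browser when it is known.
    if entry["response"]["_transferSize"] != -1:
        return entry["response"]["_transferSize"]
    else:
        # Fall back to the size of the serialized response record; the
        # Content-Length fallback for 206 partial responses is gone.
        return len(json.dumps(entry["response"]).encode("utf-8"))
```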

@@ -156,7 +150,10 @@ async def check_page_response(self, response) -> None:
message=response.status_text,
)
headers = response.headers
content_type = next((value for key, value in headers.items() if key.lower() == 'content-type'), None)
content_type = next(
(value for key, value in headers.items() if key.lower() == "content-type"),
None,
)
if content_type and "text/html" not in content_type:
raise TypeError(
{
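The reformatting of the content-type lookup above does not change behaviour; a small illustration with a hypothetical headers dict:

```python
headers = {"Content-Type": "application/pdf"}

# Case-insensitive lookup of the content-type header, None if absent.
content_type = next(
    (value for key, value in headers.items() if key.lower() == "content-type"),
    None,
)

# content_type == "application/pdf"; because "text/html" is not in it,
# check_page_response would raise TypeError for such a page.
print(content_type)
```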
103 changes: 95 additions & 8 deletions development/scraper_test.py
@@ -1,7 +1,10 @@
import asyncio
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
from ecoindex.scraper import EcoindexScraper
from haralyzer import HarParser
from slugify import slugify


async def get_page_analysis(url: str):
@@ -10,21 +13,23 @@ async def get_page_analysis(url: str):
await scraper.get_page_analysis(),
await scraper.get_all_requests(),
await scraper.get_requests_by_category(),
scraper.har_temp_file_path,
)


def run_page_analysis(url: str, index: int):
analysis, requests, aggregation = asyncio.run(get_page_analysis(url))
analysis, requests, aggregation, har_file_path = asyncio.run(get_page_analysis(url))

return index, analysis, requests, aggregation
return index, analysis, requests, aggregation, har_file_path


with ThreadPoolExecutor(max_workers=8) as executor:
future_to_analysis = {}

url = "https://www.ecoindex.fr"
urls = ["https://www.graphic-sud.com/", "https://federiconavarrete.com/"]
i = 0

for i in range(1):
for url in urls:
print(f"Starting ecoindex {i} analysis")
future_to_analysis[
executor.submit(
@@ -33,12 +38,94 @@ def run_page_analysis(url: str, index: int):
i,
)
] = url
i += 1

for future in as_completed(future_to_analysis):
try:
index, analysis, requests, aggregation = future.result()
print(f"Ecoindex {index}: {analysis}")
print(f"Requests: {requests}")
print(f"Aggregation: {aggregation}")
index, analysis, requests, aggregation, har_file_path = future.result()

har_parser = HarParser.from_file(har_file_path)
for page in har_parser.pages:
haralyzer_data = [
{
"type": "audio",
"count": len(page.audio_files),
"size": page.audio_size_trans,
},
{
"type": "css",
"count": len(page.css_files),
"size": page.css_size_trans,
},
{
"type": "javascript",
"count": len(page.js_files),
"size": page.js_size_trans,
},
{"type": "page", "count": 1, "size": page.page_size_trans},
{
"type": "image",
"count": len(page.image_files),
"size": page.image_size_trans,
},
{
"type": "video",
"count": len(page.video_files),
"size": page.video_size_trans,
},
{
"type": "other",
"count": len(page.text_files),
"size": page.text_size_trans,
},
{"type": "html", "count": len(page.html_files), "size": None},
{
"type": "total",
"count": len(page.entries),
"size": page.page_size_trans,
},
]

df_haralyzer = pd.DataFrame(
haralyzer_data, columns=["type", "count", "size"]
)
df_haralyzer["size"] = df_haralyzer["size"] / 1000

flatten_aggregation = [
{
"type": type,
"count": item["total_count"],
"size": item["total_size"],
}
for type, item in aggregation.model_dump().items()
]
flatten_aggregation.append(
{
"type": "total",
"count": analysis.requests,
"size": analysis.size * 1000,
}
)

df = pd.DataFrame(flatten_aggregation, columns=["type", "count", "size"])
df["size"] = df["size"] / 1000

joinned_df = pd.merge(
df,
df_haralyzer,
on="type",
how="left",
suffixes=("_ecoindex", "_haralyzer"),
)

print()
print(page.url)
print(har_file_path)
print(df)
print(joinned_df)
print()

joinned_df.to_csv(f"joinned_ecoindex_{slugify(page.url)}.csv", index=False)

except Exception as e:
print(e)
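To reproduce the haralyzer side of this comparison outside the full script, a minimal sketch (the HAR file path is hypothetical; the page attributes are the same ones used above):

```python
from haralyzer import HarParser

# Parse a HAR file previously written by a scraper run (path is illustrative).
har_parser = HarParser.from_file("ecoindex_example.har")

for page in har_parser.pages:
    print(page.url)
    # Transferred sizes per asset category, in bytes, as reported by haralyzer.
    print("images:", len(page.image_files), page.image_size_trans)
    print("videos:", len(page.video_files), page.video_size_trans)
    print("requests:", len(page.entries), "total:", page.page_size_trans)
```

Comparing these figures with the ecoindex aggregation is exactly what the merged CSV produced above is meant to surface, per asset type and in total.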
57 changes: 55 additions & 2 deletions poetry.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
@@ -30,6 +30,8 @@ python = ">=3.10,<3.13"
redis = { extras = ["hiredis"], version = "^5.0.1" }
requests = "^2.31.0"
tomli = "^2.0.1"
haralyzer = "^2.4.0"
python-slugify = "^8.0.4"

[tool.poetry.group.scraper.dependencies]
pillow = "^10.1.0"
5 changes: 2 additions & 3 deletions test/components/ecoindex/scraper/test_scraper.py
@@ -121,10 +121,9 @@ def test_get_request_size():
assert scraper.get_request_size(mock_stripped_har_entry[1]) == len(
json.dumps(mock_stripped_har_entry[1]["response"]).encode("utf-8")
)
assert scraper.get_request_size(mock_stripped_har_entry[1]) == len(
json.dumps(mock_stripped_har_entry[1]["response"]).encode("utf-8")
assert scraper.get_request_size(mock_stripped_har_entry[2]) == len(
json.dumps(mock_stripped_har_entry[2]["response"]).encode("utf-8")
)
assert scraper.get_request_size(mock_stripped_har_entry[2]) == 7347


async def test_check_page_response():
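As a rough illustration of what the updated get_request_size assertions above exercise, using the standalone helper sketched earlier and a hypothetical minimal HAR entry rather than the project's scraper fixture and mock_stripped_har_entry:

```python
import json


def test_get_request_size_fallback_sketch():
    # Hypothetical entry whose transfer size is unknown to the browser.
    entry = {"response": {"_transferSize": -1, "status": 206, "content": {}}}
    expected = len(json.dumps(entry["response"]).encode("utf-8"))
    # With the Content-Length branch removed, even a 206 response now falls
    # back to the size of the serialized response record.
    assert get_request_size(entry) == expected
```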

1 comment on commit 88eadb8

@github-actions

Coverage PR

Branch coverage

| File                                  | Stmts | Miss | Cover | Missing |
| ------------------------------------- | ----- | ---- | ----- | ------- |
| components/ecoindex/scraper/scrap.py  | 91    | 51   | 43%   | 45, 48–49, 51, 60, 63, 66–68, 73–75, 77–81, 84–87, 89, 91, 98–101, 108–110, 112–121, 130–131, 134–135, 137, 146–147, 152–153, 157–158 |
| TOTAL                                 | 695   | 230  | 66%   |         |
