From 86b293678065e251f82e05a9a3f644fc8791c286 Mon Sep 17 00:00:00 2001
From: Mathieu Leplatre
Date: Wed, 7 Aug 2024 17:15:09 +0200
Subject: [PATCH 1/4] WIP

---
 checks/remotesettings/attachments_bundles.py | 102 +++++++++++++++++++
 checks/remotesettings/utils.py               |   4 +
 telescope/utils.py                           |   5 +
 3 files changed, 111 insertions(+)
 create mode 100644 checks/remotesettings/attachments_bundles.py

diff --git a/checks/remotesettings/attachments_bundles.py b/checks/remotesettings/attachments_bundles.py
new file mode 100644
index 00000000..f6064c65
--- /dev/null
+++ b/checks/remotesettings/attachments_bundles.py
@@ -0,0 +1,102 @@
+"""
+Verify freshness and validity of attachment bundles.
+
+For each collection where the attachments bundle is enabled, return the modification timestamp and number of attachments bundled.
+"""
+
+import io
+import logging
+import urllib.parse
+import zipfile
+from telescope.typings import CheckResult
+from telescope.utils import ClientSession, retry_decorator, run_parallel, utcfromhttpdate, utcfromtimestamp
+
+from .utils import KintoClient, fetch_signed_resources
+
+
+EXPOSED_PARAMETERS = ["server"]
+
+logger = logging.getLogger(__name__)
+
+
+@retry_decorator
+async def fetch_binary(url: str, **kwargs) -> bytes:
+    human_url = urllib.parse.unquote(url)
+    logger.debug(f"Fetch binary from '{human_url}'")
+    async with ClientSession() as session:
+        async with session.get(url, **kwargs) as response:
+            return (response.status, response.headers["Last-Modified"], await response.read())
+
+
+async def run(server: str, auth: str, margin_publication_days: int = 1) -> CheckResult:
+    client = KintoClient(server_url=server, auth=auth)
+    resources = await fetch_signed_resources(server, auth)
+
+    resources = [r for r in resources if r["source"]["collection"] in ("intermediates",)]
+
+    logger.debug("Fetch metadata of %s collections", len(resources))
+    futures = [
+        client.get_collection(
+            bucket=resource["source"]["bucket"],
+            id=resource["source"]["collection"],
+        )
+        for resource in resources
+    ]
+    sources_metadata = await run_parallel(*futures)
+    resources_sources_metadata = zip(resources, sources_metadata)
+
+    metadata_for_bundled = [(r, m) for r, m in resources_sources_metadata if m["data"].get("attachment", {}).get("bundle", False)]
+    logger.info("%s collections with attachments bundle", len(metadata_for_bundled))
+
+    info = await client.server_info()
+    base_url = info["capabilities"]["attachments"]["base_url"]
+
+    futures = []
+    for resource, metadata in metadata_for_bundled:
+        bid = resource["destination"]["bucket"]
+        cid = metadata["data"]["id"]
+        url = f"{base_url}bundles/{bid}--{cid}.zip"
+        futures.append(fetch_binary(url, raise_for_status=True))
+    bundles = await run_parallel(*futures)
+
+    futures = []
+    for resource, _ in metadata_for_bundled:
+        futures.append(client.get_records_timestamp(bucket=resource["destination"]["bucket"], collection=resource["destination"]["collection"]))
+    records_timestamps = await run_parallel(*futures)
+
+    timestamps_metadata_bundles = zip(records_timestamps, metadata_for_bundled, bundles)
+
+    result = {}
+    success = True
+    for timestamp, (resource, metadata), bundle in timestamps_metadata_bundles:
+        status, modified, binary = bundle
+        bid = resource["destination"]["bucket"]
+        cid = metadata["data"]["id"]
+        if status >= 400:
+            result[f"{bid}/{cid}"] = "missing"
+            success = False
+            continue
+
+        try:
+            z = zipfile.ZipFile(io.BytesIO(binary))
+            nfiles = len(z.namelist())
+        except zipfile.BadZipFile:
+            result[f"{bid}/{cid}"] = "bad zip"
+            success = False
+            continue
+
+        bundle_ts = utcfromhttpdate(modified)
+        records_ts = utcfromtimestamp(timestamp)
+        if (records_ts - bundle_ts).days > margin_publication_days:
+            result[f"{bid}/{cid}"] = "outdated"
+            success = False
+            continue
+
+        result[f"{bid}/{cid}"] = {
+            "size": len(bundle),
+            "attachments": nfiles,
+            "modified": modified,
+        }
+
+    return success, result
+
diff --git a/checks/remotesettings/utils.py b/checks/remotesettings/utils.py
index 5252903d..d13f94e3 100644
--- a/checks/remotesettings/utils.py
+++ b/checks/remotesettings/utils.py
@@ -44,6 +44,10 @@ async def server_info(self, *args, **kwargs) -> Dict:
     async def get_collection(self, *args, **kwargs) -> Dict:
         return await self._client.get_collection(*args, **kwargs)
 
+    @retry_timeout
+    async def get_collections(self, *args, **kwargs) -> Dict:
+        return await self._client.get_collections(*args, **kwargs)
+
     @retry_timeout
     async def get_records(self, *args, **kwargs) -> List[Dict]:
         return await self._client.get_records(*args, **kwargs)
diff --git a/telescope/utils.py b/telescope/utils.py
index 0965ce6b..c2717a06 100644
--- a/telescope/utils.py
+++ b/telescope/utils.py
@@ -1,4 +1,5 @@
 import asyncio
+import email.utils
 import json
 import logging
 import textwrap
@@ -164,6 +165,10 @@ def utcfromisoformat(iso8601):
     return datetime.fromisoformat(iso8601_tz).replace(tzinfo=timezone.utc)
 
 
+def utcfromhttpdate(httpdate):
+    return email.utils.parsedate_to_datetime(httpdate).replace(tzinfo=timezone.utc)
+
+
 def render_checks(func):
     async def wrapper(request):
         # First, check that client requests supported output format.

From 6e98aa11af27dc96e4ac07adde57dcb956bd1d0e Mon Sep 17 00:00:00 2001
From: Mathieu Leplatre
Date: Mon, 9 Sep 2024 19:39:43 +0200
Subject: [PATCH 2/4] Add check for attachments bundles

---
 checks/remotesettings/attachments_bundles.py |  73 +++++----
 .../test_attachments_bundles.py              | 150 ++++++++++++++++++
 2 files changed, 196 insertions(+), 27 deletions(-)
 create mode 100644 tests/checks/remotesettings/test_attachments_bundles.py

diff --git a/checks/remotesettings/attachments_bundles.py b/checks/remotesettings/attachments_bundles.py
index f6064c65..2e63d035 100644
--- a/checks/remotesettings/attachments_bundles.py
+++ b/checks/remotesettings/attachments_bundles.py
@@ -8,8 +8,16 @@
 import logging
 import urllib.parse
 import zipfile
+from typing import Any
+
 from telescope.typings import CheckResult
-from telescope.utils import ClientSession, retry_decorator, run_parallel, utcfromhttpdate, utcfromtimestamp
+from telescope.utils import (
+    ClientSession,
+    retry_decorator,
+    run_parallel,
+    utcfromhttpdate,
+    utcfromtimestamp,
+)
 
 from .utils import KintoClient, fetch_signed_resources
 
@@ -20,19 +28,25 @@
 
 
 @retry_decorator
-async def fetch_binary(url: str, **kwargs) -> bytes:
+async def fetch_binary(url: str, **kwargs) -> tuple[int, str, bytes]:
     human_url = urllib.parse.unquote(url)
     logger.debug(f"Fetch binary from '{human_url}'")
     async with ClientSession() as session:
         async with session.get(url, **kwargs) as response:
-            return (response.status, response.headers["Last-Modified"], await response.read())
+            return (
+                response.status,
+                response.headers.get("Last-Modified", "Mon, 01 Jan 1970 00:00:00 GMT"),
+                await response.read(),
+            )
 
 
-async def run(server: str, auth: str, margin_publication_days: int = 1) -> CheckResult:
+async def run(
+    server: str, auth: str, margin_publication_hours: int = 12
+) -> CheckResult:
     client = KintoClient(server_url=server, auth=auth)
     resources = await fetch_signed_resources(server, auth)
 
-    resources = [r for r in resources if r["source"]["collection"] in ("intermediates",)]
+    # resources = [r for r in resources if r["source"]["collection"] in ("intermediates",)]
 
     logger.debug("Fetch metadata of %s collections", len(resources))
     futures = [
@@ -45,35 +59,38 @@ async def run(server: str, auth: str, margin_publication_days: int = 1) -> Check
     sources_metadata = await run_parallel(*futures)
     resources_sources_metadata = zip(resources, sources_metadata)
 
-    metadata_for_bundled = [(r, m) for r, m in resources_sources_metadata if m["data"].get("attachment", {}).get("bundle", False)]
+    metadata_for_bundled = [
+        (r, m)
+        for r, m in resources_sources_metadata
+        if m["data"].get("attachment", {}).get("bundle", False)
+    ]
     logger.info("%s collections with attachments bundle", len(metadata_for_bundled))
+    assert metadata_for_bundled, metadata_for_bundled
+    records_timestamps = [
+        resource["last_modified"] for resource, _ in metadata_for_bundled
+    ]
 
     info = await client.server_info()
     base_url = info["capabilities"]["attachments"]["base_url"]
 
-    futures = []
+    futures_bundles = []
     for resource, metadata in metadata_for_bundled:
         bid = resource["destination"]["bucket"]
         cid = metadata["data"]["id"]
         url = f"{base_url}bundles/{bid}--{cid}.zip"
-        futures.append(fetch_binary(url, raise_for_status=True))
-    bundles = await run_parallel(*futures)
-
-    futures = []
-    for resource, _ in metadata_for_bundled:
-        futures.append(client.get_records_timestamp(bucket=resource["destination"]["bucket"], collection=resource["destination"]["collection"]))
-    records_timestamps = await run_parallel(*futures)
+        futures_bundles.append(fetch_binary(url))
+    bundles = await run_parallel(*futures_bundles)
 
     timestamps_metadata_bundles = zip(records_timestamps, metadata_for_bundled, bundles)
 
-    result = {}
+    result: dict[str, dict[str, Any]] = {}
     success = True
     for timestamp, (resource, metadata), bundle in timestamps_metadata_bundles:
-        status, modified, binary = bundle
+        http_status, modified, binary = bundle
         bid = resource["destination"]["bucket"]
         cid = metadata["data"]["id"]
-        if status >= 400:
-            result[f"{bid}/{cid}"] = "missing"
+        if http_status >= 400:
+            result[f"{bid}/{cid}"] = {"status": "missing"}
             success = False
             continue
 
@@ -81,22 +98,24 @@ async def run(server: str, auth: str, margin_publication_days: int = 1) -> Check
             z = zipfile.ZipFile(io.BytesIO(binary))
             nfiles = len(z.namelist())
         except zipfile.BadZipFile:
-            result[f"{bid}/{cid}"] = "bad zip"
+            result[f"{bid}/{cid}"] = {"status": "bad zip"}
             success = False
             continue
 
         bundle_ts = utcfromhttpdate(modified)
         records_ts = utcfromtimestamp(timestamp)
-        if (records_ts - bundle_ts).days > margin_publication_days:
-            result[f"{bid}/{cid}"] = "outdated"
-            success = False
-            continue
-
+        status = (
+            "outdated"
+            if ((records_ts - bundle_ts).total_seconds() / 3600)
+            > margin_publication_hours
+            else "ok"
+        )
         result[f"{bid}/{cid}"] = {
-            "size": len(bundle),
+            "status": status,
+            "size": len(binary),
             "attachments": nfiles,
-            "modified": modified,
+            "publication_timestamp": bundle_ts.isoformat(),
+            "collection_timestamp": records_ts.isoformat(),
         }
 
     return success, result
-
diff --git a/tests/checks/remotesettings/test_attachments_bundles.py b/tests/checks/remotesettings/test_attachments_bundles.py
new file mode 100644
index 00000000..6241f177
--- /dev/null
+++ b/tests/checks/remotesettings/test_attachments_bundles.py
@@ -0,0 +1,150 @@
+import io
+import zipfile
+
+from checks.remotesettings.attachments_bundles import run
+
+
+COLLECTION_URL = "/buckets/{}/collections/{}"
+RECORDS_URL = "/buckets/{}/collections/{}/records"
+CHANGESET_URL = "/buckets/{}/collections/{}/changeset"
+
+
+def build_zip(num_files=3):
+    zip_buffer = io.BytesIO()
+    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
+        for i in range(num_files):
+            file_name = f"fake_file_{i}.txt"
+            zip_file.writestr(file_name, 1024 * "x")
+    return zip_buffer.getvalue()
+
+
+async def test_negative(mock_responses, mock_aioresponses):
+    server_url = "http://fake.local/v1"
+    mock_responses.get(
+        server_url + "/",
+        payload={
+            "capabilities": {
+                "attachments": {"base_url": "http://cdn/"},
+                "signer": {
+                    "resources": [
+                        {
+                            "source": {"bucket": "main-workspace", "collection": None},
+                            "preview": {"bucket": "main-preview", "collection": None},
+                            "destination": {"bucket": "main", "collection": None},
+                        }
+                    ]
+                },
+            }
+        },
+    )
+    may8_ts = 389664061000
+    may8_http = "Mon, 08 May 1982 00:01:01 GMT"
+    may8_iso = "1982-05-08T00:01:01+00:00"
+
+    changes_url = server_url + RECORDS_URL.format("monitor", "changes")
+    mock_responses.get(
+        changes_url,
+        payload={
+            "data": [
+                {
+                    "id": "abc",
+                    "bucket": "main",
+                    "collection": "missing",
+                    "last_modified": may8_ts,
+                },
+                {
+                    "id": "efg",
+                    "bucket": "main",
+                    "collection": "ok",
+                    "last_modified": may8_ts,
+                },
+                {
+                    "id": "hij",
+                    "bucket": "main",
+                    "collection": "badzip",
+                    "last_modified": may8_ts,
+                },
+                {
+                    "id": "klm",
+                    "bucket": "main",
+                    "collection": "outdated",
+                    "last_modified": may8_ts + 24 * 3600 * 1000 + 60 * 1000,
+                },
+                {
+                    "id": "nop",
+                    "bucket": "main",
+                    "collection": "late",
+                    "last_modified": may8_ts + 600 * 1000,
+                },
+                {
+                    "id": "qrs",
+                    "bucket": "main",
+                    "collection": "no-bundle",
+                    "last_modified": may8_ts,
+                },
+            ]
+        },
+    )
+
+    for cid in ("missing", "ok", "badzip", "outdated", "late", "no-bundle"):
+        mock_responses.get(
+            server_url + COLLECTION_URL.format("main-workspace", cid),
+            payload={
+                "data": {
+                    "id": cid,
+                    "bucket": "main-workspace",
+                    "attachment": {"bundle": cid != "no-bundle"},
+                }
+            },
+        )
+
+    mock_aioresponses.get("http://cdn/bundles/main--missing.zip", status=404)
+    mock_aioresponses.get(
+        "http://cdn/bundles/main--ok.zip",
+        body=build_zip(),
+        headers={"Last-Modified": may8_http},
+    )
+    mock_aioresponses.get(
+        "http://cdn/bundles/main--outdated.zip",
+        body=build_zip(num_files=6),
+        headers={"Last-Modified": may8_http},
+    )
+    mock_aioresponses.get(
+        "http://cdn/bundles/main--late.zip",
+        body=build_zip(num_files=6),
+        headers={"Last-Modified": may8_http},
+    )
+    mock_aioresponses.get(
+        "http://cdn/bundles/main--badzip.zip",
+        body=b"boom",
+        headers={"Last-Modified": may8_http},
+    )
+
+    status, data = await run(server_url, auth="")
+
+    assert status is False
+    assert data == {
+        "main/badzip": {"status": "bad zip"},
+        "main/missing": {"status": "missing"},
+        "main/ok": {
+            "status": "ok",
+            "attachments": 3,
+            "collection_timestamp": "1982-05-08T00:01:01+00:00",
+            "publication_timestamp": may8_iso,
+            "size": 373,
+        },
+        "main/late": {
+            "status": "ok",
+            "attachments": 6,
+            "collection_timestamp": "1982-05-08T00:11:01+00:00",
+            "publication_timestamp": may8_iso,
+            "size": 724,
+        },
+        "main/outdated": {
+            "attachments": 6,
+            "collection_timestamp": "1982-05-09T00:02:01+00:00",
+            "publication_timestamp": may8_iso,
+            "size": 724,
+            "status": "outdated",
+        },
+    }

From 67f938f8ffc98469c1277ee2ed972327bb1b2f3c Mon Sep 17 00:00:00 2001
From: Mathieu Leplatre
Date: Tue, 10 Sep 2024 16:57:35 +0200
Subject: [PATCH 3/4] Remove useless change

---
 checks/remotesettings/utils.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/checks/remotesettings/utils.py b/checks/remotesettings/utils.py
index d13f94e3..5252903d 100644
--- a/checks/remotesettings/utils.py
+++ b/checks/remotesettings/utils.py
@@ -44,10 +44,6 @@ async def server_info(self, *args, **kwargs) -> Dict:
     async def get_collection(self, *args, **kwargs) -> Dict:
         return await self._client.get_collection(*args, **kwargs)
 
-    @retry_timeout
-    async def get_collections(self, *args, **kwargs) -> Dict:
-        return await self._client.get_collections(*args, **kwargs)
-
     @retry_timeout
     async def get_records(self, *args, **kwargs) -> List[Dict]:
         return await self._client.get_records(*args, **kwargs)

From 4036c9a91d3ad1f84acb8f7ee32ec12d6f95a9b4 Mon Sep 17 00:00:00 2001
From: Mathieu Leplatre
Date: Thu, 12 Sep 2024 11:13:56 +0200
Subject: [PATCH 4/4] Remove leftover comment

---
 checks/remotesettings/attachments_bundles.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/checks/remotesettings/attachments_bundles.py b/checks/remotesettings/attachments_bundles.py
index 2e63d035..cf1ee816 100644
--- a/checks/remotesettings/attachments_bundles.py
+++ b/checks/remotesettings/attachments_bundles.py
@@ -46,8 +46,6 @@ async def run(
     client = KintoClient(server_url=server, auth=auth)
     resources = await fetch_signed_resources(server, auth)
 
-    # resources = [r for r in resources if r["source"]["collection"] in ("intermediates",)]
-
     logger.debug("Fetch metadata of %s collections", len(resources))
     futures = [
         client.get_collection(