util: Add blob sync check utility (PROJQUAY-9999)
This utility checks that all blobs referenced by the `imagestorage` table are indeed stored in the storage engine. If a blob is referenced in the database but missing from the backing store, the utility notes that blob and prints the full list of missing blobs at the end of execution. Example:

~~~
$ python3 blobcheck.py config.yaml
Quay config path: config.yaml
Establishing connection with database on hostname cyberdyne.skynet.
Found 645 blobs in imagestorage table.
Trying to establish a connection to the storage provider.
Searching for missing blobs...
Found 1 missing blobs.
Complete list: ['sha256:a9e613ba123f6eea745d2fa17c8d5d7a50483bc122625b30bd5a0a64519c3e40']
~~~

The utility takes two arguments:
- `filename` (required): the path to Quay's `config.yaml`; if the utility is run within Quay's container, the path should be `/quay-registry/conf/stack/config.yaml` (see the example below)
- `-d, --debug`: prints additional debug statements concerning S3 access
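
A hypothetical invocation from inside the Quay container, assuming the utility is run from the root of the Quay source tree:

~~~
$ python3 util/blobcheck.py /quay-registry/conf/stack/config.yaml --debug
~~~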

Notable caveats:
- As of right now, the utility only supports S3-compatible storage engines. Azure and Swift are not supported.
- The utility only checks the first storage engine defined in `DISTRIBUTED_STORAGE_CONFIG` and does not take georeplication into account: if multiple storage engines are defined, only the first one in the list is checked (see the sketch after this list). Future implementations will take georeplication into account.
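
For reference, a minimal sketch of the `config.yaml` keys the utility reads; the hostname, credentials, bucket, and storage path below are placeholders:

~~~
DB_URI: postgresql://quayuser:quaypass@cyberdyne.skynet:5432/quay

DISTRIBUTED_STORAGE_CONFIG:
  default:
    - S3Storage
    - s3_region: us-east-1
      s3_access_key: <access key>
      s3_secret_key: <secret key>
      s3_bucket: quay-blobs
      storage_path: /datastorage/registry
~~~

The utility resolves the first key of `DISTRIBUTED_STORAGE_CONFIG` (here `default`), requires its engine type to be one of the supported S3-compatible engines, and reads the engine parameters from the second list element.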
util/blobcheck.py:

~~~
import logging
import re
import sys
from argparse import ArgumentParser
from urllib.parse import urlparse

import boto3
import requests
import yaml
from botocore.client import Config
from peewee import PostgresqlDatabase


class MyParser(ArgumentParser):
    def error(self, message):
        # Report argument errors to stderr along with usage, then exit non-zero.
        sys.stderr.write("Error: %s\n" % message)
        self.print_help()
        sys.exit(2)


STORAGE_ENGINE_LIST = [
    "RadosGWStorage",
    "S3Storage",
    "IBMCloudStorage",
    "RHOCSStorage",
]


def main():
    parser = MyParser(
        prog="python-blob-check",
        description="Check blob status between Quay db and storage",
    )

    parser.add_argument("filename", help="Path to Quay's config.yaml file")
    parser.add_argument("-d", "--debug", help="Turn S3 debugging on", action="store_true")
    args = parser.parse_args()
    print("Quay config path: {}".format(args.filename))

    with open(args.filename) as file_stream:
        try:
            config = yaml.safe_load(file_stream)
        except yaml.YAMLError as e:
            print("Encountered error during parsing: {}".format(e))
            sys.exit(2)

    # DISTRIBUTED_STORAGE_CONFIG maps engine names to [engine_type, engine_params]
    # pairs. We check only the first listed configuration, assuming it is the
    # primary one, and only if it is S3 compatible.
    storage_config = config["DISTRIBUTED_STORAGE_CONFIG"][
        list(config["DISTRIBUTED_STORAGE_CONFIG"].keys())[0]
    ]

    if storage_config[0] not in STORAGE_ENGINE_LIST:
        print("Storage engine must be S3 compatible.")
        sys.exit(2)

    db_uri = urlparse(config["DB_URI"])

    if db_uri.scheme != "postgresql":
        print("This utility only supports PostgreSQL as database backend.")
        sys.exit(2)

    # Split the net location on `@`, keeping the delimiters. The last element is
    # always the host part, even if the password itself contains `@`, e.g.
    # "user:p@ss@db.example.com:5432" -> ["user:p", "@", "ss", "@", "db.example.com:5432"]
    netloc_list = re.split(r"(@)", db_uri.netloc)

    if ":" in netloc_list[-1]:
        DB_HOSTNAME = netloc_list[-1].split(":")[0]
        DB_PORT = int(netloc_list[-1].split(":")[1])
    else:
        DB_HOSTNAME = netloc_list[-1]
        DB_PORT = 5432

    # Drop the host part and the last `@`, then split the credentials on the
    # first `:` only, so passwords containing `:` are preserved.
    DB_USERNAME = "".join(netloc_list[:-2]).split(":", 1)[0]
    DB_PASSWORD = "".join(netloc_list[:-2]).split(":", 1)[1]
    DB_NAME = db_uri.path.split("/")[1]

print("Establishing connection with database on hostname {}.".format(DB_HOSTNAME))

db = PostgresqlDatabase(
DB_NAME, user=DB_USERNAME, password=DB_PASSWORD, host=DB_HOSTNAME, port=DB_PORT
)
db.connect()

cursor = db.execute_sql("SELECT content_checksum FROM imagestorage;")
blobs = cursor.fetchall()

print("Found {} blobs in imagestorage table.".format(len(blobs)))

print("Trying to establish a connection to the storage provider.")

    if args.debug:
        boto3.set_stream_logger("", logging.DEBUG)
    if storage_config[0] == "S3Storage":
        s3_client = boto3.client(
            "s3",
            region_name=storage_config[1]["s3_region"],
            aws_access_key_id=storage_config[1]["s3_access_key"],
            aws_secret_access_key=storage_config[1]["s3_secret_key"],
            endpoint_url="https://s3.{region}.amazonaws.com".format(
                region=storage_config[1]["s3_region"]
            ),
            config=Config(signature_version="s3v4"),
        )
    else:
        s3_client = boto3.client(
            "s3",
            aws_access_key_id=storage_config[1]["access_key"],
            aws_secret_access_key=storage_config[1]["secret_key"],
            endpoint_url="https://{hostname}:{port}".format(
                hostname=storage_config[1]["hostname"], port=storage_config[1]["port"]
            )
            if storage_config[1]["is_secure"]
            else "http://{hostname}:{port}".format(
                hostname=storage_config[1]["hostname"], port=storage_config[1]["port"]
            ),
            config=Config(signature_version="s3v4"),
        )

    missing_blobs = []
    print("Searching for missing blobs...")
    for blob in blobs:
        # Blobs live at <storage_path>/sha256/<first two hex digits>/<digest>.
        blobname = blob[0].split(":")[1]
        blobdir = blobname[:2]
        url = s3_client.generate_presigned_url(
            ClientMethod="head_object",
            Params={
                "Bucket": storage_config[1]["s3_bucket"]
                if storage_config[0] == "S3Storage"
                else storage_config[1]["bucket_name"],
                "Key": "{path}/sha256/{dir}/{blobname}".format(
                    path=storage_config[1]["storage_path"][1:],
                    dir=blobdir,
                    blobname=blobname,
                ),
            },
        )

        # HEAD the presigned URL: anything other than 200 means the blob is not
        # retrievable from the backing store.
        response = requests.head(url)
        if response.status_code != 200:
            missing_blobs.append("sha256:" + blobname)
    if missing_blobs:
        print("Found {} missing blobs.".format(len(missing_blobs)))
        print("Complete list: {}".format(missing_blobs))
    else:
        print("All blobs OK!")


if __name__ == "__main__":
    main()

~~~