Add total size counting for object-storage clean command #104

Merged · 2 commits · Feb 21, 2024

Changes from 1 commit
94 changes: 76 additions & 18 deletions ch_tools/chadmin/cli/object_storage_group.py
@@ -1,17 +1,20 @@
 import logging
+from dataclasses import dataclass
 from datetime import datetime, timedelta, timezone
 from tempfile import TemporaryFile
 from typing import List, Optional

 import click
 from click import Context, group, option, pass_context
+from humanfriendly import format_size

 from ch_tools.chadmin.cli import get_clickhouse_config
 from ch_tools.chadmin.internal.object_storage import (
     cleanup_s3_object_storage,
     s3_object_storage_iterator,
 )
 from ch_tools.chadmin.internal.utils import execute_query
+from ch_tools.common.cli.formatting import print_response
 from ch_tools.common.cli.parameters import TimeSpanParamType
 from ch_tools.common.clickhouse.config.storage_configuration import S3DiskConfiguration

@@ -27,6 +30,17 @@
 STREAM_TIMEOUT = 10 * 60


+@dataclass
+class ObjListItem:
+    path: str
+    size: int
+
+    @classmethod
+    def from_tab_separated(cls, value: str) -> "ObjListItem":
+        path, size = value.split("\t")
+        return cls(path, int(size))
+
+
 @group("object-storage")
 @option(
     "-d",
@@ -137,7 +151,7 @@ def clean_command(
     try:
         execute_query(
             ctx,
-            f"CREATE TABLE IF NOT EXISTS {listing_table} (obj_path String) ENGINE MergeTree ORDER BY obj_path",
+            f"CREATE TABLE IF NOT EXISTS {listing_table} (obj_path String, obj_size UInt64) ENGINE MergeTree ORDER BY obj_path",
         )
         _clean_object_storage(
             ctx,
@@ -153,7 +167,7 @@
     finally:
         if not keep_paths:
             execute_query(
-                ctx, f"TRUNCATE TABLE IF EXISTS {listing_table}", format_=None
+                ctx, f"DROP TABLE IF EXISTS {listing_table} SYNC", format_=None
             )


@@ -175,9 +189,12 @@ def _clean_object_storage(
     prefix = object_name_prefix or disk_conf.prefix

     if not use_saved_list:
-        click.echo(
-            f"Collecting objects... (Disk: '{disk_conf.name}', Endpoint '{disk_conf.endpoint_url}', "
-            f"Bucket: {disk_conf.bucket_name}, Prefix: '{prefix}')"
+        logging.info(
+            "Collecting objects... (Disk: '%s', Endpoint '%s', Bucket: '%s', Prefix: '%s')",
+            disk_conf.name,
+            disk_conf.endpoint_url,
+            disk_conf.bucket_name,
+            prefix,
         )
         _traverse_object_storage(ctx, listing_table, from_time, to_time, prefix)

@@ -188,19 +205,20 @@ def _clean_object_storage(
     )

     antijoin_query = f"""
-    SELECT obj_path FROM {listing_table} AS object_storage
+    SELECT obj_path, obj_size FROM {listing_table} AS object_storage
     LEFT ANTI JOIN {remote_data_paths_table} AS object_table
     ON object_table.remote_path = object_storage.obj_path
     AND object_table.disk_name = '{disk_conf.name}'
     """
     logging.info("Antijoin query: %s", antijoin_query)

     if dry_run:
-        click.echo("Counting orphaned objects...")
+        logging.info("Counting orphaned objects...")
     else:
-        click.echo("Deleting orphaned objects...")
+        logging.info("Deleting orphaned objects...")

     deleted = 0
+    total_size = 0
     with TemporaryFile() as keys_file:
         with execute_query(
             ctx, antijoin_query, stream=True, format_="TabSeparated"
@@ -210,11 +228,51 @@ def _clean_object_storage(
                 keys_file.write(chunk)

         keys_file.seek(0)  # rewind file pointer to the beginning
-        keys = (line.decode().strip() for line in keys_file)
-        deleted = cleanup_s3_object_storage(disk_conf, keys, dry_run)

-    click.echo(
-        f"{'Would delete' if dry_run else 'Deleted'} {deleted} objects from bucket [{disk_conf.bucket_name}] with prefix {prefix}"
+        # Generator producing keys from the temporary file while counting statistics
+        def keys():
+            nonlocal deleted, total_size

Contributor:
As for me, this is non-trivial logic. Can we count the total size and the number of deleted objects inside cleanup_s3_object_storage and use its return value for the statistics we collect? We also already have logic to count the number of deleted objects:

for chunk in chunked(keys, BULK_DELETE_CHUNK_SIZE):
    if not dry_run:
        _bulk_delete(bucket, chunk)
    deleted += len(chunk)
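
A minimal sketch of the refactor being suggested here — counting both statistics inside the cleanup helper and returning them — assuming the names from the snippet above (chunked, _bulk_delete, BULK_DELETE_CHUNK_SIZE); the signature and return-value shape are hypothetical, not the module's actual API:

from itertools import islice
from typing import Iterable, Iterator, List, Tuple

BULK_DELETE_CHUNK_SIZE = 1000  # assumed value; the real constant lives in the module


def chunked(iterable: Iterable, size: int) -> Iterator[List]:
    # Yield successive lists of at most `size` items.
    it = iter(iterable)
    while chunk := list(islice(it, size)):
        yield chunk


def cleanup_s3_object_storage(disk_conf, items, dry_run=False) -> Tuple[int, int]:
    # `items` yields objects carrying `path` and `size` (like ObjListItem above).
    deleted = 0
    total_size = 0
    for chunk in chunked(items, BULK_DELETE_CHUNK_SIZE):
        if not dry_run:
            # _bulk_delete is the existing helper quoted in the comment above.
            _bulk_delete(disk_conf.bucket_name, [item.path for item in chunk])
        deleted += len(chunk)
        total_size += sum(item.size for item in chunk)
    return deleted, total_size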

Contributor (Author):

I partially agree. I considered doing that, but then decided not to count size statistics in the cleanup function. I'll do it now.

Contributor (Author):

Done
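
For reference, the pattern this commit uses — a generator that updates counters via nonlocal while streaming keys to its consumer — reduced to a self-contained toy (illustrative names only, not code from this PR):

def consume(keys):
    # Stand-in for cleanup_s3_object_storage: just drains the iterator.
    for _ in keys:
        pass


def run(lines):
    deleted = 0
    total_size = 0

    def keys():
        # Counters update as the consumer pulls items from the generator.
        nonlocal deleted, total_size
        for line in lines:
            path, size = line.split("\t")
            yield path
            deleted += 1
            total_size += int(size)

    consume(keys())
    return deleted, total_size


print(run(["a\t1", "b\t2"]))  # -> (2, 3)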

+            for line in keys_file:
+                obj = ObjListItem.from_tab_separated(line.decode().strip())
+                yield obj.path
+                deleted += 1
+                total_size += obj.size
+
+        cleanup_s3_object_storage(disk_conf, keys(), dry_run)
+
+    logging.info(
+        "%s %s objects with total size %s from bucket [%s] with prefix %s",
+        "Would delete" if dry_run else "Deleted",
+        deleted,
+        format_size(total_size, binary=True),
+        disk_conf.bucket_name,
+        prefix,
     )
+    _print_response(ctx, dry_run, deleted, total_size)


+def _print_response(ctx: Context, dry_run: bool, deleted: int, total_size: int) -> None:
+    """
+    Output the result of the cleanup.
+    """
+    # List of dicts for print_response()
+    clean_stats = [
+        {"WouldDelete" if dry_run else "Deleted": deleted, "TotalSize": total_size}
+    ]
+
+    def _table_formatter(stats):
+        result = {}
+
+        if "Deleted" in stats:
+            result["Deleted"] = stats["Deleted"]
+        if "WouldDelete" in stats:
+            result["WouldDelete"] = stats["WouldDelete"]
+        result["TotalSize"] = format_size(stats["TotalSize"], binary=True)
+
+        return result
+
+    print_response(
+        ctx, clean_stats, default_format="table", table_formatter=_table_formatter
+    )


@@ -228,7 +286,7 @@ def _traverse_object_storage(
"""
Traverse S3 disk's bucket and put object names to the ClickHouse table.
"""
obj_paths_batch = []
obj_paths_batch: List[ObjListItem] = []
counter = 0
now = datetime.now(timezone.utc)

@@ -240,7 +298,7 @@ def _traverse_object_storage(
         if from_time is not None and obj.last_modified < now - from_time:
             continue

-        obj_paths_batch.append(obj.key)
+        obj_paths_batch.append(ObjListItem(obj.key, obj.size))
         counter += 1
         if len(obj_paths_batch) >= INSERT_BATCH_SIZE:
             _insert_listing_batch(ctx, obj_paths_batch, listing_table)
@@ -250,19 +308,19 @@
     if obj_paths_batch:
         _insert_listing_batch(ctx, obj_paths_batch, listing_table)

-    click.echo(f"Collected {counter} objects")
+    logging.info("Collected %s objects", counter)


 def _insert_listing_batch(
-    ctx: Context, obj_paths_batch: List[str], listing_table: str
+    ctx: Context, obj_paths_batch: List[ObjListItem], listing_table: str
 ) -> None:
     """
     Insert a batch of object names into the listing table.
     """
-    batch_values = ",".join(f"('{obj_path}')" for obj_path in obj_paths_batch)
+    batch_values = ",".join(f"('{item.path}',{item.size})" for item in obj_paths_batch)
     execute_query(
         ctx,
-        f"INSERT INTO {listing_table} (obj_path) VALUES {batch_values}",
+        f"INSERT INTO {listing_table} (obj_path, obj_size) VALUES {batch_values}",
         format_=None,
     )

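For context on the row format these changes introduce: each listing-table row is a tab-separated path and size, which the new ObjListItem.from_tab_separated helper parses. A quick illustration (the path and size here are hypothetical):

item = ObjListItem.from_tab_separated("data/cluster1/shard1/abc\t1024")
assert item.path == "data/cluster1/shard1/abc"
assert item.size == 1024
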
72 changes: 42 additions & 30 deletions tests/features/object_storage.feature
@@ -21,37 +21,41 @@ Feature: chadmin object-storage commands
   Scenario: Dry-run clean on one replica with guard period
     When we execute command on clickhouse01
     """
-    chadmin object-storage clean --dry-run
+    chadmin --format yaml object-storage clean --dry-run
     """
     Then we get response contains
     """
-    Would delete 0 objects from bucket [cloud-storage-test]
+    - WouldDelete: 0
+      TotalSize: 0
     """
     When we execute command on clickhouse01
     """
-    chadmin object-storage clean --to-time 0h --dry-run
+    chadmin --format yaml object-storage clean --to-time 0h --dry-run
     """
     Then we get response matches
     """
-    Would delete [1-9][0-9]* objects from bucket \[cloud-storage-test\]
+    - WouldDelete: [1-9][0-9]*
+      TotalSize: [1-9][0-9]*
     """

   Scenario: Dry-run clean on cluster with guard period
     When we execute command on clickhouse01
     """
-    chadmin object-storage clean --dry-run --on-cluster
+    chadmin --format yaml object-storage clean --dry-run --on-cluster
     """
     Then we get response contains
     """
-    Would delete 0 objects from bucket [cloud-storage-test]
+    - WouldDelete: 0
+      TotalSize: 0
     """
     When we execute command on clickhouse01
     """
-    chadmin object-storage clean --to-time 0h --dry-run --on-cluster
+    chadmin --format yaml object-storage clean --to-time 0h --dry-run --on-cluster
     """
     Then we get response contains
     """
-    Would delete 0 objects from bucket [cloud-storage-test]
+    - WouldDelete: 0
+      TotalSize: 0
     """

   Scenario: Clean orphaned objects
@@ -63,86 +67,93 @@
"""
When we execute command on clickhouse01
"""
chadmin object-storage clean --dry-run --to-time 0h --on-cluster
chadmin --format yaml object-storage clean --dry-run --to-time 0h --on-cluster
"""
Then we get response contains
"""
Would delete 1 objects from bucket [cloud-storage-test]
- WouldDelete: 1
TotalSize: 1
"""
When we execute command on clickhouse01
"""
chadmin object-storage clean --to-time 0h --on-cluster
chadmin --format yaml object-storage clean --to-time 0h --on-cluster
"""
Then we get response contains
"""
Deleted 1 objects from bucket [cloud-storage-test]
- Deleted: 1
TotalSize: 1
"""
And path does not exist in S3
"""
bucket: cloud-storage-test
path: /data/orpaned_object.tsv
"""

   Scenario: Clean many orphaned objects
     When we put 100 objects in S3
     """
     bucket: cloud-storage-test
     path: /data/orpaned_object-{}
-    data: '1'
+    data: '10'
     """
     When we execute command on clickhouse01
     """
-    chadmin object-storage clean --dry-run --to-time 0h --on-cluster
+    chadmin --format yaml object-storage clean --dry-run --to-time 0h --on-cluster
     """
     Then we get response contains
     """
-    Would delete 100 objects from bucket [cloud-storage-test]
+    - WouldDelete: 100
+      TotalSize: 200
     """
     When we execute command on clickhouse01
     """
-    chadmin object-storage clean --to-time 0h --on-cluster
+    chadmin --format yaml object-storage clean --to-time 0h --on-cluster
     """
     Then we get response contains
     """
-    Deleted 100 objects from bucket [cloud-storage-test]
+    - Deleted: 100
+      TotalSize: 200
     """
     When we execute command on clickhouse01
     """
-    chadmin object-storage clean --to-time 0h --dry-run --on-cluster
+    chadmin --format yaml object-storage clean --to-time 0h --dry-run --on-cluster
     """
     Then we get response contains
     """
-    Would delete 0 objects from bucket [cloud-storage-test]
+    - WouldDelete: 0
+      TotalSize: 0
     """

   Scenario: Clean orphaned objects with prefix
     When we put object in S3
     """
     bucket: cloud-storage-test
     path: /data_1/orpaned_object.tsv
-    data: '1'
+    data: '10'
     """
     When we put object in S3
     """
     bucket: cloud-storage-test
     path: /data_2/orpaned_object.tsv
-    data: '1'
+    data: '100'
     """
     When we execute command on clickhouse01
     """
-    chadmin object-storage clean --dry-run --to-time 0h --on-cluster --prefix "data_1"
+    chadmin --format yaml object-storage clean --dry-run --to-time 0h --on-cluster --prefix "data_1"
     """
     Then we get response contains
     """
-    Would delete 1 objects from bucket [cloud-storage-test]
+    - WouldDelete: 1
+      TotalSize: 2
     """
     When we execute command on clickhouse01
     """
-    chadmin object-storage clean --to-time 0h --on-cluster --prefix "data_1"
+    chadmin --format yaml object-storage clean --to-time 0h --on-cluster --prefix "data_1"
     """
     Then we get response contains
     """
-    Deleted 1 objects from bucket [cloud-storage-test]
+    - Deleted: 1
+      TotalSize: 2
     """
     And path does not exist in S3
     """
@@ -151,9 +162,10 @@
"""
When we execute command on clickhouse01
"""
chadmin object-storage clean --dry-run --to-time 0h --on-cluster --prefix "data_2"
chadmin --format yaml object-storage clean --dry-run --to-time 0h --on-cluster --prefix "data_2"
"""
Then we get response contains
"""
Would delete 1 objects from bucket [cloud-storage-test]
"""
- WouldDelete: 1
TotalSize: 3
"""