Skip to content

Commit

Permalink
Merge pull request #950 from saraburns1/remove_old_assets
Browse files Browse the repository at this point in the history
feat: Remove unused aspects assets on import
  • Loading branch information
Cristhian Garcia authored Oct 4, 2024
2 parents 88fd0ed + ab20125 commit 79de281
Show file tree
Hide file tree
Showing 6 changed files with 210 additions and 101 deletions.
5 changes: 5 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,11 @@ asset by editing the asset in Superset and selecting "Save As" to save it to a n
# Note: If you are using custom assets you will need to rebuild your aspects-superset
# image on your local machine with `tutor images build aspects-superset --no-cache`.

Assets (charts/datasets) created for Aspects that are no longer used can be listed in
`aspects_asset_list.yaml`. These assets & any translated assets created from them,
are deleted from Superset during `init` (specifically `import-assets`). The corresponding
YAML files are deleted during `import_superset_zip` or and `check_superset_assets`.

Sharing Charts and Dashboards
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Expand Down
173 changes: 95 additions & 78 deletions tutoraspects/asset_command_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,18 @@

FILE_NAME_ATTRIBUTE = "_file_name"

PLUGIN_PATH = os.path.dirname(os.path.abspath(__file__))
PLUGIN_PATH = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "templates", "aspects"
)
ASPECT_ASSET_LIST = os.path.join(
PLUGIN_PATH,
"apps",
"superset",
"pythonpath",
"aspects_asset_list.yaml",
)
ASSETS_PATH = os.path.join(
PLUGIN_PATH,
"templates",
"aspects",
"build",
"aspects-superset",
"openedx-assets",
Expand Down Expand Up @@ -465,116 +472,126 @@ def deduplicate_superset_assets(echo):
echo()
echo(click.style(f"{err} errors found!", fg="red"))

echo("Deduplication complete.")


def check_asset_names(echo):
"""
Warn about any duplicate asset names.
"""
echo("Looking for duplicate names...")
warn = 0

names = set()
for file_name, asset in _get_asset_files():
for k in ("slice_name", "dashboard_title", "database_name"):
if k in asset:
if asset[k] in names:
warn += 1
echo(
f"WARNING: Duplicate name {asset[k]} in {file_name}, this "
f"could confuse users, consider changing it."
)
names.add(asset[k])
break

echo(f"{warn} duplicate names detected.")
echo("De-duplication complete.")


def _get_all_chart_dataset_uuids():
def _get_all_uuids():
"""
Return the UUIDs of all datasets and charts in our file assets.
Return the UUIDs of all assets.
"""
all_dataset_uuids = {}
all_chart_uuids = {}
all_uuids = {"charts": {}, "datasets": {}}

# First get all known uuid's
for _, asset in _get_asset_files():
for file_path, asset in _get_asset_files():
if "slice_name" in asset:
all_chart_uuids[asset["uuid"]] = asset["slice_name"]
all_uuids["charts"][asset["uuid"]] = {
"name": asset["_file_name"],
"file_path": file_path,
}
elif "table_name" in asset:
all_dataset_uuids[asset["uuid"]] = asset["table_name"]
all_uuids["datasets"][asset["uuid"]] = {
"name": asset["table_name"],
"file_path": file_path,
}

return all_dataset_uuids, all_chart_uuids
return all_uuids


def _get_used_chart_dataset_uuids():
def _get_used_uuids():
"""
Return the UUIDs of all datasets and charts actually used in our file assets.
"""
used_dataset_uuids = set()
used_chart_uuids = set()
used_uuids = {"charts": set(), "datasets": set()}

for _, asset in _get_asset_files():
if "dashboard_title" in asset:
filters = asset["metadata"].get("native_filter_configuration", [])

for filter_config in filters:
for filter_dataset in filter_config.get("target", {}).get(
"datasetUuid", []
):
used_dataset_uuids.add(filter_dataset)
for item in filter_config.get("targets", {}):
if item.get("datasetUuid"):
used_uuids["datasets"].add(item.get("datasetUuid"))

for pos in asset["position"]:
if pos.startswith("CHART-"):
slice_uuid = asset["position"][pos]["meta"].get("uuid")

if slice_uuid:
used_chart_uuids.add(slice_uuid)
used_uuids["charts"].add(asset["position"][pos]["meta"].get("uuid"))

if "slice_name" in asset:
dataset_uuid = asset["dataset_uuid"]
used_dataset_uuids.add(dataset_uuid)
used_uuids["datasets"].add(asset["dataset_uuid"])

return used_dataset_uuids, used_chart_uuids
return used_uuids


def check_orphan_assets(echo):
def _find_unused_assets():
"""
Warn about any potentially unused assets.
Find potentially unused assets.
UUIDs listed as 'ignored' in aspects_asset_list.yaml are owned
by Aspects and will be removed from the list of potential unused assets.
"""
echo("Looking for potentially orphaned assets...")
all_uuids = _get_all_uuids()
used_uuids = _get_used_uuids()

all_dataset_uuids, all_chart_uuids = _get_all_chart_dataset_uuids()
used_dataset_uuids, used_chart_uuids = _get_used_chart_dataset_uuids()
# Remove uuids from 'all' list that are in used list
for asset_type, uuids in used_uuids.items():
for uuid in uuids or []:
try:
all_uuids[asset_type].pop(uuid)
except KeyError:
click.echo(
click.style(
f"WARNING: {asset_type} {uuid} used but not found!", fg="red"
)
)

for k in used_dataset_uuids:
try:
all_dataset_uuids.pop(k)
except KeyError:
click.echo(
click.style(f"WARNING: Dataset {k} used nut not found!", fg="red")
)
# Remove uuids from 'all' list that are in ignored yaml
with open(ASPECT_ASSET_LIST, "r", encoding="utf-8") as file:
aspects_assets = yaml.safe_load(file)

# Remove the "Query performance" chart from the list, it's needed for
# the performance_metrics script, but not in any dashboard.
all_chart_uuids.pop("bb13bb31-c797-4ed3-a7f9-7825cc6dc482", None)
ignored_uuids = aspects_assets.get("ignored_uuids")
for asset_type in ignored_uuids:
for uuid in ignored_uuids[asset_type] or []:
all_uuids[asset_type].pop(uuid, None)

for k in used_chart_uuids:
try:
all_chart_uuids.pop(k)
except KeyError:
click.echo(click.style(f"WARNING: Chart {k} used nut not found!", fg="red"))
return all_uuids

echo()

if all_dataset_uuids:
echo(click.style("Potentially unused datasets detected:", fg="yellow"))
echo("\n".join(sorted(all_dataset_uuids.values())))
def delete_aspects_unused_assets(echo):
"""
Warn about any potentially unused assets AND
delete any unused chart and dataset yamls whose UUIDs are listed in
aspects_assets_list.yaml - these are owned by Aspects and can safely
be deleted.
"""
unused_uuids = _find_unused_assets()

count_unused_uuids = sum(len(uuids) for asset_type, uuids in unused_uuids.items())
if count_unused_uuids:
with open(ASPECT_ASSET_LIST, "r", encoding="utf-8") as file:
aspects_assets = yaml.safe_load(file)

unused_aspects_uuids = aspects_assets.get("unused_uuids")
for asset_type in unused_aspects_uuids:
for uuid in unused_aspects_uuids[asset_type] or []:
if uuid in unused_uuids[asset_type]:
data = unused_uuids[asset_type][uuid]
echo(
f"Deleting unused {asset_type} yaml {data.get('name')} (UUID: {uuid})"
)
os.remove(data.get("file_path"))
unused_uuids[asset_type].pop(uuid)

new_count_unused_uuids = sum(
len(uuids) for asset_type, uuids in unused_uuids.items()
)

if all_chart_uuids:
echo(click.style("Potentially unused charts detected:", fg="yellow"))
echo("\n".join(sorted(all_chart_uuids.values())))
if new_count_unused_uuids:
echo(click.style("Potentially unused assets detected:", fg="yellow"))
echo(
click.style(
"Add the UUIDs to aspects_asset_list.yaml to be deleted", fg="green"
)
)

if not all_dataset_uuids and not all_chart_uuids:
echo(f"{len(all_chart_uuids) + len(all_dataset_uuids)} orphans detected.")
for asset_type, uuids in unused_uuids.items():
for uuid, data in uuids.items():
echo(f'{asset_type} {data.get("name")} (UUID: {uuid})')
13 changes: 3 additions & 10 deletions tutoraspects/commands_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,9 @@
from tutoraspects.asset_command_helpers import (
ASSETS_PATH,
SupersetCommandError,
check_asset_names,
check_orphan_assets,
deduplicate_superset_assets,
import_superset_assets,
delete_aspects_unused_assets,
)


Expand Down Expand Up @@ -366,9 +365,7 @@ def serialize_zip(file, base_assets_path):

click.echo()
deduplicate_superset_assets(click.echo)

click.echo()
check_asset_names(click.echo)
delete_aspects_unused_assets(click.echo)

click.echo()
click.echo("Asset merge complete!")
Expand All @@ -387,11 +384,7 @@ def check_superset_assets():
Deduplicate assets by UUID, and check for duplicate asset names.
"""
deduplicate_superset_assets(click.echo)

click.echo()
check_asset_names(click.echo)
click.echo()
check_orphan_assets(click.echo)
delete_aspects_unused_assets(click.echo)

click.echo()
click.echo(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
unused_uuids:
datasets:
charts:
- a433e3cc-8ed5-454a-8b17-5dd75cfc84e4 # Course Info
- e0098cfe-a312-4c49-8efd-7e74256b6ea4 # Course info
- d3ae0546-37a8-4841-a57b-8087a6c33049 # Evolution of engagement
- 2f2af8b0-94ae-4300-b71f-3bd7f9fc127c # Enrollment counts
- bd37be7f-6672-4dca-80ae-a54f69d169da # Enrollment counts
- dde44a03-649f-4d77-990b-a95be204e1ba # Learner performance
- 4e48b8f9-e757-4263-a9d7-d18018620a24 # Learner performance
- 4250c976-a9b7-43ff-b5ad-8dd00a5acef7 # Learner performance breakdown
- 9c3f7291-1bd9-4b2f-abc0-472aad3aff06 # Learner performance breakdown
- 8b0535a8-a43f-49bf-9d50-439a16bd3f74 # Video engagement

ignored_uuids:
datasets:
charts:
- bb13bb31-c797-4ed3-a7f9-7825cc6dc482 # "Query performance" needed for the performance_metrics script
Loading

0 comments on commit 79de281

Please sign in to comment.