Skip to content

Commit

Permalink
Merge pull request #168 from LSSTDESC/u/stuart/update_delete_function
Browse files Browse the repository at this point in the history
Update delete function
  • Loading branch information
stuartmcalpine authored Nov 29, 2024
2 parents d359800 + 673bc1a commit 18961ab
Show file tree
Hide file tree
Showing 10 changed files with 201 additions and 32 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
## Version 1.0.5

Update delete functionality

- The `delete()` function now takes `name`, `version_string`, `owner` and
`owner_type` as arguments, rather than simply the `dataset_id`.
- One can still delete by the `dataset_id` using the CLI, which now includes a
confirmation step.

## Version 1.0.4

Make more options for querying
Expand Down
6 changes: 3 additions & 3 deletions docs/source/tutorial_notebooks/register_datasets.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,7 @@
"source": [
"## 6) Deleting a dataset in the dataregistry\n",
"\n",
"To delete a dataset entry from the dataregistry we call the .delete() function which accepts one argument, the `dataset_id` of the entry you wish to delete, e.g.,"
"To delete a dataset entry from the dataregistry we call the `.delete()` function which accepts four arguments, the `name`, `version_string`, `owner` and `owner_type` of the entry you wish to delete, e.g.,"
]
},
{
Expand All @@ -453,8 +453,8 @@
},
"outputs": [],
"source": [
"# Delete dataset with entry ID == dataset_id\n",
"datareg.Registrar.dataset.delete(dataset_id)"
"# Delete a dataset\n",
"datareg.Registrar.dataset.delete(\"nersc_tutorial:my_first_desc_dataset\", \"1.0.0\", OWNER, \"group\")"
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion src/dataregistry/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.0.4"
__version__ = "1.0.5"
80 changes: 71 additions & 9 deletions src/dataregistry/registrar/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
_ILLEGAL_NAME_CHAR = ["$", "*", "&", "/", "?", "\\", " "]
_ILLEGAL_RELPATH_CHAR = ["$", "*", "&", "?", "\\", " "]


class DatasetTable(BaseTable):
def __init__(self, db_connection, root_dir, owner, owner_type, execution_table):
super().__init__(db_connection, root_dir, owner, owner_type)
Expand Down Expand Up @@ -77,9 +78,15 @@ def _validate_register_inputs(

# If no data is being copied from an `old_location`, `relative_path` is
# required
if kwargs_dict["old_location"] is None and kwargs_dict["location_type"] == "dataregistry" and kwargs_dict["relative_path"] is None:
raise ValueError("A `relative_path` must be passed when `old_location` is None "
"i.e., the data is assumed to be already within the `root_dir`")
if (
kwargs_dict["old_location"] is None
and kwargs_dict["location_type"] == "dataregistry"
and kwargs_dict["relative_path"] is None
):
raise ValueError(
"A `relative_path` must be passed when `old_location` is None "
"i.e., the data is assumed to be already within the `root_dir`"
)

# If external dataset, check for either a `url` or `contact_email`
if kwargs_dict["location_type"] == "external":
Expand All @@ -92,7 +99,6 @@ def _validate_register_inputs(
# Only needed for `register` function, `replace` has no `relative_path`
# argument as this cannot be changed from the original `register`
if "relative_path" in kwargs_dict.keys():

if kwargs_dict["relative_path"] is not None:
if type(kwargs_dict["relative_path"]) != str:
raise ValueError("Relative path is not a valid string")
Expand All @@ -104,12 +110,16 @@ def _validate_register_inputs(
# Make sure `relative_path` is legal (i.e., no illegal characters)
for i_char in _ILLEGAL_RELPATH_CHAR:
if i_char in kwargs_dict["relative_path"]:
raise ValueError(f"Cannot have character {i_char} in relative path")
raise ValueError(
f"Cannot have character {i_char} in relative path"
)

# The '.gen_paths' directory is reserved for autogenerated paths
if kwargs_dict["relative_path"].startswith(".gen_paths"):
raise ValueError("Can't start relative path with '.gen_paths', "
"this is reserved for auto-generated `relative_paths")
raise ValueError(
"Can't start relative path with '.gen_paths', "
"this is reserved for auto-generated `relative_paths"
)

# Assign the `owner_type`
if kwargs_dict["owner_type"] is None:
Expand Down Expand Up @@ -622,7 +632,7 @@ def replace(

# Delete the old data (if not already deleted)
if not get_dataset_status(previous_datasets[-1].status, "deleted"):
self.delete(previous_datasets[-1].dataset_id)
self._delete_by_id(previous_datasets[-1].dataset_id)

# Register the new row in the dataset table
prim_key = self._register_row(name, version, kwargs_dict)
Expand Down Expand Up @@ -807,17 +817,52 @@ def _find_previous(

return rows

def delete(self, dataset_id):
def delete(self, name, version_string, owner, owner_type, confirm=False):
"""
Delete an dataset entry from the DESC data registry.
This will also remove the raw data from the root dir, but the dataset
entry remains in the registry (now with an updated `status` field).
Parameters
----------
name/version_string/owner/owner_type : str
Identifiers for dataset we want to delete from the registry
confirm : bool
Will ask for a confirmation
"""

# Find the dataset entry with this combination
previous = self._find_previous(
name, version_string, owner, owner_type
)

if len(previous) == 0:
raise ValueError(
"No datasets found with combination "
f"{name} {version_string} {owner} {owner_type}"
)

# Delete the entry (-1 index is the latest replace_iteration)
self._delete_by_id(previous[-1].dataset_id, confirm=confirm)

def _delete_by_id(self, dataset_id, confirm=False):
"""
Delete an dataset entry from the DESC data registry.
This will also remove the raw data from the root dir, but the dataset
entry remains in the registry (now with an updated `status` field).
This is not designed to be called directly, as only needing to specify
the `dataset_id` is loose, instead users should call the `delete()`
function.
Parameters
----------
dataset_id : int
Dataset we want to delete from the registry
confirm : bool
Will ask for a confirmation
"""

# First make sure the given dataset id is in the registry
Expand All @@ -828,6 +873,23 @@ def delete(self, dataset_id):
if get_dataset_status(previous_dataset.status, "deleted"):
raise ValueError(f"Dataset {dataset_id} has previously been deleted")

# Confirm the user wants to delete this dataset
if confirm:
confirmation = (
input(
f"Confirm delete of\n"
f"dataset_id: {dataset_id}\nname: {previous_dataset.name}\n"
f"version: {previous_dataset.version_string}"
f"\nowner: {previous_dataset.owner}\n"
f"owner_type: {previous_dataset.owner_type} [y/n] "
)
.strip()
.lower()
)

if confirmation != "y":
return

# Update the status of the dataset to deleted
with self._engine.connect() as conn:
update_stmt = (
Expand Down
30 changes: 27 additions & 3 deletions src/dataregistry_cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,10 +299,34 @@ def get_parser():
# ----------------

# Delete a dataset.
arg_delete_dataset = arg_delete_sub.add_parser("dataset", help="Delete a dataset")
arg_delete_dataset.add_argument(
arg_delete_dataset_id = arg_delete_sub.add_parser(
"dataset_by_id", help="Delete a dataset using the dataset_id"
)
arg_delete_dataset_id.add_argument(
"dataset_id", help="The dataset_id you wish to delete", type=int
)
_add_generic_arguments(arg_delete_dataset_id)

arg_delete_dataset = arg_delete_sub.add_parser(
"dataset",
help="Delete a dataset using the name/version_string/owner/owner_type",
)
arg_delete_dataset.add_argument(
"name", help="The dataset name for the dataset you wish to delete", type=str
)
arg_delete_dataset.add_argument(
"version_string",
help="The dataset version_string for the dataset you wish to delete",
type=str,
)
arg_delete_dataset.add_argument(
"owner", help="The dataset owner for the dataset you wish to delete", type=str
)
arg_delete_dataset.add_argument(
"owner_type",
help="The dataset owner_type for the dataset you wish to delete",
type=str,
)
_add_generic_arguments(arg_delete_dataset)

return parser
Expand Down Expand Up @@ -333,7 +357,7 @@ def main(cmd=None):

# Delete an entry
elif args.subcommand == "delete":
if args.delete_type == "dataset":
if args.delete_type in ["dataset", "dataset_by_id"]:
delete_dataset(args)

# Query database entries
Expand Down
10 changes: 9 additions & 1 deletion src/dataregistry_cli/delete.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,12 @@ def delete_dataset(args):
site=args.site,
)

datareg.Registrar.dataset.delete(args.dataset_id)
# Deleting directly using the dataset ID
if hasattr(args, "dataset_id"):
datareg.Registrar.dataset._delete_by_id(args.dataset_id, confirm=True)

# Deleting based on name/version/owner/owner_type
else:
datareg.Registrar.dataset.delete(
args.name, args.version_string, args.owner, args.owner_type, confirm=True
)
54 changes: 51 additions & 3 deletions tests/end_to_end_tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,7 @@ def test_production_entry(dummy_file):
assert len(results["dataset.name"]) == 1, "Bad result from query dcli3"
assert results["dataset.version_string"][0] == "0.1.2"


def test_delete_dataset(dummy_file):
def test_delete_dataset_by_id(dummy_file,monkeypatch):
"""Make a simple entry, then delete it"""

# Establish connection to database
Expand All @@ -111,8 +110,9 @@ def test_delete_dataset(dummy_file):
d_id = results["dataset.dataset_id"][0]

# Delete the dataset
cmd = f"delete dataset {d_id}"
cmd = f"delete dataset_by_id {d_id}"
cmd += f" --schema {DEFAULT_SCHEMA_WORKING} --root_dir {str(tmp_root_dir)}"
monkeypatch.setattr('builtins.input', lambda _: "y")
cli.main(shlex.split(cmd))

# Check
Expand All @@ -134,6 +134,54 @@ def test_delete_dataset(dummy_file):
assert getattr(r, "dataset.delete_uid") is not None


def test_delete_dataset_by_name(dummy_file,monkeypatch):
"""Make a simple entry, then delete it"""

# Establish connection to database
tmp_src_dir, tmp_root_dir = dummy_file

DNAME = "my_cli_dataset_to_delete2"
DVERSION = "0.0.1"
DOWNER = "delete_owner"
DOWNER_TYPE = "user"

# Register a dataset
cmd = f"register dataset {DNAME} {DVERSION} --location_type dummy"
cmd += f" --schema {DEFAULT_SCHEMA_WORKING} --root_dir {str(tmp_root_dir)}"
cmd += f" --owner {DOWNER} --owner_type {DOWNER_TYPE}"
monkeypatch.setattr('builtins.input', lambda _: "y")
cli.main(shlex.split(cmd))

# Find the dataset id
datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING)
f = datareg.Query.gen_filter("dataset.name", "==", DNAME)
results = datareg.Query.find_datasets(["dataset.dataset_id"], [f])
assert len(results["dataset.dataset_id"]) == 1, "Bad result from query dcli4"
d_id = results["dataset.dataset_id"][0]

# Delete the dataset
cmd = f"delete dataset {DNAME} {DVERSION} {DOWNER} {DOWNER_TYPE}"
cmd += f" --schema {DEFAULT_SCHEMA_WORKING} --root_dir {str(tmp_root_dir)}"
cli.main(shlex.split(cmd))

# Check
datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING)
f = datareg.Query.gen_filter("dataset.name", "==", DNAME)
results = datareg.Query.find_datasets(
[
"dataset.dataset_id",
"dataset.delete_date",
"dataset.delete_uid",
"dataset.status",
],
[f],
return_format="cursorresult",
)
for r in results:
assert get_dataset_status(getattr(r, "dataset.status"), "deleted")
assert getattr(r, "dataset.delete_date") is not None
assert getattr(r, "dataset.delete_uid") is not None

def test_dataset_entry_with_keywords(dummy_file):
"""Make a dataset with some keywords tagged"""

Expand Down
38 changes: 28 additions & 10 deletions tests/end_to_end_tests/test_delete_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,22 +18,27 @@ def test_delete_dataset_bad_entry(dummy_file):

# Make sure we raise an exception trying to delete a dataset that doesn't exist
with pytest.raises(ValueError, match="not found in"):
datareg.Registrar.dataset.delete(10000)
datareg.Registrar.dataset._delete_by_id(10000)


@pytest.mark.parametrize(
"is_dummy,dataset_name",
"is_dummy,dataset_name,delete_by_id",
[
(True, "dummy_dataset_to_delete"),
(False, "real_dataset_to_delete"),
(False, "real_directory_to_delete"),
(True, "dummy_dataset_to_delete_method1", True),
(False, "real_dataset_to_delete_method1", True),
(False, "real_directory_to_delete_method1", True),
(True, "dummy_dataset_to_delete_method2", False),
(False, "real_dataset_to_delete_method2", False),
(False, "real_directory_to_delete_method2", False),
],
)
def test_delete_dataset_entry(dummy_file, is_dummy, dataset_name):
def test_delete_dataset_entry(dummy_file, is_dummy, dataset_name, delete_by_id):
"""
Make a simple entry, then delete it, then check it was deleted.
Does this for a dummy dataset and a real one.
Tests both delete functions `delete()` and `_delete_by_id()`.
"""

# Establish connection to database
Expand All @@ -53,17 +58,27 @@ def test_delete_dataset_entry(dummy_file, is_dummy, dataset_name):
assert os.path.isdir(data_path)
location_type = "dataregistry"

DNAME = f"DESC:datasets:{dataset_name}"
DVERSION = "0.0.1"
DOWNER_TYPE = "user"
DOWNER = "delete_owner"

# Add entry
d_id = _insert_dataset_entry(
datareg,
f"DESC:datasets:{dataset_name}",
"0.0.1",
DNAME,
DVERSION,
location_type=location_type,
old_location=data_path,
owner_type=DOWNER_TYPE,
owner=DOWNER,
)

# Now delete that entry
datareg.Registrar.dataset.delete(d_id)
if delete_by_id:
datareg.Registrar.dataset._delete_by_id(d_id)
else:
datareg.Registrar.dataset.delete(DNAME, DVERSION, DOWNER, DOWNER_TYPE)

# Check the entry was deleted
f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id)
Expand Down Expand Up @@ -101,4 +116,7 @@ def test_delete_dataset_entry(dummy_file, is_dummy, dataset_name):

# Make sure we can not delete an already deleted entry.
with pytest.raises(ValueError, match="previously been deleted"):
datareg.Registrar.dataset.delete(d_id)
if delete_by_id:
datareg.Registrar.dataset._delete_by_id(d_id)
else:
datareg.Registrar.dataset.delete(DNAME, DVERSION, DOWNER, DOWNER_TYPE)
2 changes: 1 addition & 1 deletion tests/end_to_end_tests/test_register_dataset_real_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ def test_registering_deleted_relative_path(dummy_file, link):
)

# Now delete
datareg.Registrar.dataset.delete(d2_id)
datareg.Registrar.dataset._delete_by_id(d2_id)

# Make a new entry (new name) using the original relative path
d_id = _insert_dataset_entry(
Expand Down
Loading

0 comments on commit 18961ab

Please sign in to comment.