diff --git a/CHANGELOG.md b/CHANGELOG.md index 2fe55bd2..d4b4756e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,16 @@ +## Version 1.0.3 + +Some changes to the way the `relative_path` is automatically generated from the +`name` and `version`. + +- All automatically generated `relative_paths` are placed in a top level + `.gen_paths` directory, e.g., + `////.gen_paths`. This is to prevent + clashes with user specified `relative_paths`. +- Single files no longer have their filename changed when automatically + generating the `relative_path`. Instead a directory containing the `name` and + `version` is created, and the file is copied there. This preserves the + filname suffix in the relative path. ## Version 1.0.2 diff --git a/docs/source/tutorial_notebooks/datasets_deeper_look.ipynb b/docs/source/tutorial_notebooks/datasets_deeper_look.ipynb index 391245e8..c4e4d606 100644 --- a/docs/source/tutorial_notebooks/datasets_deeper_look.ipynb +++ b/docs/source/tutorial_notebooks/datasets_deeper_look.ipynb @@ -180,7 +180,9 @@ "\n", "The files and directories of registered datasets are stored under a path relative to the root directory (`root_dir`), which, by default, is a shared space at NERSC.\n", "\n", - "By default, the `relative_path` is constructed from the `name` and `version`, in the format `relative_path=/. However, one can also manually select the relative_path during registration, for example" + "By default, when not manually specified, the `relative_path` is constructed from the `name` and `version`, in the format `relative_path=.gen_paths/_/`. \n", + "\n", + "One can manually select the `relative_path` during registration if they explicitly care about where the data is located relative to the `root_dir`, for example" ] }, { @@ -208,9 +210,26 @@ "source": [ "will register a dataset under the `relative_path` of `nersc_tutorial/my_desc_dataset`.\n", "\n", - "For those interested, the eventual full path for the dataset will be `////`. Naturally, the `relative_path` you select cannot already be taken by another dataset (an error will be raised in this case).\n", + "If the registered dataset was a single file, the `relative_path` will be the explicit (relative) pathname to that file, e.g., `.gen_paths/mydataset_1.0.0/myfile.txt` or `my/manual/path/myfile.txt`. If the registered dataset was a directory, the `relative_path` is the pathname to the directory containing the dataset contents.\n", + "\n", + "For those interested, the eventual full path for the dataset will be `////`. Naturally, the `relative_path` you select cannot already be taken by another dataset (an error will be raised in this case), and any manually specified `relative_path` cannot start with `.gen_paths` as this directory is reserved for autogenerated `relative_path`s.\n", + "\n", + "When you overwrite a previous dataset entry using the `replace()` function, the original `relative_path` at registration (automatically generated or manual) will be used.\n", + "\n", + "One can construct the full absolute path to a dataregistry file using the `get_dataset_absolute_path()` helper function, e.g.," + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f193290d-892a-47b6-8e91-8e94a10d506f", + "metadata": {}, + "outputs": [], + "source": [ + "# Find the full absolute path to a dataset using the dataset id\n", + "absolute_path = datareg.Query.get_dataset_absolute_path(dataset_id)\n", "\n", - "When you overwrite a previous dataset entry using the `replace()` function, the original `relative_path` at registration (automatically generated or manual) will be used." + "print(f\"The absolute path for {dataset_id} is {absolute_path}\"" ] }, { diff --git a/src/dataregistry/_version.py b/src/dataregistry/_version.py index 7863915f..976498ab 100644 --- a/src/dataregistry/_version.py +++ b/src/dataregistry/_version.py @@ -1 +1 @@ -__version__ = "1.0.2" +__version__ = "1.0.3" diff --git a/src/dataregistry/registrar/dataset.py b/src/dataregistry/registrar/dataset.py index 85034977..e7fffb25 100644 --- a/src/dataregistry/registrar/dataset.py +++ b/src/dataregistry/registrar/dataset.py @@ -24,7 +24,7 @@ from .dataset_util import set_dataset_status, get_dataset_status _ILLEGAL_NAME_CHAR = ["$", "*", "&", "/", "?", "\\", " "] - +_ILLEGAL_RELPATH_CHAR = ["$", "*", "&", "?", "\\", " "] class DatasetTable(BaseTable): def __init__(self, db_connection, root_dir, owner, owner_type, execution_table): @@ -83,6 +83,29 @@ def _validate_register_inputs( "External datasets require either a url or contact_email" ) + # Make sure the user passed `relative_path` is legal + # Only needed for `register` function, `replace` has no `relative_path` + # argument as this cannot be changed from the original `register` + if "relative_path" in kwargs_dict.keys(): + + if kwargs_dict["relative_path"] is not None: + if type(kwargs_dict["relative_path"]) != str: + raise ValueError("Relative path is not a valid string") + + # Relative path cannot start with a "/" + if kwargs_dict["relative_path"][0] == "/": + raise ValueError("Relative path cannot start with a '/'") + + # Make sure `relative_path` is legal (i.e., no illegal characters) + for i_char in _ILLEGAL_RELPATH_CHAR: + if i_char in kwargs_dict["relative_path"]: + raise ValueError(f"Cannot have character {i_char} in relative path") + + # The '.gen_paths' directory is reserved for autogenerated paths + if kwargs_dict["relative_path"].startswith(".gen_paths"): + raise ValueError("Can't start relative path with '.gen_paths', " + "this is reserved for auto-generated `relative_paths") + # Assign the `owner_type` if kwargs_dict["owner_type"] is None: if self._owner_type is not None: @@ -409,7 +432,7 @@ def register( # If `relative_path` not passed, automatically generate it if kwargs_dict["relative_path"] is None: kwargs_dict["relative_path"] = _relpath_from_name( - name, kwargs_dict["version_string"] + name, kwargs_dict["version_string"], kwargs_dict["old_location"] ) # Make sure the relative_path in the `root_dir` is avaliable diff --git a/src/dataregistry/registrar/registrar_util.py b/src/dataregistry/registrar/registrar_util.py index 1478b3fd..77f37b36 100644 --- a/src/dataregistry/registrar/registrar_util.py +++ b/src/dataregistry/registrar/registrar_util.py @@ -327,17 +327,30 @@ def _compute_checksum(file_path): raise Exception(e) -def _relpath_from_name(name, version): +def _relpath_from_name(name, version, old_location): """ Construct a relative path from the name and version of a dataset. We use this when the `relative_path` is not explicitly defined. + Every automatically generated `relative_path` is prefixed with + `.gen_paths/`, meaning that all automatically generated `relative_paths` go + into this top level folder. This is to prevent clashes with user specified + `relative_path`'s. + + The auto-generated `relative_path` will be a directory that contains the + name and version, which is where the ingested data (from `old_location`) + will eventually reside. If the data being ingested is a single file, the + `relative_path` will be the full path to the file within the registry, not + just the directory that contains the file. + Parameters ---------- name : str Dataset name version : str Dataset version + old_location : str + Path the data is coming from (needed to parse filename) Returns ------- @@ -345,4 +358,9 @@ def _relpath_from_name(name, version): Automatically generated `relative_path` """ - return f"{name}_{version}" + # For single files, scrape the filename and add it to the `relative_path` + if (old_location is not None) and os.path.isfile(old_location): + return os.path.join(".gen_paths", f"{name}_{version}", os.path.basename(old_location)) + else: + # For directories, only need the autogenerated directory name + return os.path.join(".gen_paths", f"{name}_{version}") diff --git a/tests/end_to_end_tests/test_register_dataset_dummy.py b/tests/end_to_end_tests/test_register_dataset_dummy.py index 6fa8ccc3..07f65746 100644 --- a/tests/end_to_end_tests/test_register_dataset_dummy.py +++ b/tests/end_to_end_tests/test_register_dataset_dummy.py @@ -49,7 +49,7 @@ def test_register_dataset_defaults(dummy_file): assert results["owner"][0] == os.getenv("USER") assert results["owner_type"][0] == "user" assert results["description"][0] == None - assert results["relative_path"][0] == f"{_NAME}_0.0.1" + assert results["relative_path"][0] == f".gen_paths/{_NAME}_0.0.1" assert results["data_org"][0] == "dummy" assert results["execution_id"][0] >= 0 assert results["dataset_id"][0] >= 0 @@ -192,7 +192,7 @@ def test_dataset_bumping(dummy_file, v_type, ans): # Check the result assert results["dataset.name"][0] == _NAME assert results["dataset.version_string"][0] == ans - assert results["dataset.relative_path"][0] == f"{_NAME}_{ans}" + assert results["dataset.relative_path"][0] == f".gen_paths/{_NAME}_{ans}" @pytest.mark.parametrize("owner_type", ["user", "group", "project"]) diff --git a/tests/end_to_end_tests/test_register_dataset_real_data.py b/tests/end_to_end_tests/test_register_dataset_real_data.py index 015b54ca..9e0c04de 100644 --- a/tests/end_to_end_tests/test_register_dataset_real_data.py +++ b/tests/end_to_end_tests/test_register_dataset_real_data.py @@ -146,6 +146,40 @@ def test_registering_bad_relative_path(dummy_file, link): relative_path=f"test/register/bad/relpath/{link}", ) +@pytest.mark.parametrize("link", ["file1.txt", "directory1"]) +def test_registering_bad_relative_path_2(dummy_file, link): + """ + Make sure we cannot register a datataset to a relative path that is using + the auto generated .gen_paths directry. + """ + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) + + data_path = str(tmp_src_dir / link) + + # Test going into the '.gen_paths' folder (not allowed) + with pytest.raises(ValueError, match="Can't start relative path with '.gen_paths'"): + d_id = _insert_dataset_entry( + datareg, + f"DESC:datasets:test_registering_bad_relative_path_3_{link}", + "0.0.1", + old_location=data_path, + location_type="dataregistry", + relative_path=f".gen_paths/test/register/bad/relpath/{link}", + ) + + # Test registering a file or directory as '.gen_paths' (not allowed) + with pytest.raises(ValueError, match="Can't start relative path with '.gen_paths'"): + d_id = _insert_dataset_entry( + datareg, + f"DESC:datasets:test_registering_bad_relative_path_4_{link}", + "0.0.1", + old_location=data_path, + location_type="dataregistry", + relative_path=f".gen_paths", + ) @pytest.mark.parametrize("link", ["file1.txt", "directory1"]) def test_registering_deleted_relative_path(dummy_file, link): diff --git a/tests/unit_tests/test_registrar_util.py b/tests/unit_tests/test_registrar_util.py index 67acd392..4813acfc 100644 --- a/tests/unit_tests/test_registrar_util.py +++ b/tests/unit_tests/test_registrar_util.py @@ -159,17 +159,17 @@ def test_read_file(tmpdir, nchars, max_config_length, ans): @pytest.mark.parametrize( "name,version_string,ans", [ - ("mydataset", "1.1.1", "mydataset_1.1.1"), + ("mydataset", "1.1.1", ".gen_paths/mydataset_1.1.1"), ], ) -def test_relpath_from_name(name, version_string, ans): +def test_relpath_from_name(name, version_string, ans, old_location=None): """ Test dataset path construction Datasets should come back with the format: /// """ - tmp = _relpath_from_name(name, version_string) + tmp = _relpath_from_name(name, version_string, old_location) assert tmp == ans @pytest.mark.parametrize(