From 6e5bff1c69bdba16837714a13dedb61ca0a370a7 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 9 Nov 2023 14:39:40 -0800 Subject: [PATCH 001/420] Update 'store_object' method signature with default None values --- src/hashstore/filehashstore.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 87f652e7..9732575b 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -404,8 +404,8 @@ def lookup_algo(algo): def store_object( self, - pid, - data, + pid=None, + data=None, additional_algorithm=None, checksum=None, checksum_algorithm=None, From efc116e6e2d8b132a7103b1baf7e1e5a054835c4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 9 Nov 2023 15:26:27 -0800 Subject: [PATCH 002/420] Add new private method '_store_data' --- src/hashstore/filehashstore.py | 37 ++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 9732575b..85141c2b 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -467,6 +467,43 @@ def store_object( return object_metadata + def _store_data(self, data): + """Store a temporary object to HashStore that is ready to be tagged, and return the + tmp file name and a hex digest dictionary of the default algorithms. 
+ """ + logging.debug("FileHashStore - store_object: Request to store object.") + + # Step 1: Store data + try: + # Ensure the data is a stream + stream = Stream(data) + + # Get the hex digest dictionary + with closing(stream): + ( + object_ref_pid_location, + obj_file_size, + hex_digest_dict, + ) = self._move_and_get_checksums(None, stream) + + object_metadata = ObjectMetadata( + object_ref_pid_location, obj_file_size, hex_digest_dict + ) + # The permanent address of the data stored is based on the data's checksum + cid = hex_digest_dict.get(self.algorithm) + logging.debug( + "FileHashStore - store_object: Successfully stored object with cid: %s", + cid, + ) + return object_metadata + # pylint: disable=W0718 + except Exception as err: + exception_string = ( + "FileHashStore - store_object: failed to store object." + + f" Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + def store_metadata(self, pid, metadata, format_id=None): logging.debug( "FileHashStore - store_metadata: Request to store metadata for pid: %s", pid From 3831baca91dac8f3ece25c7b77e542a269e647b1 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 9 Nov 2023 15:29:09 -0800 Subject: [PATCH 003/420] Refactor 'store_object' to only store data when pid is 'None' --- src/hashstore/filehashstore.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 85141c2b..8917843c 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -444,14 +444,17 @@ def store_object( "FileHashStore - store_object: Attempting to store object for pid: %s", pid, ) - object_metadata = self.put_object( - pid, - data, - additional_algorithm=additional_algorithm_checked, - checksum=checksum, - checksum_algorithm=checksum_algorithm_checked, - file_size_to_validate=expected_object_size, - ) + if pid is None: + object_metadata = self._store_data(data) + else: + object_metadata = 
self.put_object( + pid, + data, + additional_algorithm=additional_algorithm_checked, + checksum=checksum, + checksum_algorithm=checksum_algorithm_checked, + file_size_to_validate=expected_object_size, + ) finally: # Release pid with self.object_lock: From 90c32391f2f866b691ce8c94123a5b3fddeb333b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 10 Nov 2023 09:52:53 -0800 Subject: [PATCH 004/420] Refactor '_move_and_get_checksums' to store objects with their content identifiers --- src/hashstore/filehashstore.py | 122 ++++++++++++++++++--------------- 1 file changed, 68 insertions(+), 54 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 8917843c..16002f48 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -470,43 +470,6 @@ def store_object( return object_metadata - def _store_data(self, data): - """Store a temporary object to HashStore that is ready to be tagged, and return the - tmp file name and a hex digest dictionary of the default algorithms. - """ - logging.debug("FileHashStore - store_object: Request to store object.") - - # Step 1: Store data - try: - # Ensure the data is a stream - stream = Stream(data) - - # Get the hex digest dictionary - with closing(stream): - ( - object_ref_pid_location, - obj_file_size, - hex_digest_dict, - ) = self._move_and_get_checksums(None, stream) - - object_metadata = ObjectMetadata( - object_ref_pid_location, obj_file_size, hex_digest_dict - ) - # The permanent address of the data stored is based on the data's checksum - cid = hex_digest_dict.get(self.algorithm) - logging.debug( - "FileHashStore - store_object: Successfully stored object with cid: %s", - cid, - ) - return object_metadata - # pylint: disable=W0718 - except Exception as err: - exception_string = ( - "FileHashStore - store_object: failed to store object." 
- + f" Unexpected {err=}, {type(err)=}" - ) - logging.error(exception_string) - def store_metadata(self, pid, metadata, format_id=None): logging.debug( "FileHashStore - store_metadata: Request to store metadata for pid: %s", pid @@ -696,7 +659,7 @@ def put_object( Returns: object_metadata (ObjectMetadata): object that contains the object id, - object file size, duplicate file boolean and hex digest dictionary. + object file size and hex digest dictionary. """ stream = Stream(file) @@ -725,6 +688,61 @@ def put_object( ) return object_metadata + def _store_data(self, data): + """Store an object to HashStore and return the tmp file name and a hex digest + dictionary of the default algorithms. + + Args: + data (mixed): String or path to object. + + Raises: + IOError: If object fails to store + FileExistsError: If file already exists + + Returns: + object_metadata (ObjectMetadata): object that contains the object id, + object file size and hex digest dictionary. + """ + logging.debug("FileHashStore - store_object: Request to store object.") + + # TODO: Missing Tests + # - Test that this method returns hex digests and that they are correct + # - Test that objects are actually stored with their cid + # - Test that exception is raised when object fails to store + # - Test that exception is raised when object already exists + # - Test providing the data as a file path + # - Test providing the data as a stream + try: + # Ensure the data is a stream + stream = Stream(data) + + # Get the hex digest dictionary + with closing(stream): + ( + object_ref_pid_location, + obj_file_size, + hex_digest_dict, + ) = self._move_and_get_checksums(None, stream) + + object_metadata = ObjectMetadata( + object_ref_pid_location, obj_file_size, hex_digest_dict + ) + # The permanent address of the data stored is based on the data's checksum + cid = hex_digest_dict.get(self.algorithm) + logging.debug( + "FileHashStore - store_object: Successfully stored object with cid: %s", + cid, + ) + return 
object_metadata + # pylint: disable=W0718 + except Exception as err: + exception_string = ( + "FileHashStore - store_object: failed to store object." + + f" Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise IOError(exception_string) from err + def _move_and_get_checksums( self, pid, @@ -756,21 +774,11 @@ def _move_and_get_checksums( file_size_to_validate (bytes, optional): Expected size of object Returns: - object_metadata (tuple): object id, object file size, duplicate file - boolean and hex digest dictionary. + object_metadata (tuple): object id, object file size and hex digest dictionary. """ - entity = "objects" - object_cid = self.get_sha256_hex_digest(pid) - abs_file_path = self.build_abs_path(entity, object_cid, extension) - - # Only create tmp file to be moved if target destination doesn't exist - if os.path.isfile(abs_file_path): - exception_string = ( - "FileHashStore - _move_and_get_checksums: File already exists" - + f" for pid: {pid} at {abs_file_path}" - ) - logging.error(exception_string) - raise FileExistsError(exception_string) + # If the checksum algorithm is the same as the store algorithm, then we can + # determine whether the object exists or not to be efficient + # TODO # Create temporary file and calculate hex digests debug_msg = ( @@ -786,6 +794,11 @@ def _move_and_get_checksums( tmp_file_name, ) + # Objects are stored with their content identifier based on the store algorithm + entity = "objects" + object_cid = hex_digests.get(self.algorithm) + abs_file_path = self.build_abs_path(entity, object_cid, extension) + # Only move file if it doesn't exist. # Files are stored once and only once if not os.path.isfile(abs_file_path): @@ -850,12 +863,13 @@ def _move_and_get_checksums( raise else: # Else delete temporary file - warning_msg = ( + exception_string = ( f"FileHashStore - _move_and_get_checksums: Object exists at: {abs_file_path}," + " deleting temporary file." 
) - logging.warning(warning_msg) + logging.error(exception_string) self.delete(entity, tmp_file_name) + raise FileExistsError(exception_string) return (object_cid, tmp_file_size, hex_digests) From f6a5cd17c249fd5d74790c6db28143d4ec97baaf Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 10 Nov 2023 10:02:25 -0800 Subject: [PATCH 005/420] Update tests after store_object refactor to store with object's content identifier --- tests/test_filehashstore.py | 5 ++-- tests/test_filehashstore_interface.py | 36 +++++++++++---------------- tests/test_hashstore_client.py | 2 +- 3 files changed, 18 insertions(+), 25 deletions(-) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index a2f0fdfe..6331ba5d 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -245,7 +245,7 @@ def test_put_object_cid(pids, store): path = test_dir + pid.replace("/", "_") object_metadata = store.put_object(pid, path) object_metadata_id = object_metadata.id - assert object_metadata_id == pids[pid]["object_cid"] + assert object_metadata_id == pids[pid][store.algorithm] def test_put_object_file_size(pids, store): @@ -321,8 +321,7 @@ def test_move_and_get_checksums_id(pids, store): _, ) = store._move_and_get_checksums(pid, input_stream) input_stream.close() - object_cid = store.get_sha256_hex_digest(pid) - assert move_id == object_cid + assert move_id == pids[pid][store.algorithm] def test_move_and_get_checksums_file_size(pids, store): diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 92b125cb..329af168 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -41,7 +41,7 @@ def test_store_object(pids, store): syspath = Path(test_dir) / filename object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) - assert object_metadata.id == pids[pid]["object_cid"] + assert object_metadata.id == pids[pid][store.algorithm] assert 
store.count(entity) == 3 @@ -56,7 +56,7 @@ def test_store_object_files_path(pids, store): syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) - assert store.exists(entity, pids[pid]["object_cid"]) + assert store.exists(entity, pids[pid][store.algorithm]) assert store.count(entity) == 3 @@ -71,7 +71,7 @@ def test_store_object_files_string(pids, store): syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path_string) _metadata_cid = store.store_metadata(pid, syspath, format_id) - assert store.exists(entity, pids[pid]["object_cid"]) + assert store.exists(entity, pids[pid][store.algorithm]) assert store.count(entity) == 3 @@ -84,18 +84,17 @@ def test_store_object_files_input_stream(pids, store): input_stream = io.open(path, "rb") _object_metadata = store.store_object(pid, input_stream) input_stream.close() - object_cid = store.get_sha256_hex_digest(pid) - assert store.exists(entity, object_cid) + assert store.exists(entity, pids[pid][store.algorithm]) assert store.count(entity) == 3 def test_store_object_id(pids, store): - """Test store object returns expected id (object_cid).""" + """Test store object returns expected id.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(pid, path) - assert object_metadata.id == pids[pid]["object_cid"] + assert object_metadata.id == pids[pid][store.algorithm] def test_store_object_obj_size(pids, store): @@ -192,11 +191,10 @@ def test_store_object_additional_algorithm_hyphen_uppercase(pids, store): object_metadata = store.store_object(pid, path, algorithm_with_hyphen_and_upper) sha256_cid = object_metadata.hex_digests.get("sha384") assert sha256_cid == pids[pid]["sha384"] - object_cid = store.get_sha256_hex_digest(pid) - assert store.exists(entity, object_cid) + assert store.exists(entity, pids[pid][store.algorithm]) -def 
test_store_object_additional_algorithm_hyphen_lowercase(store): +def test_store_object_additional_algorithm_hyphen_lowercase(pids, store): """Test store object with additional algorithm in lowercase.""" test_dir = "tests/testdata/" entity = "objects" @@ -209,11 +207,10 @@ def test_store_object_additional_algorithm_hyphen_lowercase(store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) assert additional_sha3_256_hex_digest == sha3_256_checksum - object_cid = store.get_sha256_hex_digest(pid) - assert store.exists(entity, object_cid) + assert store.exists(entity, pids[pid][store.algorithm]) -def test_store_object_additional_algorithm_underscore(store): +def test_store_object_additional_algorithm_underscore(pids, store): """Test store object with additional algorithm with underscore.""" test_dir = "tests/testdata/" entity = "objects" @@ -226,8 +223,7 @@ def test_store_object_additional_algorithm_underscore(store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) assert additional_sha3_256_hex_digest == sha3_256_checksum - pid_hash = store.get_sha256_hex_digest(pid) - assert store.exists(entity, pid_hash) + assert store.exists(entity, pids[pid][store.algorithm]) def test_store_object_checksum_correct(store): @@ -356,7 +352,7 @@ def test_store_object_checksum_incorrect_checksum(store): ) -def test_store_object_duplicate_raises_error(store): +def test_store_object_duplicate_raises_error(pids, store): """Test store duplicate object throws FileExistsError.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" @@ -368,8 +364,7 @@ def test_store_object_duplicate_raises_error(store): with pytest.raises(FileExistsError): _object_metadata_two = store.store_object(pid, path) assert store.count(entity) == 1 - object_cid = store.get_sha256_hex_digest(pid) - assert store.exists(entity, object_cid) + assert store.exists(entity, pids[pid][store.algorithm]) def test_store_object_with_obj_file_size(store, pids): @@ -415,7 +410,7 @@ def 
test_store_object_with_obj_file_size_zero(store, pids): store.store_object(pid, path, expected_object_size=obj_file_size) -def test_store_object_duplicates_threads(store): +def test_store_object_duplicates_threads(pids, store): """Test store object thread lock.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" @@ -442,8 +437,7 @@ def store_object_wrapper(pid, path): thread3.join() # One thread will succeed, file count must still be 1 assert store.count(entity) == 1 - object_cid = store.get_sha256_hex_digest(pid) - assert store.exists(entity, object_cid) + assert store.exists(entity, pids[pid][store.algorithm]) assert file_exists_error_flag diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 7d73e524..f3f24477 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -66,7 +66,7 @@ def test_store_object(store, pids): sys.argv = chs_args client.main() - assert store.exists("objects", pids[pid]["object_cid"]) + assert store.exists("objects", pids[pid][store.algorithm]) def test_store_metadata(store, pids): From 5a69001d12c62531a683f6c08f2f2cef0f20f367 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 10 Nov 2023 10:15:25 -0800 Subject: [PATCH 006/420] Update HashStore interface documentation --- src/hashstore/hashstore.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 6c704209..9b091ae1 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -5,9 +5,8 @@ class HashStore(ABC): - """HashStore is a content-addressable file management system that - utilizes a persistent identifier (PID) in the form of a hex digest - value to address files.""" + """HashStore is a content-addressable file management system that utilizes + an object's content identifier (hex digest/checksum) to address files.""" @staticmethod def version(): @@ -26,16 +25,15 @@ def store_object( expected_object_size, ): 
"""The `store_object` method is responsible for the atomic storage of objects to - disk using a given InputStream and a persistent identifier (pid). Upon - successful storage, the method returns a ObjectMetadata object containing - relevant file information, such as the file's id (which can be used to locate the - object on disk), the file's size, and a hex digest map of algorithms and checksums. - `store_object` also ensures that an object is stored only once by synchronizing - multiple calls and rejecting calls to store duplicate objects. - - The file's id is determined by calculating the SHA-256 hex digest of the - provided pid, which is also used as the permanent address of the file. The - file's identifier is then sharded using a depth of 3 and width of 2, + disk using a given stream. Upon successful storage, the method returns a ObjectMetadata + object containing relevant file information, such as the file's id (which can be + used to locate the object on disk), the file's size, and a hex digest map of algorithms + and checksums. `store_object` also ensures that an object is stored only once by + synchronizing multiple calls and rejecting calls to store duplicate objects. + + The file's id is determined by calculating the object's content identifier based on + the store's default algorithm, which is also used as the permanent address of the file. + The file's identifier is then sharded using a depth of 3 and width of 2, delimited by '/' and concatenated to produce the final permanent address and is stored in the `/store_directory/objects/` directory. @@ -61,7 +59,7 @@ def store_object( Returns: object_metadata (ObjectMetadata): Object that contains the permanent address, - file size, duplicate file boolean and hex digest dictionary. + file size and hex digest dictionary. 
""" raise NotImplementedError() From d18eba9662c0e71b9f28bef167f7f526a23b5d51 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 10 Nov 2023 10:58:49 -0800 Subject: [PATCH 007/420] Add new public API methods to HashStore interface 'tag_object' and 'find_object' --- src/hashstore/hashstore.py | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 9b091ae1..130c1304 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -63,6 +63,35 @@ def store_object( """ raise NotImplementedError() + @abstractmethod + def tag_object(self, pid, cid): + """The `tag_object` method creates references that allow objects stored in HashStore + to be discoverable. Retrieving, deleting or calculating a hex digest of an object is + based on a pid argument; and to proceed, we must be able to find the object associated + with the pid. + + Args: + pid (string): Authority-based or persistent identifier of object + cid (string): Content identifier of object + + Returns: + boolean: `True` upon successful tagging. + """ + raise NotImplementedError() + + @abstractmethod + def find_object(self, pid): + """The `find_object` method checks whether an object referenced by a pid exists + and returns the content identifier. + + Args: + pid (string): Authority-based or persistent identifier of object + + Returns: + cid (string): Content identifier of the object + """ + raise NotImplementedError() + @abstractmethod def store_metadata(self, pid, metadata, format_id): """The `store_metadata` method is responsible for adding and/or updating metadata @@ -87,9 +116,8 @@ def store_metadata(self, pid, metadata, format_id): @abstractmethod def retrieve_object(self, pid): """The `retrieve_object` method retrieves an object from disk using a given - persistent identifier (pid). 
If the object exists (determined by calculating - the object's permanent address using the SHA-256 hash of the given pid), the - method will open and return a buffered object stream ready to read from. + persistent identifier (pid). If the object exists, the method will open and return + a buffered object stream ready to read from. Args: pid (string): Authority-based identifier. From 78f84c3b08b27297a33256e2a74b08e5829ad809 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 10 Nov 2023 11:04:04 -0800 Subject: [PATCH 008/420] Update HashStore initialization to create required 'refs' directory and subdirectories --- src/hashstore/filehashstore.py | 10 ++++++++++ tests/test_filehashstore.py | 3 +++ 2 files changed, 13 insertions(+) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 16002f48..fe6402b6 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -104,10 +104,14 @@ def __init__(self, properties=None): # Complete initialization/instantiation by setting and creating store directories self.objects = self.root + "/objects" self.metadata = self.root + "/metadata" + self.refs = self.root + "/refs" if not os.path.exists(self.objects): self.create_path(self.objects + "/tmp") if not os.path.exists(self.metadata): self.create_path(self.metadata + "/tmp") + if not os.path.exists(self.refs): + self.create_path(self.refs + "/pids") + self.create_path(self.refs + "/cids") logging.debug( "FileHashStore - Initialization success. 
Store root: %s", self.root ) @@ -470,6 +474,12 @@ def store_object( return object_metadata + def tag_object(self, pid, cid): + return + + def find_object(self, pid): + return + def store_metadata(self, pid, metadata, format_id=None): logging.debug( "FileHashStore - store_metadata: Request to store metadata for pid: %s", pid diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 6331ba5d..778725b0 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -18,6 +18,9 @@ def test_init_directories_created(store): assert os.path.exists(store.objects + "/tmp") assert os.path.exists(store.metadata) assert os.path.exists(store.metadata + "/tmp") + assert os.path.exists(store.refs) + assert os.path.exists(store.refs + "/pids") + assert os.path.exists(store.refs + "/cids") def test_init_existing_store_incorrect_algorithm_format(store): From 0af3514b6e0ad1059cd43735e15e5beb8fd5be3b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 10 Nov 2023 11:14:57 -0800 Subject: [PATCH 009/420] Add TODOs and pseudo code in 'FileHashStore' --- src/hashstore/filehashstore.py | 25 +++++++++++++++++++++---- tests/test_filehashstore.py | 4 ++-- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index fe6402b6..8a581cc7 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -110,8 +110,8 @@ def __init__(self, properties=None): if not os.path.exists(self.metadata): self.create_path(self.metadata + "/tmp") if not os.path.exists(self.refs): - self.create_path(self.refs + "/pids") - self.create_path(self.refs + "/cids") + self.create_path(self.refs + "/pid") + self.create_path(self.refs + "/cid") logging.debug( "FileHashStore - Initialization success. 
Store root: %s", self.root ) @@ -475,9 +475,19 @@ def store_object( return object_metadata def tag_object(self, pid, cid): + # Synchronize tag_object with a lock + # Acquire system-wide file lock on the cid to be evaluated + # Check to see if reference file already exists for the cid + # If it does, read the file and add the new pid on its own line + # If not, create the cid ref file '.../refs/cid' with the first line being the pid + # Then create the pid ref file in '.../refs/pid' with the cid as its content + # Release system-wide file lock on the cid + # Release initial lock return def find_object(self, pid): + # Get the path to the pid reference by calculating its hash in '.../refs/pid' + # Read the file to get the cid from the pid reference and return it return def store_metadata(self, pid, metadata, format_id=None): @@ -533,6 +543,8 @@ def retrieve_object(self, pid): ) self._is_string_none_or_empty(pid, "pid", "retrieve_object") + # TODO: Find object from the pid reference file + entity = "objects" object_cid = self.get_sha256_hex_digest(pid) object_exists = self.exists(entity, object_cid) @@ -586,6 +598,10 @@ def delete_object(self, pid): ) self._is_string_none_or_empty(pid, "pid", "delete_object") + # TODO: Also find the reference file and delete it if there's only one ref + # Else delete the pid in the cid refs file + # Also delete the pid ref file + entity = "objects" object_cid = self.get_sha256_hex_digest(pid) self.delete(entity, object_cid) @@ -622,6 +638,8 @@ def get_hex_digest(self, pid, algorithm): self._is_string_none_or_empty(pid, "pid", "get_hex_digest") self._is_string_none_or_empty(algorithm, "algorithm", "get_hex_digest") + # TODO: Find object from the pid reference file + entity = "objects" algorithm = self.clean_algorithm(algorithm) object_cid = self.get_sha256_hex_digest(pid) @@ -786,9 +804,8 @@ def _move_and_get_checksums( Returns: object_metadata (tuple): object id, object file size and hex digest dictionary. 
""" - # If the checksum algorithm is the same as the store algorithm, then we can + # TODO: If the checksum algorithm is the same as the store algorithm, then we can # determine whether the object exists or not to be efficient - # TODO # Create temporary file and calculate hex digests debug_msg = ( diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 778725b0..d0894978 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -19,8 +19,8 @@ def test_init_directories_created(store): assert os.path.exists(store.metadata) assert os.path.exists(store.metadata + "/tmp") assert os.path.exists(store.refs) - assert os.path.exists(store.refs + "/pids") - assert os.path.exists(store.refs + "/cids") + assert os.path.exists(store.refs + "/pid") + assert os.path.exists(store.refs + "/cid") def test_init_existing_store_incorrect_algorithm_format(store): From 6a32c457b5bf8c39f3cc437493ecd175d13f5a7c Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 10 Nov 2023 12:24:48 -0800 Subject: [PATCH 010/420] Rename 'put_object' method to '_store_and_validate_data' and update tests, and '_store_data' to 'store_data_only' --- src/hashstore/filehashstore.py | 13 ++++--- tests/test_filehashstore.py | 70 ++++++++++++++++++---------------- 2 files changed, 45 insertions(+), 38 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 8a581cc7..62c455eb 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -449,9 +449,9 @@ def store_object( pid, ) if pid is None: - object_metadata = self._store_data(data) + object_metadata = self.store_data_only(data) else: - object_metadata = self.put_object( + object_metadata = self.store_and_validate_data( pid, data, additional_algorithm=additional_algorithm_checked, @@ -661,7 +661,7 @@ def get_hex_digest(self, pid, algorithm): # FileHashStore Core Methods - def put_object( + def store_and_validate_data( self, pid, file, @@ -671,7 +671,8 @@ def 
put_object( checksum_algorithm=None, file_size_to_validate=None, ): - """Store contents of `file` on disk using the hash of the given pid + """Store contents of `file` on disk using, validate the object's parameters if + provided and tag/reference the object. Args: pid (string): Authority-based identifier. \n @@ -716,8 +717,8 @@ def put_object( ) return object_metadata - def _store_data(self, data): - """Store an object to HashStore and return the tmp file name and a hex digest + def store_data_only(self, data): + """Store an object to HashStore and return the id and a hex digest dictionary of the default algorithms. Args: diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index d0894978..8ddf75fb 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -112,7 +112,7 @@ def test_init_with_existing_hashstore_missing_yaml(store, pids): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - store.put_object(pid, path) + store.store_and_validate_data(pid, path) os.remove(store.hashstore_configuration_yaml) properties = { "store_path": store.root, @@ -198,75 +198,75 @@ def test_set_default_algorithms_missing_yaml(store, pids): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - store.put_object(pid, path) + store.store_and_validate_data(pid, path) os.remove(store.hashstore_configuration_yaml) with pytest.raises(FileNotFoundError): # pylint: disable=W0212 store._set_default_algorithms() -def test_put_object_files_path(pids, store): +def test_store_and_validate_data_files_path(pids, store): """Test put objects with path object.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = Path(test_dir) / pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_id = object_metadata.id assert store.exists(entity, object_metadata_id) -def 
test_put_object_files_string(pids, store): +def test_store_and_validate_data_files_string(pids, store): """Test put objects with string.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_id = object_metadata.id assert store.exists(entity, object_metadata_id) -def test_put_object_files_stream(pids, store): +def test_store_and_validate_data_files_stream(pids, store): """Test put objects with stream.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - object_metadata = store.put_object(pid, input_stream) + object_metadata = store.store_and_validate_data(pid, input_stream) input_stream.close() object_metadata_id = object_metadata.id assert store.exists(entity, object_metadata_id) assert store.count(entity) == 3 -def test_put_object_cid(pids, store): +def test_store_and_validate_data_cid(pids, store): """Check put returns correct id.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_id = object_metadata.id assert object_metadata_id == pids[pid][store.algorithm] -def test_put_object_file_size(pids, store): +def test_store_and_validate_data_file_size(pids, store): """Check put returns correct file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_size = object_metadata.obj_size assert object_size == pids[pid]["file_size_bytes"] -def test_put_object_hex_digests(pids, store): +def test_store_and_validate_data_hex_digests(pids, store): """Check put 
successfully generates hex digests dictionary.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_hex_digests = object_metadata.hex_digests assert object_metadata_hex_digests.get("md5") == pids[pid]["md5"] assert object_metadata_hex_digests.get("sha1") == pids[pid]["sha1"] @@ -275,30 +275,34 @@ def test_put_object_hex_digests(pids, store): assert object_metadata_hex_digests.get("sha512") == pids[pid]["sha512"] -def test_put_object_additional_algorithm(pids, store): - """Check put_object returns additional algorithm in hex digests.""" +def test_store_and_validate_data_additional_algorithm(pids, store): + """Check store_and_validate_data returns additional algorithm in hex digests.""" test_dir = "tests/testdata/" for pid in pids.keys(): algo = "sha224" path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path, additional_algorithm=algo) + object_metadata = store.store_and_validate_data( + pid, path, additional_algorithm=algo + ) hex_digests = object_metadata.hex_digests sha224_hash = hex_digests.get(algo) assert sha224_hash == pids[pid][algo] -def test_put_object_with_correct_checksums(pids, store): - """Check put_object success with valid checksum and checksum algorithm supplied.""" +def test_store_and_validate_data_with_correct_checksums(pids, store): + """Check store_and_validate_data success with valid checksum and checksum algorithm supplied.""" test_dir = "tests/testdata/" for pid in pids.keys(): algo = "sha224" algo_checksum = pids[pid][algo] path = test_dir + pid.replace("/", "_") - store.put_object(pid, path, checksum=algo_checksum, checksum_algorithm=algo) + store.store_and_validate_data( + pid, path, checksum=algo_checksum, checksum_algorithm=algo + ) assert store.count("objects") == 3 -def test_put_object_with_incorrect_checksum(pids, store): +def 
test_store_and_validate_data_with_incorrect_checksum(pids, store): """Check put fails when bad checksum supplied.""" test_dir = "tests/testdata/" entity = "objects" @@ -307,7 +311,9 @@ def test_put_object_with_incorrect_checksum(pids, store): algo_checksum = "badChecksumValue" path = test_dir + pid.replace("/", "_") with pytest.raises(ValueError): - store.put_object(pid, path, checksum=algo_checksum, checksum_algorithm=algo) + store.store_and_validate_data( + pid, path, checksum=algo_checksum, checksum_algorithm=algo + ) assert store.count(entity) == 0 @@ -634,7 +640,7 @@ def test_exists_with_object_metadata_id(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) assert store.exists(entity, object_metadata.id) @@ -644,7 +650,7 @@ def test_exists_with_sharded_path(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_shard = store.shard(object_metadata.id) object_metadata_shard_path = "/".join(object_metadata_shard) assert store.exists(entity, object_metadata_shard_path) @@ -677,7 +683,7 @@ def test_open_objects(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_id = object_metadata.id io_buffer = store.open(entity, object_metadata_id) assert isinstance(io_buffer, io.BufferedReader) @@ -690,7 +696,7 @@ def test_delete_by_object_metadata_id(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_id = object_metadata.id 
store.delete(entity, object_metadata_id) assert store.count(entity) == 0 @@ -739,7 +745,7 @@ def test_remove_empty_does_not_remove_nonempty_folders(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_shard = store.shard(object_metadata.id) object_metadata_shard_path = "/".join(object_metadata_shard) # Get parent directory of the relative path @@ -802,7 +808,7 @@ def test_get_real_path_with_object_id(store, pids): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) obj_abs_path = store.get_real_path(entity, object_metadata.id) assert os.path.exists(obj_abs_path) @@ -813,7 +819,7 @@ def test_get_real_path_with_object_id_sharded(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_shard = store.shard(object_metadata.id) object_metadata_shard_path = "/".join(object_metadata_shard) obj_abs_path = store.get_real_path(entity, object_metadata_shard_path) @@ -839,7 +845,7 @@ def test_get_real_path_with_bad_entity(store, pids): entity = "bad_entity" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) with pytest.raises(ValueError): store.get_real_path(entity, object_metadata.id) @@ -850,7 +856,7 @@ def test_build_abs_path(store, pids): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - _ = store.put_object(pid, path) + _ = store.store_and_validate_data(pid, path) # pylint: disable=W0212 abs_path = store.build_abs_path(entity, 
pids[pid]["object_cid"]) assert abs_path @@ -862,7 +868,7 @@ def test_count(pids, store): entity = "objects" for pid in pids.keys(): path_string = test_dir + pid.replace("/", "_") - store.put_object(pid, path_string) + store.store_and_validate_data(pid, path_string) assert store.count(entity) == 3 From 270e556725f4ec3805cff9c10341b7b994b73502 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 10 Nov 2023 12:40:06 -0800 Subject: [PATCH 011/420] Add reference locks and skeleton code for 'tag_object' --- src/hashstore/filehashstore.py | 43 +++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 62c455eb..98cfdff9 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -61,8 +61,10 @@ class FileHashStore(HashStore): time_out_sec = 1 object_lock = threading.Lock() metadata_lock = threading.Lock() + reference_lock = threading.Lock() object_locked_pids = [] metadata_locked_pids = [] + reference_locked_cids = [] def __init__(self, properties=None): if properties: @@ -475,14 +477,39 @@ def store_object( return object_metadata def tag_object(self, pid, cid): - # Synchronize tag_object with a lock - # Acquire system-wide file lock on the cid to be evaluated - # Check to see if reference file already exists for the cid - # If it does, read the file and add the new pid on its own line - # If not, create the cid ref file '.../refs/cid' with the first line being the pid - # Then create the pid ref file in '.../refs/pid' with the cid as its content - # Release system-wide file lock on the cid - # Release initial lock + # Wait for the cid to release if it's being tagged + while cid in self.reference_locked_cids: + logging.debug( + "FileHashStore - tag_object: (cid) %s is currently being tagged. 
Waiting.", + cid, + ) + time.sleep(self.time_out_sec) + # Modify reference_locked_cids consecutively + with self.reference_lock: + logging.debug( + "FileHashStore - tag_object: Adding cid: %s to reference_locked_cids.", + cid, + ) + self.reference_locked_cids.append(cid) + try: + # Acquire system-wide file lock on the cid to be evaluated + # Check to see if reference file already exists for the cid + # If it does, read the file and add the new pid on its own line + # If not, create the cid ref file '.../refs/cid' with the first line being the pid + # Then create the pid ref file in '.../refs/pid' with the cid as its content + # Release system-wide file lock on the cid + # Release initial lock + print("Tag object") + finally: + # Release pid + with self.reference_lock: + logging.debug( + "FileHashStore - tag_object: Removing cid: %s from reference_locked_cids.", + cid, + ) + self.reference_locked_cids.remove(cid) + info_msg = f"FileHashStore - tag_object: Successfully tagged cid: {cid} with pid: {pid}" + logging.info(info_msg) return def find_object(self, pid): From d20f41e37ab46e7514d64aa955c8d320fa701d68 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 10 Nov 2023 13:31:59 -0800 Subject: [PATCH 012/420] Fill out 'tag_object' skeleton, update 'get_store_path' method for 'refs' and add new empty method '_write_cid_reference' --- src/hashstore/filehashstore.py | 38 +++++++++++++++++++++++----------- tests/test_filehashstore.py | 8 +++++++ 2 files changed, 34 insertions(+), 12 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 98cfdff9..53741789 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -492,16 +492,18 @@ def tag_object(self, pid, cid): ) self.reference_locked_cids.append(cid) try: - # Acquire system-wide file lock on the cid to be evaluated # Check to see if reference file already exists for the cid - # If it does, read the file and add the new pid on its own line - # If not, 
create the cid ref file '.../refs/cid' with the first line being the pid - # Then create the pid ref file in '.../refs/pid' with the cid as its content - # Release system-wide file lock on the cid - # Release initial lock - print("Tag object") + entity = "refs" + ref_abs_path = self.build_abs_path(entity, cid) + if os.path.isfile(ref_abs_path): + # If it does, read the file and add the new pid on its own line + print("Add pid to reference file") + else: + # If not, create the cid ref file '.../refs/cid' with the first line being the pid + # Then create the pid ref file in '.../refs/pid' with the cid as its content + print("Create and tag reference file") finally: - # Release pid + # Release cid with self.reference_lock: logging.debug( "FileHashStore - tag_object: Removing cid: %s from reference_locked_cids.", @@ -1030,6 +1032,16 @@ def delete_tmp_file(): ) logging.error(exception_string) + def _write_cid_reference(self, pid, cid): + """Write the reference file for the given content identifier (cid). A reference + file contains every pid that references a cid on a new line. + + Args: + pid (string): Authority-based or persistent identifier of object + cid (string): Content identifier of object + """ + print("Writing reference") + def put_metadata(self, metadata, pid, format_id): """Store contents of metadata to `[self.root]/metadata` using the hash of the given pid and format_id as the permanent address. @@ -1381,9 +1393,11 @@ def get_store_path(self, entity): return Path(self.objects) elif entity == "metadata": return Path(self.metadata) + elif entity == "refs": + return Path(self.refs) else: raise ValueError( - f"entity: {entity} does not exist. Do you mean 'objects' or 'metadata'?" + f"entity: {entity} does not exist. Do you mean 'objects', 'metadata' or 'refs'?" ) def exists(self, entity, file): @@ -1554,18 +1568,18 @@ def get_real_path(self, entity, file): # Could not determine a match. 
return None - def build_abs_path(self, entity, cid, extension=""): + def build_abs_path(self, entity, hash_id, extension=""): """Build the absolute file path for a given hash id with an optional file extension. Args: entity (str): Desired entity type (ex. "objects", "metadata"). \n - cid (str): A hash id to build a file path for. \n + hash_id (str): A hash id to build a file path for. \n extension (str): An optional file extension to append to the file path. Returns: absolute_path (str): An absolute file path for the specified hash id. """ - paths = self.shard(cid) + paths = self.shard(hash_id) root_dir = self.get_store_path(entity) if extension and not extension.startswith(os.extsep): diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 8ddf75fb..f3184c20 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -634,6 +634,14 @@ def test_get_store_path_metadata(store): assert path_metadata_string.endswith("/metacat/metadata") +def test_get_store_path_refs(store): + """Check get_store_path for refs path.""" + # pylint: disable=W0212 + path_metadata = store.get_store_path("refs") + path_metadata_string = str(path_metadata) + assert path_metadata_string.endswith("/metacat/refs") + + def test_exists_with_object_metadata_id(pids, store): """Test exists method with an absolute file path.""" test_dir = "tests/testdata/" From 73a2c66d7086fbfbd64bcdf0f14b3e718b5d196c Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 10 Nov 2023 14:17:16 -0800 Subject: [PATCH 013/420] Fix test for 'build_abs_path' in FileHashStore --- tests/test_filehashstore.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index f3184c20..a1f80fc2 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -866,8 +866,8 @@ def test_build_abs_path(store, pids): path = test_dir + pid.replace("/", "_") _ = store.store_and_validate_data(pid, path) # pylint: 
disable=W0212 - abs_path = store.build_abs_path(entity, pids[pid]["object_cid"]) - assert abs_path + abs_path = store.build_abs_path(entity, pids[pid][store.algorithm]) + assert os.path.exists(abs_path) def test_count(pids, store): From f44871ece293efe01e12f189d8ae421fdafaef54 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 10 Nov 2023 15:39:33 -0800 Subject: [PATCH 014/420] Add new fcntl import, code method 'write_cid_reference' and add new pytests --- src/hashstore/filehashstore.py | 39 ++++++++++++++++++++++++++++------ tests/test_filehashstore.py | 25 ++++++++++++++++++++++ 2 files changed, 57 insertions(+), 7 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 53741789..ec8e9804 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -10,6 +10,7 @@ from pathlib import Path from contextlib import closing from tempfile import NamedTemporaryFile +import fcntl import yaml from hashstore import HashStore, ObjectMetadata @@ -494,14 +495,18 @@ def tag_object(self, pid, cid): try: # Check to see if reference file already exists for the cid entity = "refs" - ref_abs_path = self.build_abs_path(entity, cid) - if os.path.isfile(ref_abs_path): + cid_ref_abs_path = self.build_abs_path(entity, cid).replace( + "/refs/", "/refs/cid/" + ) + if os.path.exists(cid_ref_abs_path): # If it does, read the file and add the new pid on its own line print("Add pid to reference file") else: - # If not, create the cid ref file '.../refs/cid' with the first line being the pid + # If not, create the cid ref file in '.../refs/cid' and write the pid + self.create_path(os.path.dirname(cid_ref_abs_path)) + self.write_cid_reference(cid_ref_abs_path, pid) # Then create the pid ref file in '.../refs/pid' with the cid as its content - print("Create and tag reference file") + # TODO: Write the pid ref file that contains the cid finally: # Release cid with self.reference_lock: @@ -1032,15 +1037,35 @@ def delete_tmp_file(): ) 
logging.error(exception_string) - def _write_cid_reference(self, pid, cid): + def write_cid_reference(self, cid_ref_abs_path, pid): """Write the reference file for the given content identifier (cid). A reference file contains every pid that references a cid on a new line. Args: + cid_ref_abs_path (string): Absolute path to the cid ref file pid (string): Authority-based or persistent identifier of object - cid (string): Content identifier of object """ - print("Writing reference") + info_msg = ( + f"FileHashStore - _write_cid_reference: Writing pid ({pid}) into cid reference" + + f" file: {cid_ref_abs_path}" + ) + logging.info(info_msg) + + try: + with open(cid_ref_abs_path, "w", encoding="utf8") as cid_ref_file: + fcntl.flock(cid_ref_file, fcntl.LOCK_EX) + cid_ref_file.write(pid + "\n") + # The context manager will take care of releasing the lock + # But the code to explicitly release the lock if desired is below + # fcntl.flock(f, fcntl.LOCK_UN) + return + except Exception as err: + exception_string = ( + "FileHashStore - _write_cid_reference: failed to write reference for cid:" + + f" {cid_ref_abs_path}. 
Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise IOError(exception_string) from err def put_metadata(self, metadata, pid, format_id): """Store contents of metadata to `[self.root]/metadata` using the hash of the diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index a1f80fc2..df008cf8 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -543,6 +543,31 @@ def test_mktempfile_with_unsupported_algorithm(pids, store): input_stream.close() +def test_write_cid_reference(pids, store): + """Test that write_cid_reference writes a reference file""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.build_abs_path(entity, cid).replace( + "/refs/", "/refs/cid/" + ) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_reference(cid_ref_abs_path, pid) + assert os.path.exists(cid_ref_abs_path) + + +def test_write_cid_reference_content(pids, store): + """Test that write_cid_reference writes the expected content""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.build_abs_path(entity, cid).replace( + "/refs/", "/refs/cid/" + ) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_reference(cid_ref_abs_path, pid) + + def test_put_metadata_with_path(pids, store): """Test put_metadata with path object.""" entity = "metadata" From 4c01344838dcd12ad4b9086236573e17f3429b8d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 09:30:32 -0800 Subject: [PATCH 015/420] Add missing assertion statement to 'write_cid_reference' test for verifying content --- tests/test_filehashstore.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index df008cf8..b9efdd4b 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -567,6 +567,11 @@ def test_write_cid_reference_content(pids, store): 
store.create_path(os.path.dirname(cid_ref_abs_path)) store.write_cid_reference(cid_ref_abs_path, pid) + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + cid_ref_file_pid = f.read() + + assert pid == cid_ref_file_pid.replace("\n", "") + def test_put_metadata_with_path(pids, store): """Test put_metadata with path object.""" From 0178f28b2afeaf126485772586566437dd76e7ac Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 10:54:18 -0800 Subject: [PATCH 016/420] Add new method 'update_cid_reference' with new pytests --- src/hashstore/filehashstore.py | 42 ++++++++++++++++++++++++++++++- tests/test_filehashstore.py | 46 ++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index ec8e9804..06cf213b 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1062,7 +1062,47 @@ def write_cid_reference(self, cid_ref_abs_path, pid): except Exception as err: exception_string = ( "FileHashStore - _write_cid_reference: failed to write reference for cid:" - + f" {cid_ref_abs_path}. Unexpected {err=}, {type(err)=}" + + f" {cid_ref_abs_path} for pid: {pid}. Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise IOError(exception_string) from err + + def update_cid_reference(self, cid_ref_abs_path, pid): + """Update an existing cid reference file with the given pid. Every pid in a reference + file is found on its own line. 
+ + Args: + cid_ref_abs_path (string): Absolute path to the cid ref file + pid (string): Authority-based or persistent identifier of object + """ + info_msg = ( + f"FileHashStore - update_cid_reference: Adding pid ({pid}) into cid reference" + + f" file: {cid_ref_abs_path}" + ) + logging.info(info_msg) + + try: + with open(cid_ref_abs_path, "a+", encoding="utf8") as cid_ref_file: + fcntl.flock(cid_ref_file, fcntl.LOCK_EX) + # Read the ref file to see if the pid is already referencing the cid + cid_ref_file_content = cid_ref_file.read() + + if pid in cid_ref_file_content: + err_msg = ( + f"FileHashStore - update_cid_reference: pid ({pid}) already reference in" + + f" cid reference file: {cid_ref_abs_path} " + ) + raise ValueError(err_msg) + else: + cid_ref_file.write(pid + "\n") + # The context manager will take care of releasing the lock + # But the code to explicitly release the lock if desired is below + # fcntl.flock(f, fcntl.LOCK_UN) + return + except Exception as err: + exception_string = ( + "FileHashStore - update_cid_reference: failed to update reference for cid:" + + f" {cid_ref_abs_path} for pid: {pid}. 
Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) raise IOError(exception_string) from err diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index b9efdd4b..a69baff5 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -573,6 +573,52 @@ def test_write_cid_reference_content(pids, store): assert pid == cid_ref_file_pid.replace("\n", "") +def test_update_cid_reference_content(pids, store): + """Test that update_cid_reference updates the ref file as expected""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.build_abs_path(entity, cid).replace( + "/refs/", "/refs/cid/" + ) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_reference(cid_ref_abs_path, pid) + + pid_other = "dou.test.1" + store.update_cid_reference(cid_ref_abs_path, pid_other) + + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + value = line.strip() + assert value == pid or value == pid_other + + +def test_update_cid_reference_content_multiple(pids, store): + """Test that update_cid_reference multiple updates""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.build_abs_path(entity, cid).replace( + "/refs/", "/refs/cid/" + ) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_reference(cid_ref_abs_path, pid) + + cid_reference_list = [pid] + for i in range(0, 5): + store.update_cid_reference(cid_ref_abs_path, f"dou.test.{i}") + cid_reference_list.append(f"dou.test.{i}") + + line_count = 0 + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + line_count += 1 + value = line.strip() + assert value in cid_reference_list + + assert line_count == 6 + + def test_put_metadata_with_path(pids, store): """Test put_metadata with path object.""" entity = "metadata" From 21233afbac7c90e80390c66ad401febde6c5ed6e Mon Sep 17 
00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 11:23:51 -0800 Subject: [PATCH 017/420] Add new 'delete_cid_reference_pid' method with new pytests --- src/hashstore/filehashstore.py | 49 ++++++++++++++++++++++++++++++++-- tests/test_filehashstore.py | 47 +++++++++++++++++++++++++++++--- 2 files changed, 90 insertions(+), 6 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 06cf213b..b1d7bc93 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1059,13 +1059,14 @@ def write_cid_reference(self, cid_ref_abs_path, pid): # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) return + except Exception as err: exception_string = ( "FileHashStore - _write_cid_reference: failed to write reference for cid:" + f" {cid_ref_abs_path} for pid: {pid}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) - raise IOError(exception_string) from err + raise err def update_cid_reference(self, cid_ref_abs_path, pid): """Update an existing cid reference file with the given pid. Every pid in a reference @@ -1099,13 +1100,57 @@ def update_cid_reference(self, cid_ref_abs_path, pid): # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) return + except Exception as err: exception_string = ( "FileHashStore - update_cid_reference: failed to update reference for cid:" + f" {cid_ref_abs_path} for pid: {pid}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) - raise IOError(exception_string) from err + raise err + + def delete_cid_reference_pid(self, cid_ref_abs_path, pid): + """Delete a pid in a cid reference file. 
+ + Args: + cid_ref_abs_path (string): Absolute path to the cid ref file + pid (string): Authority-based or persistent identifier of object + """ + info_msg = ( + f"FileHashStore - delete_cid_reference_pid: Deleting pid ({pid}) from cid reference" + + f" file: {cid_ref_abs_path}" + ) + logging.info(info_msg) + + try: + with open(cid_ref_abs_path, "r", encoding="utf8") as cid_ref_file: + fcntl.flock(cid_ref_file, fcntl.LOCK_EX) + # Read the ref file to see if the pid is already referencing the cid + cid_ref_file_content = cid_ref_file.read() + + if pid not in cid_ref_file_content: + err_msg = ( + f"FileHashStore - delete_cid_reference_pid: pid ({pid}) does not exist in" + + f" cid reference file: {cid_ref_abs_path} " + ) + raise ValueError(err_msg) + + with open(cid_ref_abs_path, "w", encoding="utf8") as cid_ref_file: + fcntl.flock(cid_ref_file, fcntl.LOCK_EX) + cid_ref_file.write(cid_ref_file_content.replace(pid + "\n", "")) + # The context manager will take care of releasing the lock + # But the code to explicitly release the lock if desired is below + # fcntl.flock(f, fcntl.LOCK_UN) + + return + + except Exception as err: + exception_string = ( + "FileHashStore - delete_cid_reference_pid: failed to update reference for cid:" + + f" {cid_ref_abs_path} for pid: {pid}. 
Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise err def put_metadata(self, metadata, pid, format_id): """Store contents of metadata to `[self.root]/metadata` using the hash of the diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index a69baff5..f3b7d5f9 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -544,7 +544,7 @@ def test_mktempfile_with_unsupported_algorithm(pids, store): def test_write_cid_reference(pids, store): - """Test that write_cid_reference writes a reference file""" + """Test that write_cid_reference writes a reference file.""" for pid in pids.keys(): entity = "refs" cid = pids[pid]["sha256"] @@ -557,7 +557,7 @@ def test_write_cid_reference(pids, store): def test_write_cid_reference_content(pids, store): - """Test that write_cid_reference writes the expected content""" + """Test that write_cid_reference writes the expected content.""" for pid in pids.keys(): entity = "refs" cid = pids[pid]["sha256"] @@ -574,7 +574,7 @@ def test_write_cid_reference_content(pids, store): def test_update_cid_reference_content(pids, store): - """Test that update_cid_reference updates the ref file as expected""" + """Test that update_cid_reference updates the ref file as expected.""" for pid in pids.keys(): entity = "refs" cid = pids[pid]["sha256"] @@ -594,7 +594,7 @@ def test_update_cid_reference_content(pids, store): def test_update_cid_reference_content_multiple(pids, store): - """Test that update_cid_reference multiple updates""" + """Test that update_cid_reference adds multiple references successfully.""" for pid in pids.keys(): entity = "refs" cid = pids[pid]["sha256"] @@ -619,6 +619,45 @@ def test_update_cid_reference_content_multiple(pids, store): assert line_count == 6 +def test_delete_cid_reference_pid(pids, store): + """Test that delete_cid_reference deletes the given pid from the ref file.""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + 
cid_ref_abs_path = store.build_abs_path(entity, cid).replace( + "/refs/", "/refs/cid/" + ) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_reference(cid_ref_abs_path, pid) + + pid_other = "dou.test.1" + store.update_cid_reference(cid_ref_abs_path, pid_other) + store.delete_cid_reference_pid(cid_ref_abs_path, pid) + + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + value = line.strip() + print(value) + assert value == pid_other + + +def test_delete_cid_reference_pid_not_found(pids, store): + """Test that delete_cid_reference raises exception when pid not found.""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.build_abs_path(entity, cid).replace( + "/refs/", "/refs/cid/" + ) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_reference(cid_ref_abs_path, pid) + + pid_other = "dou.test.1" + store.update_cid_reference(cid_ref_abs_path, pid_other) + with pytest.raises(ValueError): + store.delete_cid_reference_pid(cid_ref_abs_path, "dou.not.found.1") + + def test_put_metadata_with_path(pids, store): """Test put_metadata with path object.""" entity = "metadata" From 1fc158f5fb0a553406dc2d6758dce14d37be1d1f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 11:58:15 -0800 Subject: [PATCH 018/420] Rename refs related method names and update pytests --- src/hashstore/filehashstore.py | 29 ++++++++++++----------- tests/test_filehashstore.py | 42 +++++++++++++++++----------------- 2 files changed, 35 insertions(+), 36 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index b1d7bc93..270aec98 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -504,7 +504,7 @@ def tag_object(self, pid, cid): else: # If not, create the cid ref file in '.../refs/cid' and write the pid self.create_path(os.path.dirname(cid_ref_abs_path)) - 
self.write_cid_reference(cid_ref_abs_path, pid) + self.write_cid_ref_file(cid_ref_abs_path, pid) # Then create the pid ref file in '.../refs/pid' with the cid as its content # TODO: Write the pid ref file that contains the cid finally: @@ -1037,7 +1037,7 @@ def delete_tmp_file(): ) logging.error(exception_string) - def write_cid_reference(self, cid_ref_abs_path, pid): + def write_cid_refs_file(self, cid_ref_abs_path, pid): """Write the reference file for the given content identifier (cid). A reference file contains every pid that references a cid on a new line. @@ -1046,7 +1046,7 @@ def write_cid_reference(self, cid_ref_abs_path, pid): pid (string): Authority-based or persistent identifier of object """ info_msg = ( - f"FileHashStore - _write_cid_reference: Writing pid ({pid}) into cid reference" + f"FileHashStore - write_cid_refs_file: Writing pid ({pid}) into cid reference" + f" file: {cid_ref_abs_path}" ) logging.info(info_msg) @@ -1062,22 +1062,21 @@ def write_cid_reference(self, cid_ref_abs_path, pid): except Exception as err: exception_string = ( - "FileHashStore - _write_cid_reference: failed to write reference for cid:" + "FileHashStore - write_cid_refs_file: failed to write reference for cid:" + f" {cid_ref_abs_path} for pid: {pid}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) raise err - def update_cid_reference(self, cid_ref_abs_path, pid): - """Update an existing cid reference file with the given pid. Every pid in a reference - file is found on its own line. + def update_cid_refs(self, cid_ref_abs_path, pid): + """Update an existing cid reference file with the given pid. 
Args: cid_ref_abs_path (string): Absolute path to the cid ref file pid (string): Authority-based or persistent identifier of object """ info_msg = ( - f"FileHashStore - update_cid_reference: Adding pid ({pid}) into cid reference" + f"FileHashStore - update_cid_refs: Adding pid ({pid}) into cid reference" + f" file: {cid_ref_abs_path}" ) logging.info(info_msg) @@ -1090,7 +1089,7 @@ def update_cid_reference(self, cid_ref_abs_path, pid): if pid in cid_ref_file_content: err_msg = ( - f"FileHashStore - update_cid_reference: pid ({pid}) already reference in" + f"FileHashStore - update_cid_refs: pid ({pid}) already reference in" + f" cid reference file: {cid_ref_abs_path} " ) raise ValueError(err_msg) @@ -1103,21 +1102,21 @@ def update_cid_reference(self, cid_ref_abs_path, pid): except Exception as err: exception_string = ( - "FileHashStore - update_cid_reference: failed to update reference for cid:" + "FileHashStore - update_cid_refs: failed to update reference for cid:" + f" {cid_ref_abs_path} for pid: {pid}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) raise err - def delete_cid_reference_pid(self, cid_ref_abs_path, pid): - """Delete a pid in a cid reference file. + def delete_cid_refs_pid(self, cid_ref_abs_path, pid): + """Delete a pid from a cid reference file. 
Args: cid_ref_abs_path (string): Absolute path to the cid ref file pid (string): Authority-based or persistent identifier of object """ info_msg = ( - f"FileHashStore - delete_cid_reference_pid: Deleting pid ({pid}) from cid reference" + f"FileHashStore - delete_cid_refs_pid: Deleting pid ({pid}) from cid reference" + f" file: {cid_ref_abs_path}" ) logging.info(info_msg) @@ -1130,7 +1129,7 @@ def delete_cid_reference_pid(self, cid_ref_abs_path, pid): if pid not in cid_ref_file_content: err_msg = ( - f"FileHashStore - delete_cid_reference_pid: pid ({pid}) does not exist in" + f"FileHashStore - delete_cid_refs_pid: pid ({pid}) does not exist in" + f" cid reference file: {cid_ref_abs_path} " ) raise ValueError(err_msg) @@ -1146,7 +1145,7 @@ def delete_cid_reference_pid(self, cid_ref_abs_path, pid): except Exception as err: exception_string = ( - "FileHashStore - delete_cid_reference_pid: failed to update reference for cid:" + "FileHashStore - delete_cid_refs_pid: failed to update reference for cid:" + f" {cid_ref_abs_path} for pid: {pid}. 
Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index f3b7d5f9..472f29e1 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -543,7 +543,7 @@ def test_mktempfile_with_unsupported_algorithm(pids, store): input_stream.close() -def test_write_cid_reference(pids, store): +def test_write_cid_ref_file(pids, store): """Test that write_cid_reference writes a reference file.""" for pid in pids.keys(): entity = "refs" @@ -552,12 +552,12 @@ def test_write_cid_reference(pids, store): "/refs/", "/refs/cid/" ) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_reference(cid_ref_abs_path, pid) + store.write_cid_refs_file(cid_ref_abs_path, pid) assert os.path.exists(cid_ref_abs_path) -def test_write_cid_reference_content(pids, store): - """Test that write_cid_reference writes the expected content.""" +def test_write_cid_ref_file_content(pids, store): + """Test that write_cid_ref_file writes the expected content.""" for pid in pids.keys(): entity = "refs" cid = pids[pid]["sha256"] @@ -565,7 +565,7 @@ def test_write_cid_reference_content(pids, store): "/refs/", "/refs/cid/" ) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_reference(cid_ref_abs_path, pid) + store.write_cid_refs_file(cid_ref_abs_path, pid) with open(cid_ref_abs_path, "r", encoding="utf8") as f: cid_ref_file_pid = f.read() @@ -573,8 +573,8 @@ def test_write_cid_reference_content(pids, store): assert pid == cid_ref_file_pid.replace("\n", "") -def test_update_cid_reference_content(pids, store): - """Test that update_cid_reference updates the ref file as expected.""" +def test_update_cid_ref_content(pids, store): + """Test that update_cid_ref updates the ref file as expected.""" for pid in pids.keys(): entity = "refs" cid = pids[pid]["sha256"] @@ -582,10 +582,10 @@ def test_update_cid_reference_content(pids, store): "/refs/", "/refs/cid/" ) 
store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_reference(cid_ref_abs_path, pid) + store.write_cid_refs_file(cid_ref_abs_path, pid) pid_other = "dou.test.1" - store.update_cid_reference(cid_ref_abs_path, pid_other) + store.update_cid_refs(cid_ref_abs_path, pid_other) with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): @@ -593,8 +593,8 @@ def test_update_cid_reference_content(pids, store): assert value == pid or value == pid_other -def test_update_cid_reference_content_multiple(pids, store): - """Test that update_cid_reference adds multiple references successfully.""" +def test_update_cid_ref_content_multiple(pids, store): + """Test that update_cid_ref adds multiple references successfully.""" for pid in pids.keys(): entity = "refs" cid = pids[pid]["sha256"] @@ -602,11 +602,11 @@ def test_update_cid_reference_content_multiple(pids, store): "/refs/", "/refs/cid/" ) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_reference(cid_ref_abs_path, pid) + store.write_cid_refs_file(cid_ref_abs_path, pid) cid_reference_list = [pid] for i in range(0, 5): - store.update_cid_reference(cid_ref_abs_path, f"dou.test.{i}") + store.update_cid_refs(cid_ref_abs_path, f"dou.test.{i}") cid_reference_list.append(f"dou.test.{i}") line_count = 0 @@ -619,7 +619,7 @@ def test_update_cid_reference_content_multiple(pids, store): assert line_count == 6 -def test_delete_cid_reference_pid(pids, store): +def test_delete_cid_ref_pid(pids, store): """Test that delete_cid_reference deletes the given pid from the ref file.""" for pid in pids.keys(): entity = "refs" @@ -628,11 +628,11 @@ def test_delete_cid_reference_pid(pids, store): "/refs/", "/refs/cid/" ) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_reference(cid_ref_abs_path, pid) + store.write_cid_refs_file(cid_ref_abs_path, pid) pid_other = "dou.test.1" - store.update_cid_reference(cid_ref_abs_path, pid_other) - 
store.delete_cid_reference_pid(cid_ref_abs_path, pid) + store.update_cid_refs(cid_ref_abs_path, pid_other) + store.delete_cid_refs_pid(cid_ref_abs_path, pid) with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): @@ -641,7 +641,7 @@ def test_delete_cid_reference_pid(pids, store): assert value == pid_other -def test_delete_cid_reference_pid_not_found(pids, store): +def test_delete_cid_ref_pid_pid_not_found(pids, store): """Test that delete_cid_reference raises exception when pid not found.""" for pid in pids.keys(): entity = "refs" @@ -650,12 +650,12 @@ def test_delete_cid_reference_pid_not_found(pids, store): "/refs/", "/refs/cid/" ) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_reference(cid_ref_abs_path, pid) + store.write_cid_refs_file(cid_ref_abs_path, pid) pid_other = "dou.test.1" - store.update_cid_reference(cid_ref_abs_path, pid_other) + store.update_cid_refs(cid_ref_abs_path, pid_other) with pytest.raises(ValueError): - store.delete_cid_reference_pid(cid_ref_abs_path, "dou.not.found.1") + store.delete_cid_refs_pid(cid_ref_abs_path, "dou.not.found.1") def test_put_metadata_with_path(pids, store): From b7833f0639bd7084e51e2bbf4e4ed15082d751fa Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 12:15:46 -0800 Subject: [PATCH 019/420] Add new 'delete_cid_refs_file' method with new pytests --- src/hashstore/filehashstore.py | 40 ++++++++++++++++++++++++++++++- tests/test_filehashstore.py | 44 ++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 270aec98..3d94277c 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -504,7 +504,7 @@ def tag_object(self, pid, cid): else: # If not, create the cid ref file in '.../refs/cid' and write the pid self.create_path(os.path.dirname(cid_ref_abs_path)) - self.write_cid_ref_file(cid_ref_abs_path, pid) + 
self.write_cid_refs_file(cid_ref_abs_path, pid) # Then create the pid ref file in '.../refs/pid' with the cid as its content # TODO: Write the pid ref file that contains the cid finally: @@ -1151,6 +1151,44 @@ def delete_cid_refs_pid(self, cid_ref_abs_path, pid): logging.error(exception_string) raise err + def delete_cid_refs_file(self, cid_ref_abs_path): + """Delete a cid reference file. There must be no references remaining. + + Args: + cid_ref_abs_path (string): Absolute path to the cid ref file + pid (string): Authority-based or persistent identifier of object + """ + info_msg = ( + "FileHashStore - delete_cid_refs_file: Deleting reference file: %s", + cid_ref_abs_path, + ) + logging.info(info_msg) + + try: + if not os.path.exists(cid_ref_abs_path): + err_msg = ( + "FileHashStore - delete_cid_refs_file: Cid reference file not found: %s", + cid_ref_abs_path, + ) + raise FileNotFoundError(err_msg) + if os.path.getsize(cid_ref_abs_path) != 0: + err_msg = ( + "FileHashStore - delete_cid_refs_file: Failed to delete cid reference file." + + f" File is not empty: {cid_ref_abs_path} " + ) + raise OSError(err_msg) + else: + os.remove(cid_ref_abs_path) + return + + except Exception as err: + exception_string = ( + "FileHashStore - delete_cid_refs_file: failed to delete reference file:" + + f" {cid_ref_abs_path}. Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise err + def put_metadata(self, metadata, pid, format_id): """Store contents of metadata to `[self.root]/metadata` using the hash of the given pid and format_id as the permanent address. 
diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 472f29e1..caee71fe 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -658,6 +658,50 @@ def test_delete_cid_ref_pid_pid_not_found(pids, store): store.delete_cid_refs_pid(cid_ref_abs_path, "dou.not.found.1") +def test_delete_cid_ref_pid_file(pids, store): + """Test that delete_cid_refs_file deletes a reference file.""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.build_abs_path(entity, cid).replace( + "/refs/", "/refs/cid/" + ) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_refs_file(cid_ref_abs_path, pid) + store.delete_cid_refs_pid(cid_ref_abs_path, pid) + store.delete_cid_refs_file(cid_ref_abs_path) + + assert not os.path.exists(cid_ref_abs_path) + + +def test_delete_cid_ref_pid_file_not_empty(pids, store): + """Test that delete_cid_refs_file raises an exception when refs file not empty.""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.build_abs_path(entity, cid).replace( + "/refs/", "/refs/cid/" + ) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_refs_file(cid_ref_abs_path, pid) + + with pytest.raises(OSError): + store.delete_cid_refs_file(cid_ref_abs_path) + + +def test_delete_cid_ref_pid_file_not_found(pids, store): + """Test that delete_cid_refs_file raises an exception when refs file not found.""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.build_abs_path(entity, cid).replace( + "/refs/", "/refs/cid/" + ) + + with pytest.raises(FileNotFoundError): + store.delete_cid_refs_file(cid_ref_abs_path) + + def test_put_metadata_with_path(pids, store): """Test put_metadata with path object.""" entity = "metadata" From d4e8274d1bfcaefcccd4dc6b704144eb7976cfe1 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 12:24:55 -0800 Subject: [PATCH 
020/420] Add missing docstring for 'tag_object' --- src/hashstore/filehashstore.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 3d94277c..41fc822e 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -478,6 +478,12 @@ def store_object( return object_metadata def tag_object(self, pid, cid): + """Tag an object that has been stored with a pid reference. + + Args: + pid (string): Authority-based or persistent identifier of object + cid (string): Content identifier + """ # Wait for the cid to release if it's being tagged while cid in self.reference_locked_cids: logging.debug( @@ -500,7 +506,7 @@ def tag_object(self, pid, cid): ) if os.path.exists(cid_ref_abs_path): # If it does, read the file and add the new pid on its own line - print("Add pid to reference file") + self.update_cid_refs(cid_ref_abs_path, pid) else: # If not, create the cid ref file in '.../refs/cid' and write the pid self.create_path(os.path.dirname(cid_ref_abs_path)) From 28d46718ec67ea5c0042965b22343c33cc8115e3 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 12:53:03 -0800 Subject: [PATCH 021/420] Add new 'write_pid_refs_file' method with new pytests --- src/hashstore/filehashstore.py | 52 ++++++++++++++++++++++++++++++- tests/test_filehashstore.py | 56 ++++++++++++++++++++++++++-------- 2 files changed, 95 insertions(+), 13 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 41fc822e..3b6b0fb9 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -484,6 +484,7 @@ def tag_object(self, pid, cid): pid (string): Authority-based or persistent identifier of object cid (string): Content identifier """ + # TODO: Write tests for this method # Wait for the cid to release if it's being tagged while cid in self.reference_locked_cids: logging.debug( @@ -512,7 +513,11 @@ def tag_object(self, pid, 
cid): self.create_path(os.path.dirname(cid_ref_abs_path)) self.write_cid_refs_file(cid_ref_abs_path, pid) # Then create the pid ref file in '.../refs/pid' with the cid as its content - # TODO: Write the pid ref file that contains the cid + pid_hash = self.computehash(pid, self.algorithm) + pid_ref_abs_path = self.build_abs_path(entity, pid_hash).replace( + "/refs/", "/refs/pid/" + ) + self.write_pid_refs_file(pid_ref_abs_path, cid) finally: # Release cid with self.reference_lock: @@ -1195,6 +1200,51 @@ def delete_cid_refs_file(self, cid_ref_abs_path): logging.error(exception_string) raise err + def write_pid_refs_file(self, pid_ref_abs_path, cid): + """Write the reference file for the given pid (persistent identifier). A reference + file for a pid contains the cid that it references. Its permanent address is the pid + hash with HashStore's default store algorithm and follows its directory structure. + + Args: + pid_ref_abs_path (string): Absolute path to the pid ref file + cid (string): Content identifier + """ + info_msg = ( + f"FileHashStore - write_pid_refs_file: Writing cid ({cid}) into pid reference" + + f" file: {pid_ref_abs_path}" + ) + logging.info(info_msg) + + if os.path.exists(pid_ref_abs_path): + with open(pid_ref_abs_path, "r", encoding="utf8") as f: + pid_refs_cid = f.read() + if pid_refs_cid == cid: + return + else: + exception_string = ( + "FileHashStore - write_pid_refs_file: pid reference file exists but" + + f" cid ({cid}) is different from cid stored ({pid_refs_cid})." 
+ ) + logging.error(exception_string) + raise ValueError(exception_string) + else: + try: + with open(pid_ref_abs_path, "w", encoding="utf8") as pid_ref_file: + fcntl.flock(pid_ref_file, fcntl.LOCK_EX) + pid_ref_file.write(cid) + # The context manager will take care of releasing the lock + # But the code to explicitly release the lock if desired is below + # fcntl.flock(f, fcntl.LOCK_UN) + return + + except Exception as err: + exception_string = ( + "FileHashStore - write_pid_refs_file: failed to write pid reference file:" + + f" {pid_ref_abs_path} for cid: {cid}. Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise err + def put_metadata(self, metadata, pid, format_id): """Store contents of metadata to `[self.root]/metadata` using the hash of the given pid and format_id as the permanent address. diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index caee71fe..319f0c87 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -543,7 +543,7 @@ def test_mktempfile_with_unsupported_algorithm(pids, store): input_stream.close() -def test_write_cid_ref_file(pids, store): +def test_write_cid_refs_file(pids, store): """Test that write_cid_reference writes a reference file.""" for pid in pids.keys(): entity = "refs" @@ -556,7 +556,7 @@ def test_write_cid_ref_file(pids, store): assert os.path.exists(cid_ref_abs_path) -def test_write_cid_ref_file_content(pids, store): +def test_write_cid_refs_file_content(pids, store): """Test that write_cid_ref_file writes the expected content.""" for pid in pids.keys(): entity = "refs" @@ -573,7 +573,7 @@ def test_write_cid_ref_file_content(pids, store): assert pid == cid_ref_file_pid.replace("\n", "") -def test_update_cid_ref_content(pids, store): +def test_update_cid_refs_content(pids, store): """Test that update_cid_ref updates the ref file as expected.""" for pid in pids.keys(): entity = "refs" @@ -593,8 +593,8 @@ def test_update_cid_ref_content(pids, store): assert 
value == pid or value == pid_other -def test_update_cid_ref_content_multiple(pids, store): - """Test that update_cid_ref adds multiple references successfully.""" +def test_update_cid_refs_content_multiple(pids, store): + """Test that update_cid_refs adds multiple references successfully.""" for pid in pids.keys(): entity = "refs" cid = pids[pid]["sha256"] @@ -619,8 +619,8 @@ def test_update_cid_ref_content_multiple(pids, store): assert line_count == 6 -def test_delete_cid_ref_pid(pids, store): - """Test that delete_cid_reference deletes the given pid from the ref file.""" +def test_delete_cid_refs_pid(pids, store): + """Test that delete_cid_refs_pid deletes the given pid from the ref file.""" for pid in pids.keys(): entity = "refs" cid = pids[pid]["sha256"] @@ -641,8 +641,8 @@ def test_delete_cid_ref_pid(pids, store): assert value == pid_other -def test_delete_cid_ref_pid_pid_not_found(pids, store): - """Test that delete_cid_reference raises exception when pid not found.""" +def test_delete_cid_refs_pid_pid_not_found(pids, store): + """Test that delete_cid_refs_pid raises exception when pid not found.""" for pid in pids.keys(): entity = "refs" cid = pids[pid]["sha256"] @@ -658,7 +658,7 @@ def test_delete_cid_ref_pid_pid_not_found(pids, store): store.delete_cid_refs_pid(cid_ref_abs_path, "dou.not.found.1") -def test_delete_cid_ref_pid_file(pids, store): +def test_delete_cid_refs_pid_file(pids, store): """Test that delete_cid_refs_file deletes a reference file.""" for pid in pids.keys(): entity = "refs" @@ -674,7 +674,7 @@ def test_delete_cid_ref_pid_file(pids, store): assert not os.path.exists(cid_ref_abs_path) -def test_delete_cid_ref_pid_file_not_empty(pids, store): +def test_delete_cid_refs_pid_file_not_empty(pids, store): """Test that delete_cid_refs_file raises an exception when refs file not empty.""" for pid in pids.keys(): entity = "refs" @@ -689,7 +689,7 @@ def test_delete_cid_ref_pid_file_not_empty(pids, store): 
store.delete_cid_refs_file(cid_ref_abs_path) -def test_delete_cid_ref_pid_file_not_found(pids, store): +def test_delete_cid_refs_pid_file_not_found(pids, store): """Test that delete_cid_refs_file raises an exception when refs file not found.""" for pid in pids.keys(): entity = "refs" @@ -702,6 +702,38 @@ def test_delete_cid_ref_pid_file_not_found(pids, store): store.delete_cid_refs_file(cid_ref_abs_path) +def test_write_pid_refs_file(pids, store): + """Test that write_pid_refs_file writes a reference file.""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + pid_hash = store.computehash(pid, store.algorithm) + pid_ref_abs_path = store.build_abs_path(entity, pid_hash).replace( + "/refs/", "/refs/pid/" + ) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store.write_pid_refs_file(pid_ref_abs_path, cid) + assert os.path.exists(pid_ref_abs_path) + + +def test_write_pid_refs_file_content(pids, store): + """Test that write_pid_refs_file writes the expected content.""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + pid_hash = store.computehash(pid, store.algorithm) + pid_ref_abs_path = store.build_abs_path(entity, pid_hash).replace( + "/refs/", "/refs/pid/" + ) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store.write_pid_refs_file(pid_ref_abs_path, cid) + + with open(pid_ref_abs_path, "r", encoding="utf8") as f: + pid_refs_cid = f.read() + + assert cid == pid_refs_cid + + def test_put_metadata_with_path(pids, store): """Test put_metadata with path object.""" entity = "metadata" From 5689c3c7f961a6d82893053d6ae25ca6e08c4dcf Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 12:57:02 -0800 Subject: [PATCH 022/420] Add new pytests for 'write_pid_refs_file' method --- tests/test_filehashstore.py | 36 ++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 319f0c87..569f0fae 100644 ---
a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -734,6 +734,42 @@ def test_write_pid_refs_file_content(pids, store): assert cid == pid_refs_cid +def test_write_pid_refs_file_exists(pids, store): + """Test that write_pid_refs_file returns when ref already exists and the + cid given is the same.""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + pid_hash = store.computehash(pid, store.algorithm) + pid_ref_abs_path = store.build_abs_path(entity, pid_hash).replace( + "/refs/", "/refs/pid/" + ) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store.write_pid_refs_file(pid_ref_abs_path, cid) + # This should not write and return + store.write_pid_refs_file(pid_ref_abs_path, cid) + + with open(pid_ref_abs_path, "r", encoding="utf8") as f: + pid_refs_cid = f.read() + + assert cid == pid_refs_cid + + +def test_write_pid_refs_file_exists_different_cid(pids, store): + """Test that write_pid_refs_file raises ValueError when ref already exists + and the cid given is different.""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + pid_hash = store.computehash(pid, store.algorithm) + pid_ref_abs_path = store.build_abs_path(entity, pid_hash).replace( + "/refs/", "/refs/pid/" + ) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store.write_pid_refs_file(pid_ref_abs_path, cid) + with pytest.raises(ValueError): + store.write_pid_refs_file(pid_ref_abs_path, "abc123") + def test_put_metadata_with_path(pids, store): """Test put_metadata with path object.""" entity = "metadata" From 9f5cb601efcb513249031f9bde3d7df25a0d3f98 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 13:04:21 -0800 Subject: [PATCH 023/420] Add new method 'delete_pid_refs_file' with new pytests --- src/hashstore/filehashstore.py | 32 +++++++++++++++++++++++++++++++- tests/test_filehashstore.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py
b/src/hashstore/filehashstore.py index 3b6b0fb9..a5f64b6c 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1167,7 +1167,6 @@ def delete_cid_refs_file(self, cid_ref_abs_path): Args: cid_ref_abs_path (string): Absolute path to the cid ref file - pid (string): Authority-based or persistent identifier of object """ info_msg = ( "FileHashStore - delete_cid_refs_file: Deleting reference file: %s", @@ -1245,6 +1244,37 @@ def write_pid_refs_file(self, pid_ref_abs_path, cid): logging.error(exception_string) raise err + def delete_pid_refs_file(self, pid_ref_abs_path): + """Delete a pid reference file. + + Args: + pid_ref_abs_path (string): Absolute path to the pid ref file + """ + info_msg = ( + "FileHashStore - delete_pid_refs_file: Deleting reference file: %s", + pid_ref_abs_path, + ) + logging.info(info_msg) + + try: + if not os.path.exists(pid_ref_abs_path): + err_msg = ( + "FileHashStore - delete_pid_refs_file: pid reference file not found: %s", + pid_ref_abs_path, + ) + raise FileNotFoundError(err_msg) + else: + os.remove(pid_ref_abs_path) + return + + except Exception as err: + exception_string = ( + "FileHashStore - delete_pid_refs_file: failed to delete reference file:" + + f" {pid_ref_abs_path}. Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise err + def put_metadata(self, metadata, pid, format_id): """Store contents of metadata to `[self.root]/metadata` using the hash of the given pid and format_id as the permanent address. 
diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 569f0fae..eb63fcca 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -770,6 +770,36 @@ def test_write_pid_refs_file_exists_different_cid(pids, store): with pytest.raises(ValueError): store.write_pid_refs_file(pid_ref_abs_path, "abc123") + +def test_delete_pid_refs_file(pids, store): + """Test that delete_pid_refs_file deletes a reference file.""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + pid_hash = store.computehash(pid, store.algorithm) + pid_ref_abs_path = store.build_abs_path(entity, pid_hash).replace( + "/refs/", "/refs/pid/" + ) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store.write_pid_refs_file(pid_ref_abs_path, cid) + store.delete_pid_refs_file(pid_ref_abs_path) + + assert not os.path.exists(pid_ref_abs_path) + + +def test_delete_pid_refs_file_file_not_found(pids, store): + """Test that delete_pid_refs_file raises an exception when refs file not found.""" + for pid in pids.keys(): + entity = "refs" + pid_hash = store.computehash(pid, store.algorithm) + pid_ref_abs_path = store.build_abs_path(entity, pid_hash).replace( + "/refs/", "/refs/pid/" + ) + + with pytest.raises(FileNotFoundError): + store.delete_cid_refs_file(pid_ref_abs_path) + + def test_put_metadata_with_path(pids, store): """Test put_metadata with path object.""" entity = "metadata" From 9c6509e16484511a57eb388c07221a601200c3da Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 13:12:43 -0800 Subject: [PATCH 024/420] Update --run-slow pytests --- tests/test_filehashstore_interface.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 329af168..1f0fef3d 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -511,8 +511,7 @@ def test_store_object_large_file(store): pid = 
"testfile_filehashstore" object_metadata = store.store_object(pid, file_path) object_metadata_id = object_metadata.id - pid_sha256_hex_digest = store.get_sha256_hex_digest(pid) - assert object_metadata_id == pid_sha256_hex_digest + assert object_metadata_id == object_metadata.hex_digests.get("sha256") @slow_test @@ -531,8 +530,7 @@ def test_store_object_sparse_large_file(store): pid = "testfile_filehashstore" object_metadata = store.store_object(pid, file_path) object_metadata_id = object_metadata.id - pid_sha256_hex_digest = store.get_sha256_hex_digest(pid) - assert object_metadata_id == pid_sha256_hex_digest + assert object_metadata_id == object_metadata.hex_digests.get("sha256") def test_store_metadata(pids, store): From 2738d3fb8bcf166160f3055a8caf20fa5145591f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 13:20:27 -0800 Subject: [PATCH 025/420] Code 'find_object' method, missing pytests --- src/hashstore/filehashstore.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index a5f64b6c..8368e0f3 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -531,9 +531,30 @@ def tag_object(self, pid, cid): return def find_object(self, pid): + logging.debug( + "FileHashStore - find_object: Request to find object for for pid: %s", pid + ) + # TODO: Write tests for this method + self._is_string_none_or_empty(pid, "pid", "find_object") + # Get the path to the pid reference by calculating its hash in '.../refs/pid' - # Read the file to get the cid from the pid reference and return it - return + entity = "refs" + pid_hash = self.computehash(pid, self.algorithm) + pid_ref_abs_path = self.build_abs_path(entity, pid_hash).replace( + "/refs/", "/refs/pid/" + ) + if not os.path.exists(pid_ref_abs_path): + err_msg = ( + f"FileHashStore - find_object: pid ({pid}) reference file not found: " + + pid_ref_abs_path, + ) + raise 
FileNotFoundError(err_msg) + else: + # Read the file to get the cid from the pid reference + with open(pid_ref_abs_path, "r", encoding="utf8") as f: + pid_refs_cid = f.read() + + return pid_refs_cid def store_metadata(self, pid, metadata, format_id=None): logging.debug( From 2e03a6a01371c76731e0f0ea2e922af38ac1cdd8 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 13:29:46 -0800 Subject: [PATCH 026/420] Fix bug in 'tag_object', refactor 'retrieve_object' method and update pytests --- src/hashstore/filehashstore.py | 8 +++----- tests/test_filehashstore_interface.py | 3 ++- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 8368e0f3..b69dffc0 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -517,6 +517,7 @@ def tag_object(self, pid, cid): pid_ref_abs_path = self.build_abs_path(entity, pid_hash).replace( "/refs/", "/refs/pid/" ) + self.create_path(os.path.dirname(pid_ref_abs_path)) self.write_pid_refs_file(pid_ref_abs_path, cid) finally: # Release cid @@ -609,13 +610,10 @@ def retrieve_object(self, pid): ) self._is_string_none_or_empty(pid, "pid", "retrieve_object") - # TODO: Find object from the pid reference file - + object_cid = self.find_object(pid) entity = "objects" - object_cid = self.get_sha256_hex_digest(pid) - object_exists = self.exists(entity, object_cid) - if object_exists: + if object_cid: logging.debug( "FileHashStore - retrieve_object: Metadata exists for pid: %s, retrieving object.", pid, diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 1f0fef3d..1da70497 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -705,6 +705,7 @@ def test_retrieve_object(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) 
store.store_metadata(pid, syspath, format_id) obj_stream = store.retrieve_object(pid) sha256_hex = store.computehash(obj_stream) @@ -723,7 +724,7 @@ def test_retrieve_object_pid_invalid(store): """Test retrieve_object raises error when supplied with bad pid.""" pid = "jtao.1700.1" pid_does_not_exist = pid + "test" - with pytest.raises(ValueError): + with pytest.raises(FileNotFoundError): store.retrieve_object(pid_does_not_exist) From 5b6a15fca61265c70f5924e8806c9d46d906633a Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 13:33:18 -0800 Subject: [PATCH 027/420] Fix retrieve_object pytest in test_hashstore_client --- tests/test_hashstore_client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index f3f24477..999f26ad 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -108,7 +108,8 @@ def test_retrieve_objects(capsys, pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - _object_metadata = store.store_object(pid, path) + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) client_module_path = f"{client_directory}/client.py" test_store = store.root From e698c4047ff769c44e593f0058a39393c13d7fd5 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 13:35:53 -0800 Subject: [PATCH 028/420] Refactor 'get_hex_digest' method and update pytests --- src/hashstore/filehashstore.py | 4 +--- tests/test_filehashstore_interface.py | 5 +++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index b69dffc0..7aa2f0c5 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -702,11 +702,9 @@ def get_hex_digest(self, pid, algorithm): self._is_string_none_or_empty(pid, "pid", "get_hex_digest") self._is_string_none_or_empty(algorithm, "algorithm", 
"get_hex_digest") - # TODO: Find object from the pid reference file - entity = "objects" algorithm = self.clean_algorithm(algorithm) - object_cid = self.get_sha256_hex_digest(pid) + object_cid = self.find_object(pid) if not self.exists(entity, object_cid): exception_string = ( f"FileHashStore - get_hex_digest: No object found for pid: {pid}" diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 1da70497..96c5ebcc 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -884,7 +884,8 @@ def test_get_hex_digest(store): path = test_dir + pid filename = pid + ".xml" syspath = Path(test_dir) / filename - _object_metadata = store.store_object(pid, path) + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) _metadata_cid = store.store_metadata(pid, syspath, format_id) sha3_256_hex_digest = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" @@ -898,7 +899,7 @@ def test_get_hex_digest_pid_not_found(store): pid = "jtao.1700.1" pid_does_not_exist = pid + "test" algorithm = "sha256" - with pytest.raises(ValueError): + with pytest.raises(FileNotFoundError): store.get_hex_digest(pid_does_not_exist, algorithm) From 81bb767f3134e6bfcf77cd0660b839c5238c714d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 12 Nov 2023 09:42:22 -0800 Subject: [PATCH 029/420] Initial refactor to 'delete_object' to find correct object to delete and update pytests --- src/hashstore/filehashstore.py | 6 +----- tests/test_filehashstore_interface.py | 3 ++- tests/test_hashstore_client.py | 3 ++- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 7aa2f0c5..ebfa0c8b 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -662,12 +662,8 @@ def delete_object(self, pid): ) self._is_string_none_or_empty(pid, "pid", "delete_object") - # TODO: Also find the reference 
file and delete it if there's only one ref - # Else delete the pid in the cid refs file - # Also delete the pid ref file - entity = "objects" - object_cid = self.get_sha256_hex_digest(pid) + object_cid = self.find_object(pid) self.delete(entity, object_cid) logging.info( diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 96c5ebcc..316d1676 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -803,7 +803,8 @@ def test_delete_objects(pids, store): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _object_metadata = store.store_object(pid, path) + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) _metadata_cid = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) assert store.count(entity) == 0 diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 999f26ad..7d1e01a0 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -189,7 +189,8 @@ def test_delete_objects(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - _object_metadata = store.store_object(pid, path) + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) client_module_path = f"{client_directory}/client.py" test_store = store.root From ccaa768538968f0ad6d1243fad5061cdf231526e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 12 Nov 2023 10:11:41 -0800 Subject: [PATCH 030/420] Add new method 'get_refs_abs_path' and refactor 'FileHashStore' and pytests --- src/hashstore/filehashstore.py | 38 ++++++++++------ tests/test_filehashstore.py | 83 ++++++---------------------------- 2 files changed, 39 insertions(+), 82 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index ebfa0c8b..ec4316de 100644 --- 
a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -501,10 +501,7 @@ def tag_object(self, pid, cid): self.reference_locked_cids.append(cid) try: # Check to see if reference file already exists for the cid - entity = "refs" - cid_ref_abs_path = self.build_abs_path(entity, cid).replace( - "/refs/", "/refs/cid/" - ) + cid_ref_abs_path = self.get_refs_abs_path("cid", cid) if os.path.exists(cid_ref_abs_path): # If it does, read the file and add the new pid on its own line self.update_cid_refs(cid_ref_abs_path, pid) @@ -513,10 +510,7 @@ def tag_object(self, pid, cid): self.create_path(os.path.dirname(cid_ref_abs_path)) self.write_cid_refs_file(cid_ref_abs_path, pid) # Then create the pid ref file in '.../refs/pid' with the cid as its content - pid_hash = self.computehash(pid, self.algorithm) - pid_ref_abs_path = self.build_abs_path(entity, pid_hash).replace( - "/refs/", "/refs/pid/" - ) + pid_ref_abs_path = self.get_refs_abs_path("pid", pid) self.create_path(os.path.dirname(pid_ref_abs_path)) self.write_pid_refs_file(pid_ref_abs_path, cid) finally: @@ -538,12 +532,7 @@ def find_object(self, pid): # TODO: Write tests for this method self._is_string_none_or_empty(pid, "pid", "find_object") - # Get the path to the pid reference by calculating its hash in '.../refs/pid' - entity = "refs" - pid_hash = self.computehash(pid, self.algorithm) - pid_ref_abs_path = self.build_abs_path(entity, pid_hash).replace( - "/refs/", "/refs/pid/" - ) + pid_ref_abs_path = self.get_refs_abs_path("pid", pid) if not os.path.exists(pid_ref_abs_path): err_msg = ( f"FileHashStore - find_object: pid ({pid}) reference file not found: " @@ -662,6 +651,10 @@ def delete_object(self, pid): ) self._is_string_none_or_empty(pid, "pid", "delete_object") + # Remove pid from cid reference file + # self.delete_cid_refs_pid(, pid) + # Delete cid reference file if it's empty + # Delete pid reference file entity = "objects" object_cid = self.find_object(pid) self.delete(entity, object_cid) @@ 
-1836,6 +1829,23 @@ def build_abs_path(self, entity, hash_id, extension=""): absolute_path = os.path.join(root_dir, *paths) + extension return absolute_path + def get_refs_abs_path(self, ref_type, pid): + """Get the absolute path to the reference file for the given pid. + + Args: + ref_type (string): 'pid' or 'cid' + pid (string): Authority-based or persistent identifier + + Returns: + ref_file_abs_path (string): Path to the ref file for the given type and pid + """ + entity = "refs" + pid_hash = self.computehash(pid, self.algorithm) + ref_file_abs_path = self.build_abs_path(entity, pid_hash).replace( + "/refs/", f"/refs/{ref_type}/" + ) + return ref_file_abs_path + def count(self, entity): """Return count of the number of files in the `root` directory. diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index eb63fcca..9e546f5b 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -546,11 +546,8 @@ def test_mktempfile_with_unsupported_algorithm(pids, store): def test_write_cid_refs_file(pids, store): """Test that write_cid_reference writes a reference file.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - cid_ref_abs_path = store.build_abs_path(entity, cid).replace( - "/refs/", "/refs/cid/" - ) + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) store.write_cid_refs_file(cid_ref_abs_path, pid) assert os.path.exists(cid_ref_abs_path) @@ -559,11 +556,8 @@ def test_write_cid_refs_file(pids, store): def test_write_cid_refs_file_content(pids, store): """Test that write_cid_ref_file writes the expected content.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - cid_ref_abs_path = store.build_abs_path(entity, cid).replace( - "/refs/", "/refs/cid/" - ) + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) store.write_cid_refs_file(cid_ref_abs_path, pid) @@ -576,11 +570,8 
@@ def test_write_cid_refs_file_content(pids, store): def test_update_cid_refs_content(pids, store): """Test that update_cid_ref updates the ref file as expected.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - cid_ref_abs_path = store.build_abs_path(entity, cid).replace( - "/refs/", "/refs/cid/" - ) + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) store.write_cid_refs_file(cid_ref_abs_path, pid) @@ -596,11 +587,8 @@ def test_update_cid_refs_content(pids, store): def test_update_cid_refs_content_multiple(pids, store): """Test that update_cid_refs adds multiple references successfully.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - cid_ref_abs_path = store.build_abs_path(entity, cid).replace( - "/refs/", "/refs/cid/" - ) + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) store.write_cid_refs_file(cid_ref_abs_path, pid) @@ -622,11 +610,8 @@ def test_update_cid_refs_content_multiple(pids, store): def test_delete_cid_refs_pid(pids, store): """Test that delete_cid_refs_pid deletes the given pid from the ref file.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - cid_ref_abs_path = store.build_abs_path(entity, cid).replace( - "/refs/", "/refs/cid/" - ) + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) store.write_cid_refs_file(cid_ref_abs_path, pid) @@ -644,11 +629,8 @@ def test_delete_cid_refs_pid(pids, store): def test_delete_cid_refs_pid_pid_not_found(pids, store): """Test that delete_cid_refs_pid raises exception when pid not found.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - cid_ref_abs_path = store.build_abs_path(entity, cid).replace( - "/refs/", "/refs/cid/" - ) + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) 
store.write_cid_refs_file(cid_ref_abs_path, pid) @@ -661,11 +643,8 @@ def test_delete_cid_refs_pid_pid_not_found(pids, store): def test_delete_cid_refs_pid_file(pids, store): """Test that delete_cid_refs_file deletes a reference file.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - cid_ref_abs_path = store.build_abs_path(entity, cid).replace( - "/refs/", "/refs/cid/" - ) + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) store.write_cid_refs_file(cid_ref_abs_path, pid) store.delete_cid_refs_pid(cid_ref_abs_path, pid) @@ -677,11 +656,8 @@ def test_delete_cid_refs_pid_file(pids, store): def test_delete_cid_refs_pid_file_not_empty(pids, store): """Test that delete_cid_refs_file raises an exception when refs file not empty.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - cid_ref_abs_path = store.build_abs_path(entity, cid).replace( - "/refs/", "/refs/cid/" - ) + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) store.write_cid_refs_file(cid_ref_abs_path, pid) @@ -692,12 +668,8 @@ def test_delete_cid_refs_pid_file_not_empty(pids, store): def test_delete_cid_refs_pid_file_not_found(pids, store): """Test that delete_cid_refs_file raises an exception when refs file not found.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - cid_ref_abs_path = store.build_abs_path(entity, cid).replace( - "/refs/", "/refs/cid/" - ) - + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) with pytest.raises(FileNotFoundError): store.delete_cid_refs_file(cid_ref_abs_path) @@ -705,12 +677,8 @@ def test_delete_cid_refs_pid_file_not_found(pids, store): def test_write_pid_refs_file(pids, store): """Test that write_pid_refs_file writes a reference file.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - pid_hash = store.computehash(pid, store.algorithm) - pid_ref_abs_path = 
store.build_abs_path(entity, pid_hash).replace( - "/refs/", "/refs/pid/" - ) + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) store.write_pid_refs_file(pid_ref_abs_path, cid) assert os.path.exists(pid_ref_abs_path) @@ -719,12 +687,8 @@ def test_write_pid_refs_file(pids, store): def test_write_pid_refs_file_content(pids, store): """Test that write_pid_refs_file writes the expected content.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - pid_hash = store.computehash(pid, store.algorithm) - pid_ref_abs_path = store.build_abs_path(entity, pid_hash).replace( - "/refs/", "/refs/pid/" - ) + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) store.write_pid_refs_file(pid_ref_abs_path, cid) @@ -738,12 +702,8 @@ def test_write_pid_refs_file_exists(pids, store): """Test that write_pid_refs_file returns when ref already exists and the cid given is the same.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - pid_hash = store.computehash(pid, store.algorithm) - pid_ref_abs_path = store.build_abs_path(entity, pid_hash).replace( - "/refs/", "/refs/pid/" - ) + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) store.write_pid_refs_file(pid_ref_abs_path, cid) # This should not write and return @@ -759,12 +719,8 @@ def test_write_pid_refs_file_exists_different_cid(pids, store): """Test that write_pid_refs_file returns when ref already exists and the cid given is the same.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - pid_hash = store.computehash(pid, store.algorithm) - pid_ref_abs_path = store.build_abs_path(entity, pid_hash).replace( - "/refs/", "/refs/pid/" - ) + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) store.write_pid_refs_file(pid_ref_abs_path, cid) with 
pytest.raises(ValueError): @@ -774,12 +730,8 @@ def test_write_pid_refs_file_exists_different_cid(pids, store): def test_delete_pid_refs_file(pids, store): """Test that delete_pid_refs_file deletes a reference file.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - pid_hash = store.computehash(pid, store.algorithm) - pid_ref_abs_path = store.build_abs_path(entity, pid_hash).replace( - "/refs/", "/refs/pid/" - ) + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) store.write_pid_refs_file(pid_ref_abs_path, cid) store.delete_pid_refs_file(pid_ref_abs_path) @@ -790,12 +742,7 @@ def test_delete_pid_refs_file(pids, store): def test_delete_pid_refs_file_file_not_found(pids, store): """Test that delete_pid_refs_file raises an exception when refs file not found.""" for pid in pids.keys(): - entity = "refs" - pid_hash = store.computehash(pid, store.algorithm) - pid_ref_abs_path = store.build_abs_path(entity, pid_hash).replace( - "/refs/", "/refs/pid/" - ) - + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) with pytest.raises(FileNotFoundError): store.delete_cid_refs_file(pid_ref_abs_path) From 8d44f42eb39e5204a8fff6c6a31d0df585ba177a Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 12 Nov 2023 10:31:03 -0800 Subject: [PATCH 031/420] Delete redundant 'get_sha256_hex_digest' method and refactor FileHashStore class and pytests --- src/hashstore/filehashstore.py | 26 +++++++------------------- tests/test_filehashstore.py | 7 ------- 2 files changed, 7 insertions(+), 26 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index ec4316de..9c9203ee 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -629,7 +629,7 @@ def retrieve_metadata(self, pid, format_id=None): checked_format_id = self._validate_format_id(format_id, "retrieve_metadata") entity = "metadata" - metadata_cid = self.get_sha256_hex_digest(pid + 
checked_format_id) + metadata_cid = self.computehash(pid + checked_format_id) metadata_exists = self.exists(entity, metadata_cid) if metadata_exists: metadata_stream = self.open(entity, metadata_cid) @@ -674,7 +674,7 @@ def delete_metadata(self, pid, format_id=None): checked_format_id = self._validate_format_id(format_id, "delete_metadata") entity = "metadata" - metadata_cid = self.get_sha256_hex_digest(pid + checked_format_id) + metadata_cid = self.computehash(pid + checked_format_id) self.delete(entity, metadata_cid) logging.info( @@ -1302,7 +1302,7 @@ def put_metadata(self, metadata, pid, format_id): metadata_tmp = self._mktmpmetadata(metadata_stream) # Get target and related paths (permanent location) - metadata_cid = self.get_sha256_hex_digest(pid + format_id) + metadata_cid = self.computehash(pid + format_id) rel_path = "/".join(self.shard(metadata_cid)) full_path = self.get_store_path("metadata") / rel_path @@ -1602,11 +1602,12 @@ def clean_algorithm(self, algorithm_string): return cleaned_string def computehash(self, stream, algorithm=None): - """Compute hash of a file-like object using :attr:`algorithm` by default - or with optional algorithm supported. + """Compute the hash of a file-like object (or string) using :attr:`algorithm` by + default or with optional algorithm supported. Args: - stream (io.BufferedReader): A buffered stream of an object_cid object. \n + stream (mixed): A buffered stream (io.BufferedReader) of an object. A string is + also acceptable as they are a sequence of characters (Python only).\n algorithm (string): Algorithm of hex digest to generate. Returns: @@ -1926,19 +1927,6 @@ def _to_bytes(text): text = bytes(text, "utf8") return text - @staticmethod - def get_sha256_hex_digest(string): - """Calculate the SHA-256 digest of a UTF-8 encoded string. - - Args: - string (string): String to convert. - - Returns: - hex (string): Hexadecimal string. 
- """ - hex_digest = hashlib.sha256(string.encode("utf-8")).hexdigest() - return hex_digest - class Stream(object): """Common interface for file-like objects. diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 9e546f5b..526444cb 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1090,10 +1090,3 @@ def test_to_bytes(store): # pylint: disable=W0212 string_bytes = store._to_bytes(string) assert isinstance(string_bytes, bytes) - - -def test_get_sha256_hex_digest(pids, store): - """Test for correct sha256 return value.""" - for pid in pids: - hash_val = store.get_sha256_hex_digest(pid) - assert hash_val == pids[pid]["object_cid"] From 1a921a0937d0786130f14687470227a4c2c8f8ad Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 12 Nov 2023 10:59:10 -0800 Subject: [PATCH 032/420] Refactor 'delete_object' to delete all required pid or cid reference files --- src/hashstore/filehashstore.py | 36 ++++++++++++++++++++++------------ tests/test_filehashstore.py | 11 ++++++----- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 9c9203ee..bf52019e 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -652,12 +652,19 @@ def delete_object(self, pid): self._is_string_none_or_empty(pid, "pid", "delete_object") # Remove pid from cid reference file - # self.delete_cid_refs_pid(, pid) - # Delete cid reference file if it's empty + cid = self.find_object(pid) + cid_ref_abs_path = self.get_refs_abs_path("cid", cid) + self.delete_cid_refs_pid(cid_ref_abs_path, pid) + # Delete cid reference file + # If the file is not empty, it will not be deleted. 
+ cid_refs_deleted = self.delete_cid_refs_file(cid_ref_abs_path) # Delete pid reference file - entity = "objects" - object_cid = self.find_object(pid) - self.delete(entity, object_cid) + pid_ref_abs_path = self.get_refs_abs_path("pid", pid) + self.delete_pid_refs_file(pid_ref_abs_path) + # Finally, delete the object + if cid_refs_deleted: + entity = "objects" + self.delete(entity, cid) logging.info( "FileHashStore - delete_object: Successfully deleted object for pid: %s", @@ -1173,6 +1180,9 @@ def delete_cid_refs_file(self, cid_ref_abs_path): Args: cid_ref_abs_path (string): Absolute path to the cid ref file + + Returns: + boolean: True if deleted, False if not """ info_msg = ( "FileHashStore - delete_cid_refs_file: Deleting reference file: %s", @@ -1188,14 +1198,15 @@ def delete_cid_refs_file(self, cid_ref_abs_path): ) raise FileNotFoundError(err_msg) if os.path.getsize(cid_ref_abs_path) != 0: - err_msg = ( + warn_msg = ( "FileHashStore - delete_cid_refs_file: Failed to delete cid reference file." + f" File is not empty: {cid_ref_abs_path} " ) - raise OSError(err_msg) + logging.warning(warn_msg) + return False else: os.remove(cid_ref_abs_path) - return + return True except Exception as err: exception_string = ( @@ -1830,19 +1841,20 @@ def build_abs_path(self, entity, hash_id, extension=""): absolute_path = os.path.join(root_dir, *paths) + extension return absolute_path - def get_refs_abs_path(self, ref_type, pid): + def get_refs_abs_path(self, ref_type, hash_id): """Get the absolute path to the reference file for the given pid. 
Args: ref_type (string): 'pid' or 'cid' - pid (string): Authority-based or persistent identifier + hash_id (string): Authority-based, persistent or hash identifier Returns: ref_file_abs_path (string): Path to the ref file for the given type and pid """ entity = "refs" - pid_hash = self.computehash(pid, self.algorithm) - ref_file_abs_path = self.build_abs_path(entity, pid_hash).replace( + if ref_type is "pid": + hash_id = self.computehash(hash_id, self.algorithm) + ref_file_abs_path = self.build_abs_path(entity, hash_id).replace( "/refs/", f"/refs/{ref_type}/" ) return ref_file_abs_path diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 526444cb..83cd1e62 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -648,21 +648,22 @@ def test_delete_cid_refs_pid_file(pids, store): store.create_path(os.path.dirname(cid_ref_abs_path)) store.write_cid_refs_file(cid_ref_abs_path, pid) store.delete_cid_refs_pid(cid_ref_abs_path, pid) - store.delete_cid_refs_file(cid_ref_abs_path) + cid_refs_deleted = store.delete_cid_refs_file(cid_ref_abs_path) + assert cid_refs_deleted assert not os.path.exists(cid_ref_abs_path) def test_delete_cid_refs_pid_file_not_empty(pids, store): - """Test that delete_cid_refs_file raises an exception when refs file not empty.""" + """Test that delete_cid_refs_file does not raise an exception when refs file + is not empty.""" for pid in pids.keys(): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) store.write_cid_refs_file(cid_ref_abs_path, pid) - - with pytest.raises(OSError): - store.delete_cid_refs_file(cid_ref_abs_path) + cid_refs_deleted = store.delete_cid_refs_file(cid_ref_abs_path) + assert not cid_refs_deleted def test_delete_cid_refs_pid_file_not_found(pids, store): From 4fee82323e0a02ded076dddf58a72c87943777fa Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 12 Nov 2023 11:44:39 -0800 Subject: [PATCH 033/420] 
Synchronized 'delete_object' method with 'tag_object' method on cid value --- src/hashstore/filehashstore.py | 71 ++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 28 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index bf52019e..795b1f28 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -478,17 +478,11 @@ def store_object( return object_metadata def tag_object(self, pid, cid): - """Tag an object that has been stored with a pid reference. - - Args: - pid (string): Authority-based or persistent identifier of object - cid (string): Content identifier - """ # TODO: Write tests for this method # Wait for the cid to release if it's being tagged while cid in self.reference_locked_cids: logging.debug( - "FileHashStore - tag_object: (cid) %s is currently being tagged. Waiting.", + "FileHashStore - tag_object: (cid) %s is currently locked. Waiting.", cid, ) time.sleep(self.time_out_sec) @@ -650,27 +644,49 @@ def delete_object(self, pid): "FileHashStore - delete_object: Request to delete object for pid: %s", pid ) self._is_string_none_or_empty(pid, "pid", "delete_object") - - # Remove pid from cid reference file cid = self.find_object(pid) - cid_ref_abs_path = self.get_refs_abs_path("cid", cid) - self.delete_cid_refs_pid(cid_ref_abs_path, pid) - # Delete cid reference file - # If the file is not empty, it will not be deleted. - cid_refs_deleted = self.delete_cid_refs_file(cid_ref_abs_path) - # Delete pid reference file - pid_ref_abs_path = self.get_refs_abs_path("pid", pid) - self.delete_pid_refs_file(pid_ref_abs_path) - # Finally, delete the object - if cid_refs_deleted: - entity = "objects" - self.delete(entity, cid) - logging.info( - "FileHashStore - delete_object: Successfully deleted object for pid: %s", - pid, - ) - return True + while cid in self.reference_locked_cids: + logging.debug( + "FileHashStore - delete_object: (cid) %s is currently locked. 
Waiting", + cid, + ) + time.sleep(self.time_out_sec) + # Modify reference_locked_cids consecutively + with self.reference_lock: + logging.debug( + "FileHashStore - delete_object: Adding cid: %s to reference_locked_cids.", + cid, + ) + self.reference_locked_cids.append(cid) + try: + # Remove pid from cid reference file + cid_ref_abs_path = self.get_refs_abs_path("cid", cid) + self.delete_cid_refs_pid(cid_ref_abs_path, pid) + # Delete cid reference file + # If the file is not empty, it will not be deleted. + cid_refs_deleted = self.delete_cid_refs_file(cid_ref_abs_path) + # Delete pid reference file + pid_ref_abs_path = self.get_refs_abs_path("pid", pid) + self.delete_pid_refs_file(pid_ref_abs_path) + # Finally, delete the object + if cid_refs_deleted: + entity = "objects" + self.delete(entity, cid) + return True + finally: + # Release cid + with self.reference_lock: + logging.debug( + "FileHashStore - delete_object: Removing cid: %s from reference_locked_cids.", + cid, + ) + self.reference_locked_cids.remove(cid) + info_msg = ( + "FileHashStore - delete_object: Successfully deleted references and/or" + + f" objects associated with pid: {pid}" + ) + logging.info(info_msg) def delete_metadata(self, pid, format_id=None): logging.debug( @@ -1063,7 +1079,7 @@ def delete_tmp_file(): def write_cid_refs_file(self, cid_ref_abs_path, pid): """Write the reference file for the given content identifier (cid). A reference - file contains every pid that references a cid on a new line. + file contains every pid that references a cid each on its own line. 
Args: cid_ref_abs_path (string): Absolute path to the cid ref file @@ -1164,7 +1180,6 @@ def delete_cid_refs_pid(self, cid_ref_abs_path, pid): # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) - return except Exception as err: From 6316044bc8e0e7930533827f80bfa98afcd8b639 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 12 Nov 2023 11:55:23 -0800 Subject: [PATCH 034/420] Add new pytests for 'delete_object' --- tests/test_filehashstore_interface.py | 54 ++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 316d1676..dda3b4c3 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -795,7 +795,7 @@ def test_retrieve_metadata_format_id_empty_spaces(store): def test_delete_objects(pids, store): - """Test delete_object successfully deletes objects.""" + """Test delete_object successfully deletes objects from /objects.""" test_dir = "tests/testdata/" entity = "objects" format_id = "http://ns.dataone.org/service/types/v2.0" @@ -810,6 +810,58 @@ def test_delete_objects(pids, store): assert store.count(entity) == 0 +def test_delete_objects_pid_refs_file(pids, store): + """Test delete_object deletes the pid refs file containing the cid.""" + test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) + _metadata_cid = store.store_metadata(pid, syspath, format_id) + store.delete_object(pid) + pid_refs_file_path = store.get_refs_abs_path("pid", pid) + assert not os.path.exists(pid_refs_file_path) + + +def test_delete_objects_cid_refs_file(pids, 
store): + """Test delete_object deletes the cid refs file containing the cid.""" + test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + object_metadata = store.store_object(pid, path) + cid = object_metadata.id + store.tag_object(pid, cid) + _metadata_cid = store.store_metadata(pid, syspath, format_id) + store.delete_object(pid) + cid_refs_file_path = store.get_refs_abs_path("cid", cid) + assert not os.path.exists(cid_refs_file_path) + + +def test_delete_objects_cid_refs_file_with_pid_refs_remaining(pids, store): + """Test delete_object does not delete the cid refs file that still contains ref.""" + test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + object_metadata = store.store_object(pid, path) + cid = object_metadata.id + store.tag_object(pid, cid) + cid_refs_abs_path = store.get_refs_abs_path("cid", cid) + store.update_cid_refs(cid_refs_abs_path, "dou.test.1") + _metadata_cid = store.store_metadata(pid, syspath, format_id) + store.delete_object(pid) + cid_refs_file_path = store.get_refs_abs_path("cid", cid) + assert os.path.exists(cid_refs_file_path) + + def test_delete_object_pid_empty(store): """Test delete_object raises error when empty pid supplied.""" pid = " " From e5b60aea718fae3f7e9dd938521e5fbc67b56be2 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 12 Nov 2023 14:03:05 -0800 Subject: [PATCH 035/420] Add pytests for 'find_object' method --- src/hashstore/filehashstore.py | 1 - tests/test_filehashstore_interface.py | 29 +++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 
795b1f28..0753572b 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -523,7 +523,6 @@ def find_object(self, pid): logging.debug( "FileHashStore - find_object: Request to find object for for pid: %s", pid ) - # TODO: Write tests for this method self._is_string_none_or_empty(pid, "pid", "find_object") pid_ref_abs_path = self.get_refs_abs_path("pid", pid) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index dda3b4c3..60718839 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -533,6 +533,35 @@ def test_store_object_sparse_large_file(store): assert object_metadata_id == object_metadata.hex_digests.get("sha256") +def test_find_object(pids, store): + """Test find object returns the correct content identifier (cid).""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) + cid = store.find_object(pid) + assert cid == object_metadata.hex_digests.get("sha256") + + +def test_find_object_pid_object_does_not_exist(store): + """Test find object throws exception when object doesn't exist.""" + with pytest.raises(FileNotFoundError): + store.find_object("dou.test.1") + + +def test_find_object_pid_none(store): + """Test find object throws exception when pid is None.""" + with pytest.raises(ValueError): + store.find_object(None) + + +def test_find_object_pid_empty(store): + """Test find object throws exception when pid is empty.""" + with pytest.raises(ValueError): + store.find_object("") + + def test_store_metadata(pids, store): """Test store metadata.""" test_dir = "tests/testdata/" From 7242a62e22595fa8deca603402711ca8fa8ee9fe Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 12 Nov 2023 14:09:33 -0800 Subject: [PATCH 036/420] Clean up 'filehashstore_interface' pytests --- tests/test_filehashstore_interface.py | 22 
---------------------- 1 file changed, 22 deletions(-) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 60718839..af957c8d 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -34,13 +34,9 @@ def test_store_object(pids, store): """Test store object.""" test_dir = "tests/testdata/" entity = "objects" - format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path = Path(test_dir + pid.replace("/", "_")) - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) assert object_metadata.id == pids[pid][store.algorithm] assert store.count(entity) == 3 @@ -49,13 +45,9 @@ def test_store_object_files_path(pids, store): """Test store object when given a path.""" test_dir = "tests/testdata/" entity = "objects" - format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path = Path(test_dir + pid.replace("/", "_")) - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) assert store.exists(entity, pids[pid][store.algorithm]) assert store.count(entity) == 3 @@ -64,13 +56,9 @@ def test_store_object_files_string(pids, store): """Test store object when given a string.""" test_dir = "tests/testdata/" entity = "objects" - format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path_string = test_dir + pid.replace("/", "_") - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path_string) - _metadata_cid = store.store_metadata(pid, syspath, format_id) assert store.exists(entity, pids[pid][store.algorithm]) assert store.count(entity) == 3 @@ -567,10 +555,8 @@ def test_store_metadata(pids, 
store): test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _object_metadata = store.store_object(pid, path) metadata_cid = store.store_metadata(pid, syspath, format_id) assert metadata_cid == pids[pid]["metadata_cid"] @@ -579,10 +565,8 @@ def test_store_metadata_default_format_id(pids, store): """Test store metadata returns expected id when storing with default format_id.""" test_dir = "tests/testdata/" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _object_metadata = store.store_object(pid, path) metadata_cid = store.store_metadata(pid, syspath) assert metadata_cid == pids[pid]["metadata_cid"] @@ -593,10 +577,8 @@ def test_store_metadata_files_path(pids, store): entity = "metadata" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _object_metadata = store.store_object(pid, path) metadata_cid = store.store_metadata(pid, syspath, format_id) assert store.exists(entity, metadata_cid) assert metadata_cid == pids[pid]["metadata_cid"] @@ -609,10 +591,8 @@ def test_store_metadata_files_string(pids, store): entity = "metadata" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): - path_string = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) - _object_metadata = store.store_object(pid, path_string) metadata_cid = store.store_metadata(pid, syspath_string, format_id) assert store.exists(entity, metadata_cid) assert store.count(entity) == 3 @@ -624,8 +604,6 @@ def test_store_metadata_files_input_stream(pids, store): entity = "metadata" format_id = 
"http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - _object_metadata = store.store_object(pid, path) filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) syspath_stream = io.open(syspath_string, "rb") From 180d9710c48614f9ae464b8bd213562d56da33a9 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 12 Nov 2023 14:59:44 -0800 Subject: [PATCH 037/420] Add new pytests for 'tag_object' method --- src/hashstore/filehashstore.py | 24 ++++++--- tests/test_filehashstore_interface.py | 75 +++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 8 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 0753572b..5f61a11a 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -478,7 +478,13 @@ def store_object( return object_metadata def tag_object(self, pid, cid): - # TODO: Write tests for this method + logging.debug( + "FileHashStore - tag_object: Tagging object cid: {%s} with pid: {%s}.", + cid, + pid, + ) + self._is_string_none_or_empty(pid, "pid", "tag_object") + self._is_string_none_or_empty(cid, "cid", "tag_object") # Wait for the cid to release if it's being tagged while cid in self.reference_locked_cids: logging.debug( @@ -494,19 +500,21 @@ def tag_object(self, pid, cid): ) self.reference_locked_cids.append(cid) try: - # Check to see if reference file already exists for the cid + # TODO: Review process and test what happens when specific pieces fail + # We cannot have a pid ref file whose pid is not referenced in the cid refs file cid_ref_abs_path = self.get_refs_abs_path("cid", cid) if os.path.exists(cid_ref_abs_path): # If it does, read the file and add the new pid on its own line self.update_cid_refs(cid_ref_abs_path, pid) else: - # If not, create the cid ref file in '.../refs/cid' and write the pid - self.create_path(os.path.dirname(cid_ref_abs_path)) - self.write_cid_refs_file(cid_ref_abs_path, 
pid) - # Then create the pid ref file in '.../refs/pid' with the cid as its content + # If not, create the pid ref file in '.../refs/pid' with the cid as its content pid_ref_abs_path = self.get_refs_abs_path("pid", pid) self.create_path(os.path.dirname(pid_ref_abs_path)) self.write_pid_refs_file(pid_ref_abs_path, cid) + # Then create the cid ref file in '.../refs/cid' and write the pid + self.create_path(os.path.dirname(cid_ref_abs_path)) + self.write_cid_refs_file(cid_ref_abs_path, pid) + return True finally: # Release cid with self.reference_lock: @@ -517,7 +525,6 @@ def tag_object(self, pid, cid): self.reference_locked_cids.remove(cid) info_msg = f"FileHashStore - tag_object: Successfully tagged cid: {cid} with pid: {pid}" logging.info(info_msg) - return def find_object(self, pid): logging.debug( @@ -843,7 +850,7 @@ def store_data_only(self, data): + f" Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) - raise IOError(exception_string) from err + raise err def _move_and_get_checksums( self, @@ -1521,6 +1528,7 @@ def _validate_object( tmp_file_size: Size of the tmp file file_size_to_validate: Expected size of the object """ + # TODO: Refactor this method and/or create a new method for Metacat client to call if file_size_to_validate is not None and file_size_to_validate > 0: if file_size_to_validate != tmp_file_size: self.delete(entity, tmp_file_name) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index af957c8d..40af07f5 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -521,6 +521,81 @@ def test_store_object_sparse_large_file(store): assert object_metadata_id == object_metadata.hex_digests.get("sha256") +def test_tag_object(pids, store): + """Test tag object returns boolean.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + object_tagged = 
store.tag_object(pid, object_metadata.id) + assert object_tagged + + +def test_tag_object_pid_refs_file(pids, store): + """Test tag object creates the pid reference file.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) + pid_refs_file_path = store.get_refs_abs_path("pid", pid) + assert os.path.exists(pid_refs_file_path) + + +def test_tag_object_pid_refs_file_content(pids, store): + """Test tag object creates the pid reference file contains the correct cid.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) + pid_refs_file_path = store.get_refs_abs_path("pid", pid) + with open(pid_refs_file_path, "r", encoding="utf8") as f: + pid_refs_cid = f.read() + assert pid_refs_cid == object_metadata.id + + +def test_tag_object_cid_refs_file(pids, store): + """Test tag object creates the cid reference file.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + cid = object_metadata.id + store.tag_object(pid, object_metadata.id) + cid_refs_file_path = store.get_refs_abs_path("cid", cid) + assert os.path.exists(cid_refs_file_path) + + +def test_tag_object_cid_refs_file_content(pids, store): + """Test tag object tags cid reference file successfully with pid.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) + cid_refs_file_path = store.get_refs_abs_path("cid", object_metadata.id) + with open(cid_refs_file_path, "r", encoding="utf8") as f: + pid_refs_cid = f.read().strip() + assert pid_refs_cid == pid + + +def 
test_tag_object_with_existing_cid_refs_file(pids, store): + """Test tag object raises exception when trying to add another cid to an + existing pid reference file and that a cid reference file is not created.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) + another_cid = "dou.test.1" + with pytest.raises(ValueError): + store.tag_object(pid, another_cid) + + second_cid_hash = store.get_refs_abs_path("cid", another_cid) + assert not os.path.exists(second_cid_hash) + + def test_find_object(pids, store): """Test find object returns the correct content identifier (cid).""" test_dir = "tests/testdata/" From 636eeffdcec49cfdfaa90f715b3de99d221d4358 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 13 Nov 2023 09:34:54 -0800 Subject: [PATCH 038/420] Rename '_mktmpfile' method to '_write_to_tmp_file_and_get_hex_digests' and update pytests --- src/hashstore/filehashstore.py | 10 ++++-- tests/test_filehashstore.py | 58 +++++++++++++++++++--------------- 2 files changed, 41 insertions(+), 27 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 5f61a11a..3d7c26d1 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -894,7 +894,11 @@ def _move_and_get_checksums( + f" file and calculating checksums for pid: {pid}" ) logging.debug(debug_msg) - hex_digests, tmp_file_name, tmp_file_size = self._mktmpfile( + ( + hex_digests, + tmp_file_name, + tmp_file_size, + ) = self._write_to_tmp_file_and_get_hex_digests( stream, additional_algorithm, checksum_algorithm ) logging.debug( @@ -981,7 +985,9 @@ def _move_and_get_checksums( return (object_cid, tmp_file_size, hex_digests) - def _mktmpfile(self, stream, additional_algorithm=None, checksum_algorithm=None): + def _write_to_tmp_file_and_get_hex_digests( + self, stream, additional_algorithm=None, checksum_algorithm=None + 
): """Create a named temporary file from a `Stream` object and return its filename and a dictionary of its algorithms and hex digests. If an additionak and/or checksum algorithm is provided, it will add the respective hex digest to the dictionary. diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 83cd1e62..6bfd6736 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -409,8 +409,8 @@ def test_move_and_get_checksums_file_size_raises_error(pids, store): input_stream.close() -def test_mktempfile_additional_algo(store): - """Test _mktempfile returns correct hex digests for additional algorithm.""" +def test_write_to_tmp_file_and_get_hex_digests_additional_algo(store): + """Test _write...hex_digests returns correct hex digests for additional algorithm.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -420,15 +420,15 @@ def test_mktempfile_additional_algo(store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) # pylint: disable=W0212 - hex_digests, _, _ = store._mktmpfile( + hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests( input_stream, additional_algorithm=checksum_algo ) input_stream.close() assert hex_digests.get("sha3_256") == checksum_correct -def test_mktempfile_checksum_algo(store): - """Test _mktempfile returns correct hex digests for checksum algorithm.""" +def test_write_to_tmp_file_and_get_hex_digests_checksum_algo(store): + """Test _write...hex_digests returns correct hex digests for checksum algorithm.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -438,13 +438,15 @@ def test_mktempfile_checksum_algo(store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) # pylint: disable=W0212 - hex_digests, _, _ = store._mktmpfile(input_stream, checksum_algorithm=checksum_algo) + hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests( + input_stream, checksum_algorithm=checksum_algo + ) 
input_stream.close() assert hex_digests.get("sha3_256") == checksum_correct -def test_mktempfile_checksum_and_additional_algo(store): - """Test _mktempfile returns correct hex digests for checksum algorithm.""" +def test_write_to_tmp_file_and_get_hex_digests_checksum_and_additional_algo(store): + """Test _write...hex_digests returns correct hex digests for checksum algorithm.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -458,7 +460,7 @@ def test_mktempfile_checksum_and_additional_algo(store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) # pylint: disable=W0212 - hex_digests, _, _ = store._mktmpfile( + hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests( input_stream, additional_algorithm=additional_algo, checksum_algorithm=checksum_algo, @@ -468,8 +470,10 @@ def test_mktempfile_checksum_and_additional_algo(store): assert hex_digests.get("sha224") == additional_algo_checksum -def test_mktempfile_checksum_and_additional_algo_duplicate(store): - """Test _mktempfile succeeds with duplicate algorithms (de-duplicates).""" +def test_write_to_tmp_file_and_get_hex_digests_checksum_and_additional_algo_duplicate( + store, +): + """Test _write...hex_digests succeeds with duplicate algorithms (de-duplicates).""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -478,7 +482,7 @@ def test_mktempfile_checksum_and_additional_algo_duplicate(store): checksum_algo = "sha224" checksum_correct = "9b3a96f434f3c894359193a63437ef86fbd5a1a1a6cc37f1d5013ac1" # pylint: disable=W0212 - hex_digests, _, _ = store._mktmpfile( + hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests( input_stream, additional_algorithm=additional_algo, checksum_algorithm=checksum_algo, @@ -487,26 +491,26 @@ def test_mktempfile_checksum_and_additional_algo_duplicate(store): assert hex_digests.get("sha224") == checksum_correct -def test_mktempfile_file_size(pids, store): - """Test _mktempfile returns correct file 
size.""" +def test_write_to_tmp_file_and_get_hex_digests_file_size(pids, store): + """Test _write...hex_digests returns correct file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") # pylint: disable=W0212 - _, _, tmp_file_size = store._mktmpfile(input_stream) + _, _, tmp_file_size = store._write_to_tmp_file_and_get_hex_digests(input_stream) input_stream.close() assert tmp_file_size == pids[pid]["file_size_bytes"] -def test_mktempfile_hex_digests(pids, store): - """Test _mktempfile returns correct hex digests.""" +def test_write_to_tmp_file_and_get_hex_digests_hex_digests(pids, store): + """Test _write...hex_digests returns correct hex digests.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") # pylint: disable=W0212 - hex_digests, _, _ = store._mktmpfile(input_stream) + hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests(input_stream) input_stream.close() assert hex_digests.get("md5") == pids[pid]["md5"] assert hex_digests.get("sha1") == pids[pid]["sha1"] @@ -515,20 +519,20 @@ def test_mktempfile_hex_digests(pids, store): assert hex_digests.get("sha512") == pids[pid]["sha512"] -def test_mktempfile_tmpfile_object(pids, store): - """Test _mktempfile creates file successfully.""" +def test_write_to_tmp_file_and_get_hex_digests_tmpfile_object(pids, store): + """Test _write...hex_digests creates file successfully.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") # pylint: disable=W0212 - _, tmp_file_name, _ = store._mktmpfile(input_stream) + _, tmp_file_name, _ = store._write_to_tmp_file_and_get_hex_digests(input_stream) input_stream.close() assert os.path.isfile(tmp_file_name) is True -def test_mktempfile_with_unsupported_algorithm(pids, store): - """Test _mktempfile raises error when bad algorithm 
supplied.""" +def test_write_to_tmp_file_and_get_hex_digests_with_unsupported_algorithm(pids, store): + """Test _write...hex_digests raises error when bad algorithm supplied.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -536,10 +540,14 @@ def test_mktempfile_with_unsupported_algorithm(pids, store): algo = "md2" with pytest.raises(ValueError): # pylint: disable=W0212 - _, _, _ = store._mktmpfile(input_stream, additional_algorithm=algo) + _, _, _ = store._write_to_tmp_file_and_get_hex_digests( + input_stream, additional_algorithm=algo + ) with pytest.raises(ValueError): # pylint: disable=W0212 - _, _, _ = store._mktmpfile(input_stream, checksum_algorithm=algo) + _, _, _ = store._write_to_tmp_file_and_get_hex_digests( + input_stream, checksum_algorithm=algo + ) input_stream.close() From ee790ebfe91f59f5814b0552c09abe15ed67e7c5 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 13 Nov 2023 09:43:31 -0800 Subject: [PATCH 039/420] Extract new method '_mktmpfile' from '_write_to_tmp_file_and_get_hex_digests' and add new pytest --- src/hashstore/filehashstore.py | 64 ++++++++++++++++++++++------------ tests/test_filehashstore.py | 9 +++++ 2 files changed, 50 insertions(+), 23 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 3d7c26d1..6109c4eb 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1011,26 +1011,11 @@ def _write_to_tmp_file_and_get_hex_digests( # Physically create directory if it doesn't exist if os.path.exists(tmp_root_path) is False: self.create_path(tmp_root_path) - tmp = NamedTemporaryFile(dir=tmp_root_path, delete=False) - - # Delete tmp file if python interpreter crashes or thread is interrupted - # when store_object is called - def delete_tmp_file(): - if os.path.exists(tmp.name): - os.remove(tmp.name) - - atexit.register(delete_tmp_file) - - # Ensure tmp file is created with desired permissions - if self.fmode is not None: 
- oldmask = os.umask(0) - try: - os.chmod(tmp.name, self.fmode) - finally: - os.umask(oldmask) + tmp = self._mktmpfile(tmp_root_path) logging.debug( - "FileHashStore - _mktempfile: tmp file created: %s, calculating hex digests.", + "FileHashStore - _write_to_tmp_file_and_get_hex_digests: tmp file created:" + + " %s, calculating hex digests.", tmp.name, ) @@ -1047,7 +1032,8 @@ def delete_tmp_file(): for hash_algorithm in hash_algorithms: hash_algorithm.update(self._to_bytes(data)) logging.debug( - "FileHashStore - _mktempfile: Object stream successfully written to tmp file: %s", + "FileHashStore - _write_to_tmp_file_and_get_hex_digests: Object stream" + + " successfully written to tmp file: %s", tmp.name, ) @@ -1059,19 +1045,23 @@ def delete_tmp_file(): # Ready for validation and atomic move tmp_file_completion_flag = True - logging.debug("FileHashStore - _mktempfile: Hex digests calculated.") + logging.debug( + "FileHashStore - _write_to_tmp_file_and_get_hex_digests: Hex digests calculated." + ) return hex_digest_dict, tmp.name, tmp_file_size # pylint: disable=W0718 except Exception as err: exception_string = ( - f"FileHashStore - _mktempfile: Unexpected {err=}, {type(err)=}" + "FileHashStore - _write_to_tmp_file_and_get_hex_digests:" + + f" Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) # pylint: disable=W0707,W0719 raise Exception(exception_string) except KeyboardInterrupt: exception_string = ( - "FileHashStore - _mktempfile: Keyboard interruption by user." + "FileHashStore - _write_to_tmp_file_and_get_hex_digests:" + + " Keyboard interruption by user." 
) logging.error(exception_string) if os.path.exists(tmp.name): @@ -1084,11 +1074,39 @@ def delete_tmp_file(): # pylint: disable=W0718 except Exception as err: exception_string = ( - f"FileHashStore - _mktempfile: Unexpected {err=} while attempting to" + "FileHashStore - _write_to_tmp_file_and_get_hex_digests:" + + f"Unexpected {err=} while attempting to" + f" delete tmp file: {tmp.name}, {type(err)=}" ) logging.error(exception_string) + def _mktmpfile(self, path): + """Create a temporary file at the given path ready to be written. + + Args: + path (string): Path to the file location + + Returns: + tmp (file object): object with file-like interface + """ + tmp = NamedTemporaryFile(dir=path, delete=False) + + # Delete tmp file if python interpreter crashes or thread is interrupted + def delete_tmp_file(): + if os.path.exists(tmp.name): + os.remove(tmp.name) + + atexit.register(delete_tmp_file) + + # Ensure tmp file is created with desired permissions + if self.fmode is not None: + oldmask = os.umask(0) + try: + os.chmod(tmp.name, self.fmode) + finally: + os.umask(oldmask) + return tmp + def write_cid_refs_file(self, cid_ref_abs_path, pid): """Write the reference file for the given content identifier (cid). A reference file contains every pid that references a cid each on its own line. 
diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 6bfd6736..1402a24d 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -551,6 +551,15 @@ def test_write_to_tmp_file_and_get_hex_digests_with_unsupported_algorithm(pids, input_stream.close() +def test_mktmpfile(store): + """Test that _mktmpfile creates and returns a tmp file.""" + path = store.root + "/doutest/tmp/" + store.create_path(path) + # pylint: disable=W0212 + tmp = store._mktmpfile(path) + assert os.path.exists(tmp.name) + + def test_write_cid_refs_file(pids, store): """Test that write_cid_reference writes a reference file.""" for pid in pids.keys(): From 524947c169e96f006421ff38f38f3ff770ae2b8f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 13 Nov 2023 09:44:44 -0800 Subject: [PATCH 040/420] Refactor '_mktmpmetadata' method --- src/hashstore/filehashstore.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 6109c4eb..14ecc46c 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1411,14 +1411,7 @@ def _mktmpmetadata(self, stream): if os.path.exists(tmp_root_path) is False: self.create_path(tmp_root_path) - tmp = NamedTemporaryFile(dir=tmp_root_path, delete=False) - # Ensure tmp file is created with desired permissions - if self.fmode is not None: - oldmask = os.umask(0) - try: - os.chmod(tmp.name, self.fmode) - finally: - os.umask(oldmask) + tmp = self._mktmpfile(tmp_root_path) # tmp is a file-like object that is already opened for writing by default logging.debug( From d77a10e52f0780cbe3b078d00925ed998aad19a5 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 13 Nov 2023 11:58:28 -0800 Subject: [PATCH 041/420] Refactor 'write_pid_refs_file' to throw exception immediately if refs file for given pid exists. 
--- src/hashstore/filehashstore.py | 18 +++++++----------- tests/test_filehashstore.py | 23 +++-------------------- tests/test_filehashstore_interface.py | 2 +- 3 files changed, 11 insertions(+), 32 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 14ecc46c..f6e65168 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -516,6 +516,7 @@ def tag_object(self, pid, cid): self.write_cid_refs_file(cid_ref_abs_path, pid) return True finally: + # TODO: Verify that the reference files have been written as expected. # Release cid with self.reference_lock: logging.debug( @@ -1277,17 +1278,12 @@ def write_pid_refs_file(self, pid_ref_abs_path, cid): logging.info(info_msg) if os.path.exists(pid_ref_abs_path): - with open(pid_ref_abs_path, "r", encoding="utf8") as f: - pid_refs_cid = f.read() - if pid_refs_cid == cid: - return - else: - exception_string = ( - "FileHashStore - write_pid_refs_file: pid reference file exists but" - + f" cid ({cid}) is different from cid stored ({pid_refs_cid})." 
- ) - logging.error(exception_string) - raise ValueError(exception_string) + exception_string = ( + "FileHashStore - write_pid_refs_file: pid ref file already exists for %s", + pid_ref_abs_path, + ) + logging.error(exception_string) + raise FileExistsError(exception_string) else: try: with open(pid_ref_abs_path, "w", encoding="utf8") as pid_ref_file: diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 1402a24d..7eeecaa4 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -717,32 +717,15 @@ def test_write_pid_refs_file_content(pids, store): def test_write_pid_refs_file_exists(pids, store): - """Test that write_pid_refs_file returns when ref already exists and the - cid given is the same.""" + """Test that write_pid_refs_file throws exception if ref file already exists.""" for pid in pids.keys(): cid = pids[pid]["sha256"] pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) store.write_pid_refs_file(pid_ref_abs_path, cid) # This should not write and return - store.write_pid_refs_file(pid_ref_abs_path, cid) - - with open(pid_ref_abs_path, "r", encoding="utf8") as f: - pid_refs_cid = f.read() - - assert cid == pid_refs_cid - - -def test_write_pid_refs_file_exists_different_cid(pids, store): - """Test that write_pid_refs_file returns when ref already exists and the - cid given is the same.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - pid_ref_abs_path = store.get_refs_abs_path("pid", pid) - store.create_path(os.path.dirname(pid_ref_abs_path)) - store.write_pid_refs_file(pid_ref_abs_path, cid) - with pytest.raises(ValueError): - store.write_pid_refs_file(pid_ref_abs_path, "abc123") + with pytest.raises(FileExistsError): + store.write_pid_refs_file(pid_ref_abs_path, cid) def test_delete_pid_refs_file(pids, store): diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 40af07f5..94097d3a 100644 --- 
a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -589,7 +589,7 @@ def test_tag_object_with_existing_cid_refs_file(pids, store): object_metadata = store.store_object(pid, path) store.tag_object(pid, object_metadata.id) another_cid = "dou.test.1" - with pytest.raises(ValueError): + with pytest.raises(FileExistsError): store.tag_object(pid, another_cid) second_cid_hash = store.get_refs_abs_path("cid", another_cid) From 562cf79195d0411f5e6ae37eca1642be1c79eb77 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 13 Nov 2023 13:08:34 -0800 Subject: [PATCH 042/420] Refactor 'tag_object' process and related methods and fix bug in 'update_cid_refs()' --- src/hashstore/filehashstore.py | 73 +++++++++++++++++----------------- 1 file changed, 36 insertions(+), 37 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index f6e65168..d280e594 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -502,13 +502,20 @@ def tag_object(self, pid, cid): try: # TODO: Review process and test what happens when specific pieces fail # We cannot have a pid ref file whose pid is not referenced in the cid refs file + pid_ref_abs_path = self.get_refs_abs_path("pid", pid) cid_ref_abs_path = self.get_refs_abs_path("cid", cid) - if os.path.exists(cid_ref_abs_path): + if os.path.exists(pid_ref_abs_path): + exception_string = ( + "FileHashStore - write_pid_refs_file: pid ref file already exists for %s", + pid_ref_abs_path, + ) + logging.error(exception_string) + raise FileExistsError(exception_string) + elif os.path.exists(cid_ref_abs_path): # If it does, read the file and add the new pid on its own line self.update_cid_refs(cid_ref_abs_path, pid) else: # If not, create the pid ref file in '.../refs/pid' with the cid as its content - pid_ref_abs_path = self.get_refs_abs_path("pid", pid) self.create_path(os.path.dirname(pid_ref_abs_path)) self.write_pid_refs_file(pid_ref_abs_path, cid) # Then create 
the cid ref file in '.../refs/cid' and write the pid @@ -1153,19 +1160,19 @@ def update_cid_refs(self, cid_ref_abs_path, pid): logging.info(info_msg) try: + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + value = line.strip() + if pid == value: + err_msg = ( + f"FileHashStore - update_cid_refs: pid ({pid}) already reference in" + + f" cid reference file: {cid_ref_abs_path} " + ) + raise ValueError(err_msg) + with open(cid_ref_abs_path, "a+", encoding="utf8") as cid_ref_file: fcntl.flock(cid_ref_file, fcntl.LOCK_EX) - # Read the ref file to see if the pid is already referencing the cid - cid_ref_file_content = cid_ref_file.read() - - if pid in cid_ref_file_content: - err_msg = ( - f"FileHashStore - update_cid_refs: pid ({pid}) already reference in" - + f" cid reference file: {cid_ref_abs_path} " - ) - raise ValueError(err_msg) - else: - cid_ref_file.write(pid + "\n") + cid_ref_file.write(pid + "\n") # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) @@ -1262,45 +1269,37 @@ def delete_cid_refs_file(self, cid_ref_abs_path): logging.error(exception_string) raise err - def write_pid_refs_file(self, pid_ref_abs_path, cid): + def write_pid_refs_file(self, path, cid): """Write the reference file for the given pid (persistent identifier). A reference file for a pid contains the cid that it references. Its permanent address is the pid hash with HashStore's default store algorithm and follows its directory structure. 
Args: - pid_ref_abs_path (string): Absolute path to the pid ref file + path (string): Path to file to be written into cid (string): Content identifier """ info_msg = ( f"FileHashStore - write_pid_refs_file: Writing cid ({cid}) into pid reference" - + f" file: {pid_ref_abs_path}" + + f" file: {path}" ) logging.info(info_msg) - if os.path.exists(pid_ref_abs_path): + try: + with open(path, "w", encoding="utf8") as pid_ref_file: + fcntl.flock(pid_ref_file, fcntl.LOCK_EX) + pid_ref_file.write(cid) + # The context manager will take care of releasing the lock + # But the code to explicitly release the lock if desired is below + # fcntl.flock(f, fcntl.LOCK_UN) + return + + except Exception as err: exception_string = ( - "FileHashStore - write_pid_refs_file: pid ref file already exists for %s", - pid_ref_abs_path, + "FileHashStore - write_pid_refs_file: failed to write pid reference file:" + + f" {path} for cid: {cid}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) - raise FileExistsError(exception_string) - else: - try: - with open(pid_ref_abs_path, "w", encoding="utf8") as pid_ref_file: - fcntl.flock(pid_ref_file, fcntl.LOCK_EX) - pid_ref_file.write(cid) - # The context manager will take care of releasing the lock - # But the code to explicitly release the lock if desired is below - # fcntl.flock(f, fcntl.LOCK_UN) - return - - except Exception as err: - exception_string = ( - "FileHashStore - write_pid_refs_file: failed to write pid reference file:" - + f" {pid_ref_abs_path} for cid: {cid}. Unexpected {err=}, {type(err)=}" - ) - logging.error(exception_string) - raise err + raise err def delete_pid_refs_file(self, pid_ref_abs_path): """Delete a pid reference file. 
From 59df239e9144b10455b5660b158c898787aa2fa2 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 13 Nov 2023 13:18:35 -0800 Subject: [PATCH 043/420] Revise pytests, and extract pytests for references related processes into its own test module 'test_filehashstore_references' --- tests/test_filehashstore.py | 188 ------------------------ tests/test_filehashstore_interface.py | 18 ++- tests/test_filehashstore_references.py | 190 +++++++++++++++++++++++++ 3 files changed, 207 insertions(+), 189 deletions(-) create mode 100644 tests/test_filehashstore_references.py diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 7eeecaa4..1b0116fa 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -560,194 +560,6 @@ def test_mktmpfile(store): assert os.path.exists(tmp.name) -def test_write_cid_refs_file(pids, store): - """Test that write_cid_reference writes a reference file.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) - assert os.path.exists(cid_ref_abs_path) - - -def test_write_cid_refs_file_content(pids, store): - """Test that write_cid_ref_file writes the expected content.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) - - with open(cid_ref_abs_path, "r", encoding="utf8") as f: - cid_ref_file_pid = f.read() - - assert pid == cid_ref_file_pid.replace("\n", "") - - -def test_update_cid_refs_content(pids, store): - """Test that update_cid_ref updates the ref file as expected.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, 
pid) - - pid_other = "dou.test.1" - store.update_cid_refs(cid_ref_abs_path, pid_other) - - with open(cid_ref_abs_path, "r", encoding="utf8") as f: - for _, line in enumerate(f, start=1): - value = line.strip() - assert value == pid or value == pid_other - - -def test_update_cid_refs_content_multiple(pids, store): - """Test that update_cid_refs adds multiple references successfully.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) - - cid_reference_list = [pid] - for i in range(0, 5): - store.update_cid_refs(cid_ref_abs_path, f"dou.test.{i}") - cid_reference_list.append(f"dou.test.{i}") - - line_count = 0 - with open(cid_ref_abs_path, "r", encoding="utf8") as f: - for _, line in enumerate(f, start=1): - line_count += 1 - value = line.strip() - assert value in cid_reference_list - - assert line_count == 6 - - -def test_delete_cid_refs_pid(pids, store): - """Test that delete_cid_refs_pid deletes the given pid from the ref file.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) - - pid_other = "dou.test.1" - store.update_cid_refs(cid_ref_abs_path, pid_other) - store.delete_cid_refs_pid(cid_ref_abs_path, pid) - - with open(cid_ref_abs_path, "r", encoding="utf8") as f: - for _, line in enumerate(f, start=1): - value = line.strip() - print(value) - assert value == pid_other - - -def test_delete_cid_refs_pid_pid_not_found(pids, store): - """Test that delete_cid_refs_pid raises exception when pid not found.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) - - pid_other = 
"dou.test.1" - store.update_cid_refs(cid_ref_abs_path, pid_other) - with pytest.raises(ValueError): - store.delete_cid_refs_pid(cid_ref_abs_path, "dou.not.found.1") - - -def test_delete_cid_refs_pid_file(pids, store): - """Test that delete_cid_refs_file deletes a reference file.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) - store.delete_cid_refs_pid(cid_ref_abs_path, pid) - cid_refs_deleted = store.delete_cid_refs_file(cid_ref_abs_path) - - assert cid_refs_deleted - assert not os.path.exists(cid_ref_abs_path) - - -def test_delete_cid_refs_pid_file_not_empty(pids, store): - """Test that delete_cid_refs_file does not raise an exception when refs file - is not empty.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) - cid_refs_deleted = store.delete_cid_refs_file(cid_ref_abs_path) - assert not cid_refs_deleted - - -def test_delete_cid_refs_pid_file_not_found(pids, store): - """Test that delete_cid_refs_file raises an exception when refs file not found.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - with pytest.raises(FileNotFoundError): - store.delete_cid_refs_file(cid_ref_abs_path) - - -def test_write_pid_refs_file(pids, store): - """Test that write_pid_refs_file writes a reference file.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - pid_ref_abs_path = store.get_refs_abs_path("pid", pid) - store.create_path(os.path.dirname(pid_ref_abs_path)) - store.write_pid_refs_file(pid_ref_abs_path, cid) - assert os.path.exists(pid_ref_abs_path) - - -def test_write_pid_refs_file_content(pids, store): - """Test that write_pid_refs_file writes the expected 
content.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - pid_ref_abs_path = store.get_refs_abs_path("pid", pid) - store.create_path(os.path.dirname(pid_ref_abs_path)) - store.write_pid_refs_file(pid_ref_abs_path, cid) - - with open(pid_ref_abs_path, "r", encoding="utf8") as f: - pid_refs_cid = f.read() - - assert cid == pid_refs_cid - - -def test_write_pid_refs_file_exists(pids, store): - """Test that write_pid_refs_file throws exception if ref file already exists.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - pid_ref_abs_path = store.get_refs_abs_path("pid", pid) - store.create_path(os.path.dirname(pid_ref_abs_path)) - store.write_pid_refs_file(pid_ref_abs_path, cid) - # This should not write and return - with pytest.raises(FileExistsError): - store.write_pid_refs_file(pid_ref_abs_path, cid) - - -def test_delete_pid_refs_file(pids, store): - """Test that delete_pid_refs_file deletes a reference file.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - pid_ref_abs_path = store.get_refs_abs_path("pid", pid) - store.create_path(os.path.dirname(pid_ref_abs_path)) - store.write_pid_refs_file(pid_ref_abs_path, cid) - store.delete_pid_refs_file(pid_ref_abs_path) - - assert not os.path.exists(pid_ref_abs_path) - - -def test_delete_pid_refs_file_file_not_found(pids, store): - """Test that delete_pid_refs_file raises an exception when refs file not found.""" - for pid in pids.keys(): - pid_ref_abs_path = store.get_refs_abs_path("pid", pid) - with pytest.raises(FileNotFoundError): - store.delete_cid_refs_file(pid_ref_abs_path) - - def test_put_metadata_with_path(pids, store): """Test put_metadata with path object.""" entity = "metadata" diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 94097d3a..e7ead830 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -542,6 +542,22 @@ def test_tag_object_pid_refs_file(pids, store): assert 
os.path.exists(pid_refs_file_path) +def test_tag_object_pid_refs_file_exists(pids, store): + """Test tag object throws exception when pid refs file already exists.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + cid = object_metadata.id + store.tag_object(pid, cid) + pid_refs_file_path = store.get_refs_abs_path("pid", pid) + assert os.path.exists(pid_refs_file_path) + cid_refs_file_path = store.get_refs_abs_path("cid", cid) + assert os.path.exists(cid_refs_file_path) + with pytest.raises(FileExistsError): + store.tag_object(pid, cid) + + def test_tag_object_pid_refs_file_content(pids, store): """Test tag object creates the pid reference file contains the correct cid.""" test_dir = "tests/testdata/" @@ -580,7 +596,7 @@ def test_tag_object_cid_refs_file_content(pids, store): assert pid_refs_cid == pid -def test_tag_object_with_existing_cid_refs_file(pids, store): +def test_tag_object_cid_refs_file_exists(pids, store): """Test tag object raises exception when trying to add another cid to an existing pid reference file and that a cid reference file is not created.""" test_dir = "tests/testdata/" diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py new file mode 100644 index 00000000..63fc54da --- /dev/null +++ b/tests/test_filehashstore_references.py @@ -0,0 +1,190 @@ +"""Test module for FileHashStore core, utility and supporting methods""" +import os +import pytest + + +def test_write_cid_refs_file(pids, store): + """Test that write_cid_reference writes a reference file.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_refs_file(cid_ref_abs_path, pid) + assert os.path.exists(cid_ref_abs_path) + + +def test_write_cid_refs_file_content(pids, store): + """Test that write_cid_ref_file writes 
the expected content.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_refs_file(cid_ref_abs_path, pid) + + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + cid_ref_file_pid = f.read() + + assert pid == cid_ref_file_pid.strip() + + +def test_update_cid_refs_content(pids, store): + """Test that update_cid_ref updates the ref file as expected.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_refs_file(cid_ref_abs_path, pid) + + pid_other = "dou.test.1" + store.update_cid_refs(cid_ref_abs_path, pid_other) + + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + value = line.strip() + assert value == pid or value == pid_other + + +def test_update_cid_refs_content_multiple(pids, store): + """Test that update_cid_refs adds multiple references successfully.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_refs_file(cid_ref_abs_path, pid) + + cid_reference_list = [pid] + for i in range(0, 5): + store.update_cid_refs(cid_ref_abs_path, f"dou.test.{i}") + cid_reference_list.append(f"dou.test.{i}") + + line_count = 0 + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + line_count += 1 + value = line.strip() + assert value in cid_reference_list + + assert line_count == 6 + + +def test_update_cid_refs_content_pid_exists(pids, store): + """Test that update_cid_ref does not write pid if pid already exists""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + 
store.write_cid_refs_file(cid_ref_abs_path, pid) + with pytest.raises(ValueError): + store.update_cid_refs(cid_ref_abs_path, pid) + + +def test_delete_cid_refs_pid(pids, store): + """Test that delete_cid_refs_pid deletes the given pid from the ref file.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_refs_file(cid_ref_abs_path, pid) + + pid_other = "dou.test.1" + store.update_cid_refs(cid_ref_abs_path, pid_other) + store.delete_cid_refs_pid(cid_ref_abs_path, pid) + + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + value = line.strip() + print(value) + assert value == pid_other + + +def test_delete_cid_refs_pid_pid_not_found(pids, store): + """Test that delete_cid_refs_pid raises exception when pid not found.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_refs_file(cid_ref_abs_path, pid) + + pid_other = "dou.test.1" + store.update_cid_refs(cid_ref_abs_path, pid_other) + with pytest.raises(ValueError): + store.delete_cid_refs_pid(cid_ref_abs_path, "dou.not.found.1") + + +def test_delete_cid_refs_pid_file(pids, store): + """Test that delete_cid_refs_file deletes a reference file.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_refs_file(cid_ref_abs_path, pid) + store.delete_cid_refs_pid(cid_ref_abs_path, pid) + cid_refs_deleted = store.delete_cid_refs_file(cid_ref_abs_path) + + assert cid_refs_deleted + assert not os.path.exists(cid_ref_abs_path) + + +def test_delete_cid_refs_pid_file_not_empty(pids, store): + """Test that delete_cid_refs_file does not raise an exception when refs file + is not empty.""" + 
for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_refs_file(cid_ref_abs_path, pid) + cid_refs_deleted = store.delete_cid_refs_file(cid_ref_abs_path) + assert not cid_refs_deleted + + +def test_delete_cid_refs_pid_file_not_found(pids, store): + """Test that delete_cid_refs_file raises an exception when refs file not found.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + with pytest.raises(FileNotFoundError): + store.delete_cid_refs_file(cid_ref_abs_path) + + +def test_write_pid_refs_file(pids, store): + """Test that write_pid_refs_file writes a reference file.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store.write_pid_refs_file(pid_ref_abs_path, cid) + assert os.path.exists(pid_ref_abs_path) + + +def test_write_pid_refs_file_content(pids, store): + """Test that write_pid_refs_file writes the expected content.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store.write_pid_refs_file(pid_ref_abs_path, cid) + + with open(pid_ref_abs_path, "r", encoding="utf8") as f: + pid_refs_cid = f.read() + + assert cid == pid_refs_cid + + +def test_delete_pid_refs_file(pids, store): + """Test that delete_pid_refs_file deletes a reference file.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store.write_pid_refs_file(pid_ref_abs_path, cid) + store.delete_pid_refs_file(pid_ref_abs_path) + + assert not os.path.exists(pid_ref_abs_path) + + +def test_delete_pid_refs_file_file_not_found(pids, store): + """Test that 
delete_pid_refs_file raises an exception when refs file not found.""" + for pid in pids.keys(): + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) + with pytest.raises(FileNotFoundError): + store.delete_cid_refs_file(pid_ref_abs_path) From ebfe61098533b5383c08639c7ff049f80252aea3 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 13 Nov 2023 14:10:26 -0800 Subject: [PATCH 044/420] Refactor 'tag_object' process to be atomic and clean up code --- src/hashstore/filehashstore.py | 87 +++++++++++++++----------- tests/test_filehashstore.py | 17 ++--- tests/test_filehashstore_interface.py | 3 +- tests/test_filehashstore_references.py | 52 +++++++-------- 4 files changed, 89 insertions(+), 70 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index d280e594..e8e265c9 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -515,12 +515,29 @@ def tag_object(self, pid, cid): # If it does, read the file and add the new pid on its own line self.update_cid_refs(cid_ref_abs_path, pid) else: - # If not, create the pid ref file in '.../refs/pid' with the cid as its content + # All ref files begin as tmp files and get moved sequentially at once + # Ensure refs tmp folder exists + tmp_root_path = self.get_store_path("refs") / "tmp" + # Physically create directory if it doesn't exist + if os.path.exists(tmp_root_path) is False: + self.create_path(tmp_root_path) + + # Then write pid_refs_file content into tmp file + pid_tmp_file = self._mktmpfile(tmp_root_path) + pid_tmp_file_path = pid_tmp_file.name + self._write_pid_refs_file(pid_tmp_file_path, cid) + # Then write cid_refs_file content into tmp file + cid_tmp_file = self._mktmpfile(tmp_root_path) + cid_tmp_file_path = cid_tmp_file.name + self._write_cid_refs_file(cid_tmp_file_path, pid) + + # Create path for pid ref file in '.../refs/pid' self.create_path(os.path.dirname(pid_ref_abs_path)) - self.write_pid_refs_file(pid_ref_abs_path, cid) - # Then create the cid 
ref file in '.../refs/cid' and write the pid + # Create path for cid ref file in '.../refs/cid' self.create_path(os.path.dirname(cid_ref_abs_path)) - self.write_cid_refs_file(cid_ref_abs_path, pid) + # Move both files + shutil.move(pid_tmp_file_path, pid_ref_abs_path) + shutil.move(cid_tmp_file_path, cid_ref_abs_path) return True finally: # TODO: Verify that the reference files have been written as expected. @@ -676,13 +693,13 @@ def delete_object(self, pid): try: # Remove pid from cid reference file cid_ref_abs_path = self.get_refs_abs_path("cid", cid) - self.delete_cid_refs_pid(cid_ref_abs_path, pid) + self._delete_cid_refs_pid(cid_ref_abs_path, pid) # Delete cid reference file # If the file is not empty, it will not be deleted. - cid_refs_deleted = self.delete_cid_refs_file(cid_ref_abs_path) + cid_refs_deleted = self._delete_cid_refs_file(cid_ref_abs_path) # Delete pid reference file pid_ref_abs_path = self.get_refs_abs_path("pid", pid) - self.delete_pid_refs_file(pid_ref_abs_path) + self._delete_pid_refs_file(pid_ref_abs_path) # Finally, delete the object if cid_refs_deleted: entity = "objects" @@ -1115,22 +1132,20 @@ def delete_tmp_file(): os.umask(oldmask) return tmp - def write_cid_refs_file(self, cid_ref_abs_path, pid): - """Write the reference file for the given content identifier (cid). A reference - file contains every pid that references a cid each on its own line. + def _write_cid_refs_file(self, path, pid): + """Write the reference file in the supplied path for the given content + identifier (cid). A reference file contains every pid that references a + cid each on its own line. 
Args: - cid_ref_abs_path (string): Absolute path to the cid ref file + path (string): Path of file to be written into pid (string): Authority-based or persistent identifier of object """ - info_msg = ( - f"FileHashStore - write_cid_refs_file: Writing pid ({pid}) into cid reference" - + f" file: {cid_ref_abs_path}" - ) + info_msg = f"FileHashStore - write_cid_refs_file: Writing pid ({pid}) into file: {path}" logging.info(info_msg) try: - with open(cid_ref_abs_path, "w", encoding="utf8") as cid_ref_file: + with open(path, "w", encoding="utf8") as cid_ref_file: fcntl.flock(cid_ref_file, fcntl.LOCK_EX) cid_ref_file.write(pid + "\n") # The context manager will take care of releasing the lock @@ -1140,13 +1155,13 @@ def write_cid_refs_file(self, cid_ref_abs_path, pid): except Exception as err: exception_string = ( - "FileHashStore - write_cid_refs_file: failed to write reference for cid:" - + f" {cid_ref_abs_path} for pid: {pid}. Unexpected {err=}, {type(err)=}" + f"FileHashStore - write_cid_refs_file: failed to write pid ({pid})" + + f" into path: {path}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) raise err - def update_cid_refs(self, cid_ref_abs_path, pid): + def _update_cid_refs(self, cid_ref_abs_path, pid): """Update an existing cid reference file with the given pid. Args: @@ -1186,7 +1201,7 @@ def update_cid_refs(self, cid_ref_abs_path, pid): logging.error(exception_string) raise err - def delete_cid_refs_pid(self, cid_ref_abs_path, pid): + def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): """Delete a pid from a cid reference file. Args: @@ -1228,7 +1243,7 @@ def delete_cid_refs_pid(self, cid_ref_abs_path, pid): logging.error(exception_string) raise err - def delete_cid_refs_file(self, cid_ref_abs_path): + def _delete_cid_refs_file(self, cid_ref_abs_path): """Delete a cid reference file. There must be no references remaining. 
Args: @@ -1269,19 +1284,17 @@ def delete_cid_refs_file(self, cid_ref_abs_path): logging.error(exception_string) raise err - def write_pid_refs_file(self, path, cid): - """Write the reference file for the given pid (persistent identifier). A reference - file for a pid contains the cid that it references. Its permanent address is the pid - hash with HashStore's default store algorithm and follows its directory structure. + def _write_pid_refs_file(self, path, cid): + """Write the reference file in the supplied path for the given pid (persistent + identifier). A reference file for a pid contains the cid that it references. + Its permanent address is the pid hash using HashStore's default store algorithm + and follows its directory structure. Args: - path (string): Path to file to be written into + path (string): Path of file to be written into cid (string): Content identifier """ - info_msg = ( - f"FileHashStore - write_pid_refs_file: Writing cid ({cid}) into pid reference" - + f" file: {path}" - ) + info_msg = f"FileHashStore - write_pid_refs_file: Writing cid ({cid}) into file: {path}" logging.info(info_msg) try: @@ -1295,13 +1308,13 @@ def write_pid_refs_file(self, path, cid): except Exception as err: exception_string = ( - "FileHashStore - write_pid_refs_file: failed to write pid reference file:" - + f" {path} for cid: {cid}. Unexpected {err=}, {type(err)=}" + f"FileHashStore - write_pid_refs_file: failed to write cid ({cid})" + + f" into path: {path}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) raise err - def delete_pid_refs_file(self, pid_ref_abs_path): + def _delete_pid_refs_file(self, pid_ref_abs_path): """Delete a pid reference file. 
Args: @@ -1498,7 +1511,7 @@ def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): self.clean_algorithm(checksum_algorithm) if checksum_algorithm in self.other_algo_list: debug_additional_other_algo_str = ( - f"FileHashStore - _mktempfile: checksum algorithm: {checksum_algorithm}" + f"FileHashStore - _refine_algorithm_list: checksum algo: {checksum_algorithm}" + " found in other_algo_lists, adding to list of algorithms to calculate." ) logging.debug(debug_additional_other_algo_str) @@ -1507,7 +1520,7 @@ def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): self.clean_algorithm(additional_algorithm) if additional_algorithm in self.other_algo_list: debug_additional_other_algo_str = ( - f"FileHashStore - _mktempfile: additional algorithm: {additional_algorithm}" + f"FileHashStore - _refine_algorithm_list: addit algo: {additional_algorithm}" + " found in other_algo_lists, adding to list of algorithms to calculate." ) logging.debug(debug_additional_other_algo_str) @@ -1764,9 +1777,9 @@ def delete(self, entity, file): except OSError: pass else: - self.remove_empty(os.path.dirname(realpath)) + self._remove_empty(os.path.dirname(realpath)) - def remove_empty(self, subpath): + def _remove_empty(self, subpath): """Successively remove all empty folders starting with `subpath` and proceeding "up" through directory tree until reaching the `root` folder. 
diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 1b0116fa..801257b4 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -738,9 +738,10 @@ def test_remove_empty_removes_empty_folders_string(store): assert os.path.exists(os.path.join(store.root, three_dirs)) assert os.path.exists(os.path.join(store.root, two_dirs)) assert os.path.exists(os.path.join(store.root, one_dir)) - store.remove_empty(os.path.join(store.root, three_dirs)) - store.remove_empty(os.path.join(store.root, two_dirs)) - store.remove_empty(os.path.join(store.root, one_dir)) + # pylint: disable=W0212 + store._remove_empty(os.path.join(store.root, three_dirs)) + store._remove_empty(os.path.join(store.root, two_dirs)) + store._remove_empty(os.path.join(store.root, one_dir)) assert not os.path.exists(os.path.join(store.root, three_dirs)) assert not os.path.exists(os.path.join(store.root, two_dirs)) assert not os.path.exists(os.path.join(store.root, one_dir)) @@ -757,9 +758,10 @@ def test_remove_empty_removes_empty_folders_path(store): assert (store.root / three_dirs).exists() assert (store.root / two_dirs).exists() assert (store.root / one_dir).exists() - store.remove_empty(store.root / three_dirs) - store.remove_empty(store.root / two_dirs) - store.remove_empty(store.root / one_dir) + # pylint: disable=W0212 + store._remove_empty(store.root / three_dirs) + store._remove_empty(store.root / two_dirs) + store._remove_empty(store.root / one_dir) assert not (store.root / three_dirs).exists() assert not (store.root / two_dirs).exists() assert not (store.root / one_dir).exists() @@ -776,7 +778,8 @@ def test_remove_empty_does_not_remove_nonempty_folders(pids, store): # Get parent directory of the relative path parent_dir = os.path.dirname(object_metadata_shard_path) # Attempt to remove the parent directory - store.remove_empty(parent_dir) + # pylint: disable=W0212 + store._remove_empty(parent_dir) abs_parent_dir = store.objects + "/" + parent_dir assert 
os.path.exists(abs_parent_dir) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index e7ead830..ac806545 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -953,7 +953,8 @@ def test_delete_objects_cid_refs_file_with_pid_refs_remaining(pids, store): cid = object_metadata.id store.tag_object(pid, cid) cid_refs_abs_path = store.get_refs_abs_path("cid", cid) - store.update_cid_refs(cid_refs_abs_path, "dou.test.1") + # pylint: disable=W0212 + store._update_cid_refs(cid_refs_abs_path, "dou.test.1") _metadata_cid = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) cid_refs_file_path = store.get_refs_abs_path("cid", cid) diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 63fc54da..4b86055c 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -2,6 +2,8 @@ import os import pytest +# pylint: disable=W0212 + def test_write_cid_refs_file(pids, store): """Test that write_cid_reference writes a reference file.""" @@ -9,7 +11,7 @@ def test_write_cid_refs_file(pids, store): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) + store._write_cid_refs_file(cid_ref_abs_path, pid) assert os.path.exists(cid_ref_abs_path) @@ -19,7 +21,7 @@ def test_write_cid_refs_file_content(pids, store): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) + store._write_cid_refs_file(cid_ref_abs_path, pid) with open(cid_ref_abs_path, "r", encoding="utf8") as f: cid_ref_file_pid = f.read() @@ -33,10 +35,10 @@ def test_update_cid_refs_content(pids, store): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) 
store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) + store._write_cid_refs_file(cid_ref_abs_path, pid) pid_other = "dou.test.1" - store.update_cid_refs(cid_ref_abs_path, pid_other) + store._update_cid_refs(cid_ref_abs_path, pid_other) with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): @@ -50,11 +52,11 @@ def test_update_cid_refs_content_multiple(pids, store): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) + store._write_cid_refs_file(cid_ref_abs_path, pid) cid_reference_list = [pid] for i in range(0, 5): - store.update_cid_refs(cid_ref_abs_path, f"dou.test.{i}") + store._update_cid_refs(cid_ref_abs_path, f"dou.test.{i}") cid_reference_list.append(f"dou.test.{i}") line_count = 0 @@ -73,9 +75,9 @@ def test_update_cid_refs_content_pid_exists(pids, store): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) + store._write_cid_refs_file(cid_ref_abs_path, pid) with pytest.raises(ValueError): - store.update_cid_refs(cid_ref_abs_path, pid) + store._update_cid_refs(cid_ref_abs_path, pid) def test_delete_cid_refs_pid(pids, store): @@ -84,11 +86,11 @@ def test_delete_cid_refs_pid(pids, store): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) + store._write_cid_refs_file(cid_ref_abs_path, pid) pid_other = "dou.test.1" - store.update_cid_refs(cid_ref_abs_path, pid_other) - store.delete_cid_refs_pid(cid_ref_abs_path, pid) + store._update_cid_refs(cid_ref_abs_path, pid_other) + store._delete_cid_refs_pid(cid_ref_abs_path, pid) with open(cid_ref_abs_path, "r", encoding="utf8") as f: for 
_, line in enumerate(f, start=1): @@ -103,12 +105,12 @@ def test_delete_cid_refs_pid_pid_not_found(pids, store): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) + store._write_cid_refs_file(cid_ref_abs_path, pid) pid_other = "dou.test.1" - store.update_cid_refs(cid_ref_abs_path, pid_other) + store._update_cid_refs(cid_ref_abs_path, pid_other) with pytest.raises(ValueError): - store.delete_cid_refs_pid(cid_ref_abs_path, "dou.not.found.1") + store._delete_cid_refs_pid(cid_ref_abs_path, "dou.not.found.1") def test_delete_cid_refs_pid_file(pids, store): @@ -117,9 +119,9 @@ def test_delete_cid_refs_pid_file(pids, store): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) - store.delete_cid_refs_pid(cid_ref_abs_path, pid) - cid_refs_deleted = store.delete_cid_refs_file(cid_ref_abs_path) + store._write_cid_refs_file(cid_ref_abs_path, pid) + store._delete_cid_refs_pid(cid_ref_abs_path, pid) + cid_refs_deleted = store._delete_cid_refs_file(cid_ref_abs_path) assert cid_refs_deleted assert not os.path.exists(cid_ref_abs_path) @@ -132,8 +134,8 @@ def test_delete_cid_refs_pid_file_not_empty(pids, store): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) - cid_refs_deleted = store.delete_cid_refs_file(cid_ref_abs_path) + store._write_cid_refs_file(cid_ref_abs_path, pid) + cid_refs_deleted = store._delete_cid_refs_file(cid_ref_abs_path) assert not cid_refs_deleted @@ -143,7 +145,7 @@ def test_delete_cid_refs_pid_file_not_found(pids, store): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) with pytest.raises(FileNotFoundError): - 
store.delete_cid_refs_file(cid_ref_abs_path) + store._delete_cid_refs_file(cid_ref_abs_path) def test_write_pid_refs_file(pids, store): @@ -152,7 +154,7 @@ def test_write_pid_refs_file(pids, store): cid = pids[pid]["sha256"] pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) - store.write_pid_refs_file(pid_ref_abs_path, cid) + store._write_pid_refs_file(pid_ref_abs_path, cid) assert os.path.exists(pid_ref_abs_path) @@ -162,7 +164,7 @@ def test_write_pid_refs_file_content(pids, store): cid = pids[pid]["sha256"] pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) - store.write_pid_refs_file(pid_ref_abs_path, cid) + store._write_pid_refs_file(pid_ref_abs_path, cid) with open(pid_ref_abs_path, "r", encoding="utf8") as f: pid_refs_cid = f.read() @@ -176,8 +178,8 @@ def test_delete_pid_refs_file(pids, store): cid = pids[pid]["sha256"] pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) - store.write_pid_refs_file(pid_ref_abs_path, cid) - store.delete_pid_refs_file(pid_ref_abs_path) + store._write_pid_refs_file(pid_ref_abs_path, cid) + store._delete_pid_refs_file(pid_ref_abs_path) assert not os.path.exists(pid_ref_abs_path) @@ -187,4 +189,4 @@ def test_delete_pid_refs_file_file_not_found(pids, store): for pid in pids.keys(): pid_ref_abs_path = store.get_refs_abs_path("pid", pid) with pytest.raises(FileNotFoundError): - store.delete_cid_refs_file(pid_ref_abs_path) + store._delete_cid_refs_file(pid_ref_abs_path) From ea59f3e57120adbbb5ce3a87f0f717bf81a345ff Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 13 Nov 2023 15:16:41 -0800 Subject: [PATCH 045/420] Add new method '_validate_references' that is now called after atomically moving refs files in 'tag_object' --- src/hashstore/filehashstore.py | 52 +++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git 
a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index e8e265c9..0a5b0409 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -538,9 +538,9 @@ def tag_object(self, pid, cid): # Move both files shutil.move(pid_tmp_file_path, pid_ref_abs_path) shutil.move(cid_tmp_file_path, cid_ref_abs_path) + self._validate_references(pid, cid) return True finally: - # TODO: Verify that the reference files have been written as expected. # Release cid with self.reference_lock: logging.debug( @@ -1578,6 +1578,56 @@ def _validate_object( logging.error(exception_string) raise ValueError(exception_string) + def _validate_references(self, pid, cid): + """Verifies that the supplied pid and pid reference file and content have been + written successfully. + + Args: + pid (string): Authority-based or persistent identifier + cid (string): Content identifier + """ + # Check that reference files were created + pid_ref_abs_path = self.get_refs_abs_path("pid", pid) + cid_ref_abs_path = self.get_refs_abs_path("cid", cid) + if not os.path.exists(pid_ref_abs_path): + exception_string = ( + "FileHashStore - _validate_references: Pid refs file missing: %s", + pid_ref_abs_path, + ) + logging.error(exception_string) + raise FileNotFoundError(exception_string) + if not os.path.exists(cid_ref_abs_path): + exception_string = ( + "FileHashStore - _validate_references: Cid refs file missing: %s", + cid_ref_abs_path, + ) + logging.error(exception_string) + raise FileNotFoundError(exception_string) + # Check the content of the reference files + # Start with the cid + retrieved_cid = self.find_object(pid) + if retrieved_cid != cid: + exception_string = ( + f"FileHashStore - _validate_references: Pid refs file exists ({pid_ref_abs_path})" + + f" but cid ({cid}) does not match." 
+ ) + logging.error(exception_string) + raise ValueError(exception_string) + # Then the pid + pid_found = False + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + value = line.strip() + if value == pid: + pid_found = True + if not pid_found: + exception_string = ( + f"FileHashStore - _validate_references: Cid refs file exists ({cid_ref_abs_path})" + + f" but pid ({pid}) not found." + ) + logging.error(exception_string) + raise ValueError(exception_string) + def _validate_metadata_to_store(self, metadata): """Evaluates a metadata argument to ensure that it is either a string, path or stream object before attempting to store it. From cef6e933f89add7ae20e3cacda5e767ddb76d333 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 13 Nov 2023 15:47:49 -0800 Subject: [PATCH 046/420] Refactor '_delete_cid_refs_file', revise pytests and add new pytests for '_validate_references' --- src/hashstore/filehashstore.py | 7 ++- tests/test_filehashstore_interface.py | 7 ++- tests/test_filehashstore_references.py | 76 ++++++++++++++++++++++++-- 3 files changed, 79 insertions(+), 11 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 0a5b0409..9ebf046e 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1264,14 +1264,15 @@ def _delete_cid_refs_file(self, cid_ref_abs_path): "FileHashStore - delete_cid_refs_file: Cid reference file not found: %s", cid_ref_abs_path, ) + logging.error(err_msg) raise FileNotFoundError(err_msg) if os.path.getsize(cid_ref_abs_path) != 0: - warn_msg = ( + err_msg = ( "FileHashStore - delete_cid_refs_file: Failed to delete cid reference file." 
+ f" File is not empty: {cid_ref_abs_path} " ) - logging.warning(warn_msg) - return False + logging.error(err_msg) + raise OSError(err_msg) else: os.remove(cid_ref_abs_path) return True diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index ac806545..afc63d53 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -956,9 +956,10 @@ def test_delete_objects_cid_refs_file_with_pid_refs_remaining(pids, store): # pylint: disable=W0212 store._update_cid_refs(cid_refs_abs_path, "dou.test.1") _metadata_cid = store.store_metadata(pid, syspath, format_id) - store.delete_object(pid) - cid_refs_file_path = store.get_refs_abs_path("cid", cid) - assert os.path.exists(cid_refs_file_path) + with pytest.raises(OSError): + store.delete_object(pid) + cid_refs_file_path = store.get_refs_abs_path("cid", cid) + assert os.path.exists(cid_refs_file_path) def test_delete_object_pid_empty(store): diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 4b86055c..8301c030 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -70,7 +70,7 @@ def test_update_cid_refs_content_multiple(pids, store): def test_update_cid_refs_content_pid_exists(pids, store): - """Test that update_cid_ref does not write pid if pid already exists""" + """Test that update_cid_ref throws exception if pid already exists.""" for pid in pids.keys(): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) @@ -128,15 +128,14 @@ def test_delete_cid_refs_pid_file(pids, store): def test_delete_cid_refs_pid_file_not_empty(pids, store): - """Test that delete_cid_refs_file does not raise an exception when refs file - is not empty.""" + """Test that delete_cid_refs_file raises an exception when refs file is not empty.""" for pid in pids.keys(): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) 
store.create_path(os.path.dirname(cid_ref_abs_path)) store._write_cid_refs_file(cid_ref_abs_path, pid) - cid_refs_deleted = store._delete_cid_refs_file(cid_ref_abs_path) - assert not cid_refs_deleted + with pytest.raises(OSError): + store._delete_cid_refs_file(cid_ref_abs_path) def test_delete_cid_refs_pid_file_not_found(pids, store): @@ -190,3 +189,70 @@ def test_delete_pid_refs_file_file_not_found(pids, store): pid_ref_abs_path = store.get_refs_abs_path("pid", pid) with pytest.raises(FileNotFoundError): store._delete_cid_refs_file(pid_ref_abs_path) + + +def test_validate_references_pid_refs_file_missing(pids, store): + """Test that validate_references throws exception when pid refs file is missing.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + with pytest.raises(FileNotFoundError): + store._validate_references(pid, cid) + + +def test_validate_references_pid_refs_incorrect_cid(pids, store): + """Test that validate_references throws exception when pid refs file cid is incorrect.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store._write_pid_refs_file(pid_ref_abs_path, "bad_cid") + with pytest.raises(FileNotFoundError): + store._validate_references(pid, cid) + + +def test_validate_references_cid_refs_file_missing(pids, store): + """Test that validate_references throws exception when cid refs file is missing.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store._write_pid_refs_file(pid_ref_abs_path, cid) + with pytest.raises(FileNotFoundError): + store._validate_references(pid, cid) + + +def test_validate_references_cid_refs_file_missing_pid(pids, store): + """Test that validate_references throws exception when cid refs file does not contain + the expected pid.""" + for pid in pids.keys(): + cid = 
pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store._write_pid_refs_file(pid_ref_abs_path, cid) + store._write_cid_refs_file(cid_ref_abs_path, "bad_pid") + with pytest.raises(ValueError): + store._validate_references(pid, cid) + + +def test_validate_references_cid_refs_file_with_multiple_refs_missing_pid(pids, store): + """Test that validate_references throws exception when cid refs file with multiple + references does not contain the expected pid.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store._write_pid_refs_file(pid_ref_abs_path, cid) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store._write_cid_refs_file(cid_ref_abs_path, "bad_pid") + + cid_reference_list = [pid] + for i in range(0, 5): + store._update_cid_refs(cid_ref_abs_path, f"dou.test.{i}") + cid_reference_list.append(f"dou.test.{i}") + + with pytest.raises(ValueError): + store._validate_references(pid, cid) From b567d5b0218b658985f07de329eb28c594bc267f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 14 Nov 2023 09:39:31 -0800 Subject: [PATCH 047/420] Add pytests for 'store_data_only' --- src/hashstore/filehashstore.py | 8 ------ tests/test_filehashstore.py | 48 +++++++++++++++++++++++++++++----- 2 files changed, 41 insertions(+), 15 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 9ebf046e..e4b15d1d 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -501,7 +501,6 @@ def tag_object(self, pid, cid): self.reference_locked_cids.append(cid) try: # TODO: Review process and test what 
happens when specific pieces fail - # We cannot have a pid ref file whose pid is not referenced in the cid refs file pid_ref_abs_path = self.get_refs_abs_path("pid", pid) cid_ref_abs_path = self.get_refs_abs_path("cid", cid) if os.path.exists(pid_ref_abs_path): @@ -839,13 +838,6 @@ def store_data_only(self, data): """ logging.debug("FileHashStore - store_object: Request to store object.") - # TODO: Missing Tests - # - Test that this method returns hex digests and that they are correct - # - Test that objects are actually stored with their cid - # - Test that exception is raised when object fails to store - # - Test that exception is raised when object already exists - # - Test providing the data as a file path - # - Test providing the data as a stream try: # Ensure the data is a stream stream = Stream(data) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 801257b4..8c448ad4 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -206,7 +206,7 @@ def test_set_default_algorithms_missing_yaml(store, pids): def test_store_and_validate_data_files_path(pids, store): - """Test put objects with path object.""" + """Test store_and_validate_data objects with path object.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -217,7 +217,7 @@ def test_store_and_validate_data_files_path(pids, store): def test_store_and_validate_data_files_string(pids, store): - """Test put objects with string.""" + """Test store_and_validate_data objects with string.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -228,7 +228,7 @@ def test_store_and_validate_data_files_string(pids, store): def test_store_and_validate_data_files_stream(pids, store): - """Test put objects with stream.""" + """Test store_and_validate_data objects with stream.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -242,7 +242,7 @@ def test_store_and_validate_data_files_stream(pids, store): 
def test_store_and_validate_data_cid(pids, store): - """Check put returns correct id.""" + """Check store_and_validate_data returns correct id.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -252,7 +252,7 @@ def test_store_and_validate_data_cid(pids, store): def test_store_and_validate_data_file_size(pids, store): - """Check put returns correct file size.""" + """Check store_and_validate_data returns correct file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -262,7 +262,7 @@ def test_store_and_validate_data_file_size(pids, store): def test_store_and_validate_data_hex_digests(pids, store): - """Check put successfully generates hex digests dictionary.""" + """Check store_and_validate_data successfully generates hex digests dictionary.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -303,7 +303,7 @@ def test_store_and_validate_data_with_correct_checksums(pids, store): def test_store_and_validate_data_with_incorrect_checksum(pids, store): - """Check put fails when bad checksum supplied.""" + """Check store_and_validate_data fails when bad checksum supplied.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -317,6 +317,40 @@ def test_store_and_validate_data_with_incorrect_checksum(pids, store): assert store.count(entity) == 0 +def test_store_data_only_cid(pids, store): + """Check store_data_only returns correct id.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_data_only(path) + object_metadata_id = object_metadata.id + assert object_metadata_id == pids[pid][store.algorithm] + + +def test_store_data_only_file_size(pids, store): + """Check store_data_only returns correct file size.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata 
= store.store_data_only(path) + object_size = object_metadata.obj_size + assert object_size == pids[pid]["file_size_bytes"] + + +def test_store_data_only_hex_digests(pids, store): + """Check store_data_only generates hex digests dictionary.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_data_only(path) + object_metadata_hex_digests = object_metadata.hex_digests + assert object_metadata_hex_digests.get("md5") == pids[pid]["md5"] + assert object_metadata_hex_digests.get("sha1") == pids[pid]["sha1"] + assert object_metadata_hex_digests.get("sha256") == pids[pid]["sha256"] + assert object_metadata_hex_digests.get("sha384") == pids[pid]["sha384"] + assert object_metadata_hex_digests.get("sha512") == pids[pid]["sha512"] + + def test_move_and_get_checksums_id(pids, store): """Test move returns correct id.""" test_dir = "tests/testdata/" From 348c536c9c0a5f43b9c17bbf8ac837fcc1878461 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 14 Nov 2023 13:41:27 -0800 Subject: [PATCH 048/420] Clean up comments, code and logging statements --- src/hashstore/filehashstore.py | 55 ++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index e4b15d1d..a6efed56 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -823,7 +823,8 @@ def store_and_validate_data( def store_data_only(self, data): """Store an object to HashStore and return the id and a hex digest - dictionary of the default algorithms. + dictionary of the default algorithms. This method does not validate the + object and writes directly to /objects after the hex digests are calculated. Args: data (mixed): String or path to object. @@ -836,7 +837,9 @@ def store_data_only(self, data): object_metadata (ObjectMetadata): object that contains the object id, object file size and hex digest dictionary. 
""" - logging.debug("FileHashStore - store_object: Request to store object.") + logging.debug( + "FileHashStore - store_object: Request to store data object only." + ) try: # Ensure the data is a stream @@ -902,10 +905,6 @@ def _move_and_get_checksums( Returns: object_metadata (tuple): object id, object file size and hex digest dictionary. """ - # TODO: If the checksum algorithm is the same as the store algorithm, then we can - # determine whether the object exists or not to be efficient - - # Create temporary file and calculate hex digests debug_msg = ( "FileHashStore - _move_and_get_checksums: Creating temp" + f" file and calculating checksums for pid: {pid}" @@ -993,8 +992,8 @@ def _move_and_get_checksums( else: # Else delete temporary file exception_string = ( - f"FileHashStore - _move_and_get_checksums: Object exists at: {abs_file_path}," - + " deleting temporary file." + "FileHashStore - _move_and_get_checksums: Object already exists at:" + + f" {abs_file_path}, deleting temporary file." 
) logging.error(exception_string) self.delete(entity, tmp_file_name) @@ -1133,8 +1132,11 @@ def _write_cid_refs_file(self, path, pid): path (string): Path of file to be written into pid (string): Authority-based or persistent identifier of object """ - info_msg = f"FileHashStore - write_cid_refs_file: Writing pid ({pid}) into file: {path}" - logging.info(info_msg) + logging.debug( + "FileHashStore - write_cid_refs_file: Writing pid (%s) into file: %s", + pid, + path, + ) try: with open(path, "w", encoding="utf8") as cid_ref_file: @@ -1160,11 +1162,11 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): cid_ref_abs_path (string): Absolute path to the cid ref file pid (string): Authority-based or persistent identifier of object """ - info_msg = ( - f"FileHashStore - update_cid_refs: Adding pid ({pid}) into cid reference" - + f" file: {cid_ref_abs_path}" + logging.debug( + "FileHashStore - update_cid_refs: Adding pid (%s) into cid reference file: %s", + pid, + cid_ref_abs_path, ) - logging.info(info_msg) try: with open(cid_ref_abs_path, "r", encoding="utf8") as f: @@ -1200,11 +1202,11 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): cid_ref_abs_path (string): Absolute path to the cid ref file pid (string): Authority-based or persistent identifier of object """ - info_msg = ( - f"FileHashStore - delete_cid_refs_pid: Deleting pid ({pid}) from cid reference" - + f" file: {cid_ref_abs_path}" + logging.debug( + "FileHashStore - delete_cid_refs_pid: Deleting pid (%s) from cid reference file: %s", + pid, + cid_ref_abs_path, ) - logging.info(info_msg) try: with open(cid_ref_abs_path, "r", encoding="utf8") as cid_ref_file: @@ -1244,11 +1246,10 @@ def _delete_cid_refs_file(self, cid_ref_abs_path): Returns: boolean: True if deleted, False if not """ - info_msg = ( + logging.debug( "FileHashStore - delete_cid_refs_file: Deleting reference file: %s", cid_ref_abs_path, ) - logging.info(info_msg) try: if not os.path.exists(cid_ref_abs_path): @@ -1287,8 +1288,11 @@ def 
_write_pid_refs_file(self, path, cid): path (string): Path of file to be written into cid (string): Content identifier """ - info_msg = f"FileHashStore - write_pid_refs_file: Writing cid ({cid}) into file: {path}" - logging.info(info_msg) + logging.debug( + "FileHashStore - write_pid_refs_file: Writing cid (%s) into file: %s", + cid, + path, + ) try: with open(path, "w", encoding="utf8") as pid_ref_file: @@ -1313,11 +1317,10 @@ def _delete_pid_refs_file(self, pid_ref_abs_path): Args: pid_ref_abs_path (string): Absolute path to the pid ref file """ - info_msg = ( + logging.debug( "FileHashStore - delete_pid_refs_file: Deleting reference file: %s", pid_ref_abs_path, ) - logging.info(info_msg) try: if not os.path.exists(pid_ref_abs_path): @@ -1551,7 +1554,7 @@ def _validate_object( if file_size_to_validate != tmp_file_size: self.delete(entity, tmp_file_name) exception_string = ( - "FileHashStore - _move_and_get_checksums: Object file size calculated: " + "FileHashStore - _validate_object: Object file size calculated: " + f" {tmp_file_size} does not match with expected size:" + f"{file_size_to_validate}. Tmp file deleted and file not stored for" + f" pid: {pid}" @@ -1563,7 +1566,7 @@ def _validate_object( if hex_digest_stored != checksum: self.delete(entity, tmp_file_name) exception_string = ( - "FileHashStore - _move_and_get_checksums: Hex digest and checksum" + "FileHashStore - _validate_object: Hex digest and checksum" + f" do not match - file not stored for pid: {pid}. Algorithm:" + f" {checksum_algorithm}. Checksum provided: {checksum} !=" + f" HexDigest: {hex_digest_stored}. Tmp file deleted." 
From 3766155ef3f80541b041d9489ca48835654d2cb0 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 14 Nov 2023 14:00:33 -0800 Subject: [PATCH 049/420] Refactor '_validate_object' method and update docstrings --- src/hashstore/filehashstore.py | 51 +++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index a6efed56..3b54c48d 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1537,42 +1537,55 @@ def _validate_object( tmp_file_size, file_size_to_validate, ): - """Evaluates an object's integrity + """Evaluates an object's integrity and throws exception if there is a mismatch. Args: - pid: For logging purposes - checksum: Value of checksum - checksum_algorithm: Algorithm of checksum - entity: Type of object - hex_digests: Dictionary of hex digests to select from - tmp_file_name: Name of tmp file - tmp_file_size: Size of the tmp file - file_size_to_validate: Expected size of the object + pid (string): For logging purposes + checksum (string): Value of checksum to check + checksum_algorithm (string): Algorithm of checksum + entity (string): Type of object ('objects' or 'metadata') + hex_digests (dictionary): Dictionary of hex digests to parse + tmp_file_name (string): Name of tmp file + tmp_file_size (int): Size of the tmp file + file_size_to_validate (int): Expected size of the object """ - # TODO: Refactor this method and/or create a new method for Metacat client to call if file_size_to_validate is not None and file_size_to_validate > 0: if file_size_to_validate != tmp_file_size: - self.delete(entity, tmp_file_name) exception_string = ( "FileHashStore - _validate_object: Object file size calculated: " + f" {tmp_file_size} does not match with expected size:" - + f"{file_size_to_validate}. Tmp file deleted and file not stored for" - + f" pid: {pid}" + + f"{file_size_to_validate}." 
) - logging.error(exception_string) - raise ValueError(exception_string) + if pid is not None: + self.delete(entity, tmp_file_name) + exception_string_for_pid = ( + exception_string + + f" Tmp file deleted and file not stored for pid: {pid}" + ) + logging.error(exception_string_for_pid) + raise ValueError(exception_string_for_pid) + else: + logging.error(exception_string) + raise ValueError(exception_string) if checksum_algorithm is not None and checksum is not None: hex_digest_stored = hex_digests[checksum_algorithm] if hex_digest_stored != checksum: - self.delete(entity, tmp_file_name) exception_string = ( "FileHashStore - _validate_object: Hex digest and checksum" + f" do not match - file not stored for pid: {pid}. Algorithm:" + f" {checksum_algorithm}. Checksum provided: {checksum} !=" - + f" HexDigest: {hex_digest_stored}. Tmp file deleted." + + f" HexDigest: {hex_digest_stored}." ) - logging.error(exception_string) - raise ValueError(exception_string) + if pid is not None: + self.delete(entity, tmp_file_name) + exception_string_for_pid = ( + exception_string + f"Tmp file ({tmp_file_name}) deleted." 
+ ) + logging.error(exception_string_for_pid) + raise ValueError(exception_string_for_pid) + else: + logging.error(exception_string) + raise ValueError(exception_string) def _validate_references(self, pid, cid): """Verifies that the supplied pid and pid reference file and content have been From 3b5275b807b2b4a670dda1ffd9401f0307195cc1 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 14 Nov 2023 14:15:21 -0800 Subject: [PATCH 050/420] Add new method 'verify_object' to allow caller to validate an object's integrity --- src/hashstore/filehashstore.py | 42 +++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 3b54c48d..93339569 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -477,6 +477,46 @@ def store_object( return object_metadata + def verify_object( + self, object_metadata, checksum, checksum_algorithm, expected_file_size + ): + """Confirms that a object_metadata's content is equal to the given values. + + Args: + object_metadata (ObjectMetadata): object_metadata object + checksum (string): Value of checksum + checksum_algorithm (string): Algorithm of checksum + expected_file_size (int): Size of the tmp file + """ + logging.debug( + "FileHashStore - verify_object: Called to verify object with id: %s", + object_metadata.id, + ) + self._is_string_none_or_empty(checksum, "checksum", "verify_object") + self._is_string_none_or_empty( + checksum_algorithm, "checksum_algorithm", "verify_object" + ) + self._validate_file_size(expected_file_size) + if object_metadata is None or not isinstance(ObjectMetadata): + raise ValueError( + "FileHashStore - verify_object: 'object_metadata' cannot be None." + + " Must be a 'ObjectMetadata' object." 
+ ) + else: + object_metadata_hex_digests = object_metadata.hex_digests + object_metadata_file_size = object_metadata.obj_size + checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) + self._validate_object( + pid=None, + checksum=checksum, + checksum_algorithm=checksum_algorithm_checked, + entity=None, + hex_digests=object_metadata_hex_digests, + tmp_file_name=None, + tmp_file_size=object_metadata_file_size, + file_size_to_validate=expected_file_size, + ) + def tag_object(self, pid, cid): logging.debug( "FileHashStore - tag_object: Tagging object cid: {%s} with pid: {%s}.", @@ -1994,7 +2034,7 @@ def count(self, entity): @staticmethod def _validate_file_size(file_size): - """Checks whether a file size is > 0 and an int and throws exception if not. + """Checks whether a given argument is an integer and > 0 and throws exception if not. Args: file_size (int): file size to check From f8d142543a751f23c7b1455bd7b97439dbd4876a Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 14 Nov 2023 14:40:41 -0800 Subject: [PATCH 051/420] Clean up code and add TODO items --- src/hashstore/filehashstore.py | 45 +++++++++++++++++--------- tests/test_filehashstore_references.py | 4 +-- 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 93339569..a4c37dbd 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -18,8 +18,7 @@ class FileHashStore(HashStore): """FileHashStore is a content addressable file manager based on Derrick Gilland's 'hashfs' library. It supports the storage of objects on disk using - an authority-based identifier's hex digest with a given hash algorithm value - to address files. + a content identifier to address files. FileHashStore initializes using a given properties dictionary containing the required keys (see Args). 
Upon initialization, FileHashStore verifies the provided @@ -113,6 +112,7 @@ def __init__(self, properties=None): if not os.path.exists(self.metadata): self.create_path(self.metadata + "/tmp") if not os.path.exists(self.refs): + self.create_path(self.refs + "/tmp") self.create_path(self.refs + "/pid") self.create_path(self.refs + "/cid") logging.debug( @@ -488,6 +488,7 @@ def verify_object( checksum_algorithm (string): Algorithm of checksum expected_file_size (int): Size of the tmp file """ + # TODO: Write tests logging.debug( "FileHashStore - verify_object: Called to verify object with id: %s", object_metadata.id, @@ -540,10 +541,10 @@ def tag_object(self, pid, cid): ) self.reference_locked_cids.append(cid) try: - # TODO: Review process and test what happens when specific pieces fail pid_ref_abs_path = self.get_refs_abs_path("pid", pid) cid_ref_abs_path = self.get_refs_abs_path("cid", cid) if os.path.exists(pid_ref_abs_path): + # A pid reference file can only contain one cid exception_string = ( "FileHashStore - write_pid_refs_file: pid ref file already exists for %s", pid_ref_abs_path, @@ -551,13 +552,12 @@ def tag_object(self, pid, cid): logging.error(exception_string) raise FileExistsError(exception_string) elif os.path.exists(cid_ref_abs_path): - # If it does, read the file and add the new pid on its own line + # Update cid ref files if it already exists self.update_cid_refs(cid_ref_abs_path, pid) else: # All ref files begin as tmp files and get moved sequentially at once # Ensure refs tmp folder exists tmp_root_path = self.get_store_path("refs") / "tmp" - # Physically create directory if it doesn't exist if os.path.exists(tmp_root_path) is False: self.create_path(tmp_root_path) @@ -565,7 +565,7 @@ def tag_object(self, pid, cid): pid_tmp_file = self._mktmpfile(tmp_root_path) pid_tmp_file_path = pid_tmp_file.name self._write_pid_refs_file(pid_tmp_file_path, cid) - # Then write cid_refs_file content into tmp file + # Then write cid_refs_file content into another 
tmp file cid_tmp_file = self._mktmpfile(tmp_root_path) cid_tmp_file_path = cid_tmp_file.name self._write_cid_refs_file(cid_tmp_file_path, pid) @@ -577,6 +577,8 @@ def tag_object(self, pid, cid): # Move both files shutil.move(pid_tmp_file_path, pid_ref_abs_path) shutil.move(cid_tmp_file_path, cid_ref_abs_path) + # Ensure that the reference files have been written as expected + # If there is an issue, client or user will have to manually review self._validate_references(pid, cid) return True finally: @@ -967,9 +969,10 @@ def _move_and_get_checksums( object_cid = hex_digests.get(self.algorithm) abs_file_path = self.build_abs_path(entity, object_cid, extension) - # Only move file if it doesn't exist. - # Files are stored once and only once + # Only move file if it doesn't exist. We do not check before we create the tmp + # file and calculate the hex digests because the given checksum could be incorrect. if not os.path.isfile(abs_file_path): + # Files are stored once and only once self._validate_object( pid, checksum, @@ -1164,9 +1167,8 @@ def delete_tmp_file(): return tmp def _write_cid_refs_file(self, path, pid): - """Write the reference file in the supplied path for the given content - identifier (cid). A reference file contains every pid that references a - cid each on its own line. + """Write the cid reference file in the supplied path. A reference file contains + every pid that references a cid each on its own line. Args: path (string): Path of file to be written into @@ -1178,6 +1180,11 @@ def _write_cid_refs_file(self, path, pid): path, ) + # TODO: Check that the given path does not contain any data before writing + # This method only writes a new cid refs file and should not overwrite + # an existing one. 
+ # TODO: Write test to confirm exception is thrown when path contains data + try: with open(path, "w", encoding="utf8") as cid_ref_file: fcntl.flock(cid_ref_file, fcntl.LOCK_EX) @@ -1208,6 +1215,10 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): cid_ref_abs_path, ) + # TODO: Throw exception if the file doesn't exist. This method should only + # proceed when there is an existing cid refs file. + # TODO: Write test to check for exception thrown + try: with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): @@ -1319,10 +1330,8 @@ def _delete_cid_refs_file(self, cid_ref_abs_path): raise err def _write_pid_refs_file(self, path, cid): - """Write the reference file in the supplied path for the given pid (persistent + """Write the pid reference file in the supplied path for the given cid (content identifier). A reference file for a pid contains the cid that it references. - Its permanent address is the pid hash using HashStore's default store algorithm - and follows its directory structure. Args: path (string): Path of file to be written into @@ -1988,7 +1997,11 @@ def build_abs_path(self, entity, hash_id, extension=""): return absolute_path def get_refs_abs_path(self, ref_type, hash_id): - """Get the absolute path to the reference file for the given pid. + """Get the absolute path to the reference file for the given ref_type. If a + 'pid' is provided, this method will calculate the pid's hash based on the store + algorithm, and return the expected address of the pid reference file. If a + 'cid' is provided, this method will return the expected address by sharding the + cid based on HashStore's configuration. 
Args: ref_type (string): 'pid' or 'cid' @@ -2060,7 +2073,7 @@ def _is_string_none_or_empty(string, arg, method): Args: string (string): Value to check - arg (): Name of argument to check + arg (string): Name of argument to check method (string): Calling method for logging purposes """ if string is None or string.replace(" ", "") == "": diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 8301c030..9051d9e3 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -127,7 +127,7 @@ def test_delete_cid_refs_pid_file(pids, store): assert not os.path.exists(cid_ref_abs_path) -def test_delete_cid_refs_pid_file_not_empty(pids, store): +def test_delete_cid_refs_file_file_not_empty(pids, store): """Test that delete_cid_refs_file raises an exception when refs file is not empty.""" for pid in pids.keys(): cid = pids[pid]["sha256"] @@ -138,7 +138,7 @@ def test_delete_cid_refs_pid_file_not_empty(pids, store): store._delete_cid_refs_file(cid_ref_abs_path) -def test_delete_cid_refs_pid_file_not_found(pids, store): +def test_delete_cid_refs_file_file_not_found(pids, store): """Test that delete_cid_refs_file raises an exception when refs file not found.""" for pid in pids.keys(): cid = pids[pid]["sha256"] From 68ba2a781bc0d8ccaaf48a939869e1ac8324609f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 16 Nov 2023 10:53:18 -0800 Subject: [PATCH 052/420] Clean-up test modules' comments and docstrings and move 'tag_object' test methods to 'test_filehashstore_references' --- src/hashstore/hashstore.py | 2 +- tests/test_filehashstore.py | 2 +- tests/test_filehashstore_interface.py | 93 +------------------------- tests/test_filehashstore_references.py | 93 +++++++++++++++++++++++++- tests/test_filehashstore_stream.py | 2 +- tests/test_hashstore.py | 8 +-- tests/test_hashstore_client.py | 2 +- 7 files changed, 101 insertions(+), 101 deletions(-) diff --git a/src/hashstore/hashstore.py 
b/src/hashstore/hashstore.py index 130c1304..b1851d0e 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -33,7 +33,7 @@ def store_object( The file's id is determined by calculating the object's content identifier based on the store's default algorithm, which is also used as the permanent address of the file. - The file's identifier is then sharded using a depth of 3 and width of 2, + The file's identifier is then sharded using the store's configured depth and width, delimited by '/' and concatenated to produce the final permanent address and is stored in the `/store_directory/objects/` directory. diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 8c448ad4..59c8b1ac 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1,4 +1,4 @@ -"""Test module for FileHashStore core, utility and supporting methods""" +"""Test module for FileHashStore core, utility and supporting methods.""" import io import os from pathlib import Path diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index afc63d53..566849da 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -1,4 +1,4 @@ -"""Test module for FileHashStore HashStore interface methods""" +"""Test module for FileHashStore HashStore interface methods.""" import io import os from pathlib import Path @@ -521,97 +521,6 @@ def test_store_object_sparse_large_file(store): assert object_metadata_id == object_metadata.hex_digests.get("sha256") -def test_tag_object(pids, store): - """Test tag object returns boolean.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - object_tagged = store.tag_object(pid, object_metadata.id) - assert object_tagged - - -def test_tag_object_pid_refs_file(pids, store): - """Test tag object creates the pid reference file.""" - test_dir = 
"tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - store.tag_object(pid, object_metadata.id) - pid_refs_file_path = store.get_refs_abs_path("pid", pid) - assert os.path.exists(pid_refs_file_path) - - -def test_tag_object_pid_refs_file_exists(pids, store): - """Test tag object throws exception when pid refs file already exists.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - cid = object_metadata.id - store.tag_object(pid, cid) - pid_refs_file_path = store.get_refs_abs_path("pid", pid) - assert os.path.exists(pid_refs_file_path) - cid_refs_file_path = store.get_refs_abs_path("cid", cid) - assert os.path.exists(cid_refs_file_path) - with pytest.raises(FileExistsError): - store.tag_object(pid, cid) - - -def test_tag_object_pid_refs_file_content(pids, store): - """Test tag object creates the pid reference file contains the correct cid.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - store.tag_object(pid, object_metadata.id) - pid_refs_file_path = store.get_refs_abs_path("pid", pid) - with open(pid_refs_file_path, "r", encoding="utf8") as f: - pid_refs_cid = f.read() - assert pid_refs_cid == object_metadata.id - - -def test_tag_object_cid_refs_file(pids, store): - """Test tag object creates the cid reference file.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - cid = object_metadata.id - store.tag_object(pid, object_metadata.id) - cid_refs_file_path = store.get_refs_abs_path("cid", cid) - assert os.path.exists(cid_refs_file_path) - - -def test_tag_object_cid_refs_file_content(pids, store): - """Test tag object tags cid reference file successfully with pid.""" - test_dir = 
"tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - store.tag_object(pid, object_metadata.id) - cid_refs_file_path = store.get_refs_abs_path("cid", object_metadata.id) - with open(cid_refs_file_path, "r", encoding="utf8") as f: - pid_refs_cid = f.read().strip() - assert pid_refs_cid == pid - - -def test_tag_object_cid_refs_file_exists(pids, store): - """Test tag object raises exception when trying to add another cid to an - existing pid reference file and that a cid reference file is not created.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - store.tag_object(pid, object_metadata.id) - another_cid = "dou.test.1" - with pytest.raises(FileExistsError): - store.tag_object(pid, another_cid) - - second_cid_hash = store.get_refs_abs_path("cid", another_cid) - assert not os.path.exists(second_cid_hash) - - def test_find_object(pids, store): """Test find object returns the correct content identifier (cid).""" test_dir = "tests/testdata/" diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 9051d9e3..9a4fc061 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -1,10 +1,101 @@ -"""Test module for FileHashStore core, utility and supporting methods""" +"""Test module for FileHashStore's reference system to tag stored objects.""" import os import pytest # pylint: disable=W0212 +def test_tag_object(pids, store): + """Test tag object returns boolean.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + object_tagged = store.tag_object(pid, object_metadata.id) + assert object_tagged + + +def test_tag_object_pid_refs_file(pids, store): + """Test tag object creates the pid reference file.""" + test_dir 
= "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) + pid_refs_file_path = store.get_refs_abs_path("pid", pid) + assert os.path.exists(pid_refs_file_path) + + +def test_tag_object_pid_refs_file_exists(pids, store): + """Test tag object throws exception when pid refs file already exists.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + cid = object_metadata.id + store.tag_object(pid, cid) + pid_refs_file_path = store.get_refs_abs_path("pid", pid) + assert os.path.exists(pid_refs_file_path) + cid_refs_file_path = store.get_refs_abs_path("cid", cid) + assert os.path.exists(cid_refs_file_path) + with pytest.raises(FileExistsError): + store.tag_object(pid, cid) + + +def test_tag_object_pid_refs_file_content(pids, store): + """Test tag object creates the pid reference file contains the correct cid.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) + pid_refs_file_path = store.get_refs_abs_path("pid", pid) + with open(pid_refs_file_path, "r", encoding="utf8") as f: + pid_refs_cid = f.read() + assert pid_refs_cid == object_metadata.id + + +def test_tag_object_cid_refs_file(pids, store): + """Test tag object creates the cid reference file.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + cid = object_metadata.id + store.tag_object(pid, object_metadata.id) + cid_refs_file_path = store.get_refs_abs_path("cid", cid) + assert os.path.exists(cid_refs_file_path) + + +def test_tag_object_cid_refs_file_content(pids, store): + """Test tag object tags cid reference file successfully with pid.""" + test_dir 
= "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) + cid_refs_file_path = store.get_refs_abs_path("cid", object_metadata.id) + with open(cid_refs_file_path, "r", encoding="utf8") as f: + pid_refs_cid = f.read().strip() + assert pid_refs_cid == pid + + +def test_tag_object_cid_refs_file_exists(pids, store): + """Test tag object raises exception when trying to add another cid to an + existing pid reference file and that a cid reference file is not created.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) + another_cid = "dou.test.1" + with pytest.raises(FileExistsError): + store.tag_object(pid, another_cid) + + second_cid_hash = store.get_refs_abs_path("cid", another_cid) + assert not os.path.exists(second_cid_hash) + + def test_write_cid_refs_file(pids, store): """Test that write_cid_reference writes a reference file.""" for pid in pids.keys(): diff --git a/tests/test_filehashstore_stream.py b/tests/test_filehashstore_stream.py index 8cf4a7d0..94e6c412 100644 --- a/tests/test_filehashstore_stream.py +++ b/tests/test_filehashstore_stream.py @@ -1,4 +1,4 @@ -"""Test module for Stream""" +"""Test module for FileHashStore's Stream class.""" import hashlib import io from pathlib import Path diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index 68cd195a..953e0fac 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -1,4 +1,4 @@ -"""Test module for HashStore Module""" +"""Test module for HashStore's HashStoreFactory and ObjectMetadata class.""" import os import pytest from hashstore.hashstore import ObjectMetadata, HashStoreFactory @@ -43,7 +43,7 @@ def test_factory_get_hashstore_unsupported_module(factory): def 
test_factory_get_hashstore_filehashstore_unsupported_algorithm(factory): - """Check factory raises exception with store algorithm value that part of the default list""" + """Check factory raises exception with store algorithm value that part of the default list.""" module_name = "hashstore.filehashstore" class_name = "FileHashStore" @@ -59,7 +59,7 @@ def test_factory_get_hashstore_filehashstore_unsupported_algorithm(factory): def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory): - """Check factory raises exception with incorrectly formatted algorithm value""" + """Check factory raises exception with incorrectly formatted algorithm value.""" module_name = "hashstore.filehashstore" class_name = "FileHashStore" @@ -75,7 +75,7 @@ def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory) def test_objectmetadata(): - """Test class returns correct values via dot notation.""" + """Test ObjectMetadata class returns correct values via dot notation.""" ab_id = "hashstoretest" obj_size = 1234 hex_digest_dict = { diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 7d1e01a0..d7ec6324 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -1,4 +1,4 @@ -"""Test module for the Python client (Public API calls only)""" +"""Test module for the Python client (Public API calls only).""" import sys import os from pathlib import Path From 511e3e65b95600327a9dc0d00cb7bcfcd7630268 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 16 Nov 2023 12:36:02 -0800 Subject: [PATCH 053/420] Revise '_update_cid_refs' and add new pytest to throw exception if file is not found --- src/hashstore/filehashstore.py | 11 +++++++---- tests/test_filehashstore_references.py | 9 +++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index a4c37dbd..2432d435 100644 --- a/src/hashstore/filehashstore.py +++ 
b/src/hashstore/filehashstore.py @@ -1214,10 +1214,13 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): pid, cid_ref_abs_path, ) - - # TODO: Throw exception if the file doesn't exist. This method should only - # proceed when there is an existing cid refs file. - # TODO: Write test to check for exception thrown + if not os.path.exists(cid_ref_abs_path): + exception_string = ( + f"FileHashStore - update_cid_refs: {cid_ref_abs_path} does not exist." + + f" Cannot write pid: {[pid]}" + ) + logging.error(exception_string) + raise FileNotFoundError(exception_string) try: with open(cid_ref_abs_path, "r", encoding="utf8") as f: diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 9a4fc061..6a1afc07 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -171,6 +171,15 @@ def test_update_cid_refs_content_pid_exists(pids, store): store._update_cid_refs(cid_ref_abs_path, pid) +def test_update_cid_refs_content_cid_refs_does_not_exist(pids, store): + """Test that update_cid_ref throws exception if cid refs file doesn't exist.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + with pytest.raises(FileNotFoundError): + store._update_cid_refs(cid_ref_abs_path, pid) + + def test_delete_cid_refs_pid(pids, store): """Test that delete_cid_refs_pid deletes the given pid from the ref file.""" for pid in pids.keys(): From 2c4a1bdc44549320d806e6950890d11be6ba291c Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 16 Nov 2023 12:54:11 -0800 Subject: [PATCH 054/420] Rename '_validate_file_size' to '_is_int_and_non_negative' for accuracy --- src/hashstore/filehashstore.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 2432d435..a3abc1ac 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -424,7 +424,7 @@ def 
store_object( # Validate input parameters self._is_string_none_or_empty(pid, "pid", "store_object") self._validate_data_to_store(data) - self._validate_file_size(expected_object_size) + self._is_int_and_non_negative(expected_object_size) ( additional_algorithm_checked, checksum_algorithm_checked, @@ -497,7 +497,7 @@ def verify_object( self._is_string_none_or_empty( checksum_algorithm, "checksum_algorithm", "verify_object" ) - self._validate_file_size(expected_file_size) + self._is_int_and_non_negative(expected_file_size) if object_metadata is None or not isinstance(ObjectMetadata): raise ValueError( "FileHashStore - verify_object: 'object_metadata' cannot be None." @@ -2049,7 +2049,7 @@ def count(self, entity): # Other Static Methods @staticmethod - def _validate_file_size(file_size): + def _is_int_and_non_negative(file_size): """Checks whether a given argument is an integer and > 0 and throws exception if not. Args: @@ -2058,14 +2058,14 @@ def _validate_file_size(file_size): if file_size is not None: if not isinstance(file_size, int): exception_string = ( - "FileHashStore - _is_file_size_valid: size given must be an integer." + "FileHashStore - _is_int_and_non_negative: size given must be an integer." + f" File size: {file_size}. Arg Type: {type(file_size)}." 
) logging.error(exception_string) raise TypeError(exception_string) if file_size < 1 or not isinstance(file_size, int): exception_string = ( - "FileHashStore - _is_file_size_valid: size given must be > 0" + "FileHashStore - _is_int_and_non_negative: size given must be > 0" ) logging.error(exception_string) raise ValueError(exception_string) From 38a1a3cedc67c5f76160569990ea521544f5ecec Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 16 Nov 2023 15:36:52 -0800 Subject: [PATCH 055/420] Update and add new pytests for '_write_cid_refs_file' method --- src/hashstore/filehashstore.py | 41 +++++++++++++++----------- tests/test_filehashstore_references.py | 23 +++++++++++++++ 2 files changed, 46 insertions(+), 18 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index a3abc1ac..492e4f34 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1167,8 +1167,9 @@ def delete_tmp_file(): return tmp def _write_cid_refs_file(self, path, pid): - """Write the cid reference file in the supplied path. A reference file contains - every pid that references a cid each on its own line. + """Write the cid reference file in the supplied path to a file. A reference file + contains every pid that references a cid each on its own line. This method will + only write into an empty file, and will not write over an existing one. Args: path (string): Path of file to be written into @@ -1180,10 +1181,14 @@ def _write_cid_refs_file(self, path, pid): path, ) - # TODO: Check that the given path does not contain any data before writing - # This method only writes a new cid refs file and should not overwrite - # an existing one. - # TODO: Write test to confirm exception is thrown when path contains data + if os.path.isfile(path): + if os.path.getsize(path) != 0: + err_msg = ( + "FileHashStore - _write_cid_refs_file: Failed to write cid reference file." 
+ + f" File is not empty: {path} " + ) + logging.error(err_msg) + raise OSError(err_msg) try: with open(path, "w", encoding="utf8") as cid_ref_file: @@ -1257,7 +1262,7 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): pid (string): Authority-based or persistent identifier of object """ logging.debug( - "FileHashStore - delete_cid_refs_pid: Deleting pid (%s) from cid reference file: %s", + "FileHashStore - _delete_cid_refs_pid: Deleting pid (%s) from cid reference file: %s", pid, cid_ref_abs_path, ) @@ -1270,7 +1275,7 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): if pid not in cid_ref_file_content: err_msg = ( - f"FileHashStore - delete_cid_refs_pid: pid ({pid}) does not exist in" + f"FileHashStore - _delete_cid_refs_pid: pid ({pid}) does not exist in" + f" cid reference file: {cid_ref_abs_path} " ) raise ValueError(err_msg) @@ -1285,7 +1290,7 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): except Exception as err: exception_string = ( - "FileHashStore - delete_cid_refs_pid: failed to update reference for cid:" + "FileHashStore - _delete_cid_refs_pid: failed to update reference for cid:" + f" {cid_ref_abs_path} for pid: {pid}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) @@ -1301,21 +1306,21 @@ def _delete_cid_refs_file(self, cid_ref_abs_path): boolean: True if deleted, False if not """ logging.debug( - "FileHashStore - delete_cid_refs_file: Deleting reference file: %s", + "FileHashStore - _delete_cid_refs_file: Deleting reference file: %s", cid_ref_abs_path, ) try: if not os.path.exists(cid_ref_abs_path): err_msg = ( - "FileHashStore - delete_cid_refs_file: Cid reference file not found: %s", + "FileHashStore - _delete_cid_refs_file: Cid reference file not found: %s", cid_ref_abs_path, ) logging.error(err_msg) raise FileNotFoundError(err_msg) if os.path.getsize(cid_ref_abs_path) != 0: err_msg = ( - "FileHashStore - delete_cid_refs_file: Failed to delete cid reference file." 
+ "FileHashStore - _delete_cid_refs_file: Failed to delete cid reference file." + f" File is not empty: {cid_ref_abs_path} " ) logging.error(err_msg) @@ -1326,7 +1331,7 @@ def _delete_cid_refs_file(self, cid_ref_abs_path): except Exception as err: exception_string = ( - "FileHashStore - delete_cid_refs_file: failed to delete reference file:" + "FileHashStore - _delete_cid_refs_file: failed to delete reference file:" + f" {cid_ref_abs_path}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) @@ -1341,7 +1346,7 @@ def _write_pid_refs_file(self, path, cid): cid (string): Content identifier """ logging.debug( - "FileHashStore - write_pid_refs_file: Writing cid (%s) into file: %s", + "FileHashStore - _write_pid_refs_file: Writing cid (%s) into file: %s", cid, path, ) @@ -1357,7 +1362,7 @@ def _write_pid_refs_file(self, path, cid): except Exception as err: exception_string = ( - f"FileHashStore - write_pid_refs_file: failed to write cid ({cid})" + f"FileHashStore - _write_pid_refs_file: failed to write cid ({cid})" + f" into path: {path}. 
Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) @@ -1370,14 +1375,14 @@ def _delete_pid_refs_file(self, pid_ref_abs_path): pid_ref_abs_path (string): Absolute path to the pid ref file """ logging.debug( - "FileHashStore - delete_pid_refs_file: Deleting reference file: %s", + "FileHashStore - _delete_pid_refs_file: Deleting reference file: %s", pid_ref_abs_path, ) try: if not os.path.exists(pid_ref_abs_path): err_msg = ( - "FileHashStore - delete_pid_refs_file: pid reference file not found: %s", + "FileHashStore - _delete_pid_refs_file: pid reference file not found: %s", pid_ref_abs_path, ) raise FileNotFoundError(err_msg) @@ -1387,7 +1392,7 @@ def _delete_pid_refs_file(self, pid_ref_abs_path): except Exception as err: exception_string = ( - "FileHashStore - delete_pid_refs_file: failed to delete reference file:" + "FileHashStore - _delete_pid_refs_file: failed to delete reference file:" + f" {pid_ref_abs_path}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 6a1afc07..1f4b4b2e 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -120,6 +120,29 @@ def test_write_cid_refs_file_content(pids, store): assert pid == cid_ref_file_pid.strip() +def test_write_cid_refs_file_into_empty_file(pids, store): + """Test that write_cid_reference writes an empty file.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + with open(cid_ref_abs_path, "w", encoding="utf8"): + pass + store._write_cid_refs_file(cid_ref_abs_path, pid) + assert os.path.exists(cid_ref_abs_path) + + +def test_write_cid_refs_file_file_not_empty(pids, store): + """Test that write_cid_reference does not overwrite an existing file.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = 
store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store._write_cid_refs_file(cid_ref_abs_path, pid) + with pytest.raises(OSError): + store._write_cid_refs_file(cid_ref_abs_path, "other_pid") + + def test_update_cid_refs_content(pids, store): """Test that update_cid_ref updates the ref file as expected.""" for pid in pids.keys(): From a9cd611411f6a25d75209f32e9a31a5374ed9fa6 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 17 Nov 2023 11:47:49 -0800 Subject: [PATCH 056/420] Move info logging statements in finally blocks into try block --- src/hashstore/filehashstore.py | 47 +++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 492e4f34..5cce2793 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -453,6 +453,10 @@ def store_object( ) if pid is None: object_metadata = self.store_data_only(data) + logging.info( + "FileHashStore - store_object: Successfully stored object for cid: %s", + object_metadata.id, + ) else: object_metadata = self.store_and_validate_data( pid, @@ -462,6 +466,10 @@ def store_object( checksum_algorithm=checksum_algorithm_checked, file_size_to_validate=expected_object_size, ) + logging.info( + "FileHashStore - store_object: Successfully stored object for pid: %s", + pid, + ) finally: # Release pid with self.object_lock: @@ -470,10 +478,6 @@ def store_object( pid, ) self.object_locked_pids.remove(pid) - logging.info( - "FileHashStore - store_object: Successfully stored object for pid: %s", - pid, - ) return object_metadata @@ -499,10 +503,12 @@ def verify_object( ) self._is_int_and_non_negative(expected_file_size) if object_metadata is None or not isinstance(ObjectMetadata): - raise ValueError( + exception_string = ( "FileHashStore - verify_object: 'object_metadata' cannot be None." + " Must be a 'ObjectMetadata' object." 
) + logging.error(exception_string) + raise ValueError(exception_string) else: object_metadata_hex_digests = object_metadata.hex_digests object_metadata_file_size = object_metadata.obj_size @@ -580,6 +586,12 @@ def tag_object(self, pid, cid): # Ensure that the reference files have been written as expected # If there is an issue, client or user will have to manually review self._validate_references(pid, cid) + + info_msg = ( + f"FileHashStore - tag_object: Successfully tagged cid: {cid}" + + f" with pid: {pid}" + ) + logging.info(info_msg) return True finally: # Release cid @@ -589,8 +601,6 @@ def tag_object(self, pid, cid): cid, ) self.reference_locked_cids.remove(cid) - info_msg = f"FileHashStore - tag_object: Successfully tagged cid: {cid} with pid: {pid}" - logging.info(info_msg) def find_object(self, pid): logging.debug( @@ -643,6 +653,12 @@ def store_metadata(self, pid, metadata, format_id=None): pid, ) metadata_cid = self.put_metadata(metadata, pid, checked_format_id) + + logging.info( + "FileHashStore - store_metadata: Successfully stored metadata for pid: %s", + pid, + ) + return metadata_cid finally: # Release pid with self.metadata_lock: @@ -651,12 +667,6 @@ def store_metadata(self, pid, metadata, format_id=None): pid, ) self.metadata_locked_pids.remove(pid) - logging.info( - "FileHashStore - store_metadata: Successfully stored metadata for pid: %s", - pid, - ) - - return metadata_cid def retrieve_object(self, pid): logging.debug( @@ -745,6 +755,12 @@ def delete_object(self, pid): if cid_refs_deleted: entity = "objects" self.delete(entity, cid) + + info_msg = ( + "FileHashStore - delete_object: Successfully deleted references and/or" + + f" objects associated with pid: {pid}" + ) + logging.info(info_msg) return True finally: # Release cid @@ -754,11 +770,6 @@ def delete_object(self, pid): cid, ) self.reference_locked_cids.remove(cid) - info_msg = ( - "FileHashStore - delete_object: Successfully deleted references and/or" - + f" objects associated with 
pid: {pid}" - ) - logging.info(info_msg) def delete_metadata(self, pid, format_id=None): logging.debug( From 19695f0ace1aa1875674b196d8709372d69d5ccb Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 17 Nov 2023 12:36:42 -0800 Subject: [PATCH 057/420] Fix bug in 'tag_object', add new pytest and revise logging statements --- src/hashstore/filehashstore.py | 23 ++++++++++++++--------- tests/test_filehashstore_references.py | 16 ++++++++++++++++ 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 5cce2793..60810ffe 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -559,7 +559,12 @@ def tag_object(self, pid, cid): raise FileExistsError(exception_string) elif os.path.exists(cid_ref_abs_path): # Update cid ref files if it already exists - self.update_cid_refs(cid_ref_abs_path, pid) + self._update_cid_refs(cid_ref_abs_path, pid) + logging.info( + "FileHashStore - tag_object: Successfully updated cid: %s with pid: %s", + cid, + pid, + ) else: # All ref files begin as tmp files and get moved sequentially at once # Ensure refs tmp folder exists @@ -587,11 +592,11 @@ def tag_object(self, pid, cid): # If there is an issue, client or user will have to manually review self._validate_references(pid, cid) - info_msg = ( - f"FileHashStore - tag_object: Successfully tagged cid: {cid}" - + f" with pid: {pid}" + logging.info( + "FileHashStore - tag_object: Successfully tagged cid: %s with pid %s", + cid, + pid, ) - logging.info(info_msg) return True finally: # Release cid @@ -756,11 +761,11 @@ def delete_object(self, pid): entity = "objects" self.delete(entity, cid) - info_msg = ( + info_string = ( "FileHashStore - delete_object: Successfully deleted references and/or" + f" objects associated with pid: {pid}" ) - logging.info(info_msg) + logging.info(info_string) return True finally: # Release cid @@ -809,11 +814,11 @@ def get_hex_digest(self, pid, algorithm): cid_stream 
= self.open(entity, object_cid) hex_digest = self.computehash(cid_stream, algorithm=algorithm) - info_msg = ( + info_string = ( f"FileHashStore - get_hex_digest: Successfully calculated hex digest for pid: {pid}." + f" Hex Digest: {hex_digest}", ) - logging.info(info_msg) + logging.info(info_string) return hex_digest # FileHashStore Core Methods diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 1f4b4b2e..3cfda1a9 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -96,6 +96,22 @@ def test_tag_object_cid_refs_file_exists(pids, store): assert not os.path.exists(second_cid_hash) +def test_tag_object_cid_refs_update(pids, store): + """Test tag object updates a cid reference file that already exists.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + cid = object_metadata.id + store.tag_object(pid, cid) + store.tag_object("dou.test.1", cid) + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + cid_ref_file_pid = f.read() + + assert "dou.test.1" in cid_ref_file_pid + + def test_write_cid_refs_file(pids, store): """Test that write_cid_reference writes a reference file.""" for pid in pids.keys(): From 5c9d22ff7bc1d2529853db83d1439c4a186980de Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 17 Nov 2023 14:04:02 -0800 Subject: [PATCH 058/420] Update HashStore interface docstring for 'store_object' --- src/hashstore/hashstore.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index b1851d0e..606a2496 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -27,9 +27,10 @@ def store_object( """The `store_object` method is responsible for the atomic storage of objects to disk using a given stream. 
Upon successful storage, the method returns a ObjectMetadata object containing relevant file information, such as the file's id (which can be - used to locate the object on disk), the file's size, and a hex digest map of algorithms + used to locate the object on disk), the file's size, and a hex digest dict of algorithms and checksums. `store_object` also ensures that an object is stored only once by - synchronizing multiple calls and rejecting calls to store duplicate objects. + synchronizing multiple calls and rejecting calls to store duplicate objects. Lastly, + it should call `tag_object` to create the references to allow the object to be found. The file's id is determined by calculating the object's content identifier based on the store's default algorithm, which is also used as the permanent address of the file. @@ -38,17 +39,20 @@ def store_object( and is stored in the `/store_directory/objects/` directory. By default, the hex digest map includes the following hash algorithms: - Default algorithms and hex digests to return: md5, sha1, sha256, sha384, sha512, - which are the most commonly used algorithms in dataset submissions to DataONE - and the Arctic Data Center. If an additional algorithm is provided, the - `store_object` method checks if it is supported and adds it to the map along - with its corresponding hex digest. An algorithm is considered "supported" if it - is recognized as a valid hash algorithm in the `hashlib` library. - - Similarly, if a file size and/or checksum & checksumAlgorithm value are provided, + md5, sha1, sha256, sha384, sha512 - which are the most commonly used algorithms in + dataset submissions to DataONE and the Arctic Data Center. If an additional algorithm + is provided, the `store_object` method checks if it is supported and adds it to the + hex digests dict along with its corresponding hex digest. An algorithm is considered + "supported" if it is recognized as a valid hash algorithm in the `hashlib` library. 
+ + Similarly, if a file size and/or checksum & checksum_algorithm value are provided, `store_object` validates the object to ensure it matches the given arguments before moving the file to its permanent address. + Note, calling `store_object` is a possibility, but should only store the object + without calling `tag_object`. It is the caller's responsibility to finalize the + process by calling `tag_object` after veriftying the correct object is stored. + Args: pid (string): Authority-based identifier. data (mixed): String or path to object. From b8d9715034272f7b4f52e328d9b4c586844279bd Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 17 Nov 2023 15:58:19 -0800 Subject: [PATCH 059/420] Initial refactor to 'store_object', fixed bug in 'verify_object' and add new pytests --- src/hashstore/filehashstore.py | 146 ++++++++++++++----------- tests/test_filehashstore.py | 99 +++++++++++++++++ tests/test_filehashstore_interface.py | 9 -- tests/test_filehashstore_references.py | 16 +++ 4 files changed, 197 insertions(+), 73 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 60810ffe..3b7db430 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -418,46 +418,47 @@ def store_object( checksum_algorithm=None, expected_object_size=None, ): - logging.debug( - "FileHashStore - store_object: Request to store object for pid: %s", pid - ) - # Validate input parameters - self._is_string_none_or_empty(pid, "pid", "store_object") - self._validate_data_to_store(data) - self._is_int_and_non_negative(expected_object_size) - ( - additional_algorithm_checked, - checksum_algorithm_checked, - ) = self._validate_algorithms_and_checksum( - additional_algorithm, checksum, checksum_algorithm - ) - - # Wait for the pid to release if it's in use - while pid in self.object_locked_pids: - logging.debug( - "FileHashStore - store_object: %s is currently being stored. 
Waiting.", - pid, + if pid is None and self._validate_data_to_store(data): + logging.debug("FileHashStore - store_object: Request to store data only.") + object_metadata = self.store_data_only(data) + logging.info( + "FileHashStore - store_object: Successfully stored object for cid: %s", + object_metadata.id, ) - time.sleep(self.time_out_sec) - # Modify object_locked_pids consecutively - with self.object_lock: + else: logging.debug( - "FileHashStore - store_object: Adding pid: %s to object_locked_pids.", - pid, + "FileHashStore - store_object: Request to store object for pid: %s", pid ) - self.object_locked_pids.append(pid) - try: - logging.debug( - "FileHashStore - store_object: Attempting to store object for pid: %s", - pid, + # Validate input parameters + self._is_string_none_or_empty(pid, "pid", "store_object") + self._validate_data_to_store(data) + self._is_int_and_non_negative(expected_object_size) + ( + additional_algorithm_checked, + checksum_algorithm_checked, + ) = self._validate_algorithms_and_checksum( + additional_algorithm, checksum, checksum_algorithm ) - if pid is None: - object_metadata = self.store_data_only(data) - logging.info( - "FileHashStore - store_object: Successfully stored object for cid: %s", - object_metadata.id, + + # Wait for the pid to release if it's in use + while pid in self.object_locked_pids: + logging.debug( + "FileHashStore - store_object: %s is currently being stored. 
Waiting.", + pid, + ) + time.sleep(self.time_out_sec) + # Modify object_locked_pids consecutively + with self.object_lock: + logging.debug( + "FileHashStore - store_object: Adding pid: %s to object_locked_pids.", + pid, + ) + self.object_locked_pids.append(pid) + try: + logging.debug( + "FileHashStore - store_object: Attempting to store object for pid: %s", + pid, ) - else: object_metadata = self.store_and_validate_data( pid, data, @@ -466,18 +467,19 @@ def store_object( checksum_algorithm=checksum_algorithm_checked, file_size_to_validate=expected_object_size, ) + # TODO: Tag object afterwards and fix pytests logging.info( "FileHashStore - store_object: Successfully stored object for pid: %s", pid, ) - finally: - # Release pid - with self.object_lock: - logging.debug( - "FileHashStore - store_object: Removing pid: %s from object_locked_pids.", - pid, - ) - self.object_locked_pids.remove(pid) + finally: + # Release pid + with self.object_lock: + logging.debug( + "FileHashStore - store_object: Removing pid: %s from object_locked_pids.", + pid, + ) + self.object_locked_pids.remove(pid) return object_metadata @@ -502,7 +504,7 @@ def verify_object( checksum_algorithm, "checksum_algorithm", "verify_object" ) self._is_int_and_non_negative(expected_file_size) - if object_metadata is None or not isinstance(ObjectMetadata): + if object_metadata is None or not isinstance(object_metadata, ObjectMetadata): exception_string = ( "FileHashStore - verify_object: 'object_metadata' cannot be None." + " Must be a 'ObjectMetadata' object." 
@@ -523,6 +525,10 @@ def verify_object( tmp_file_size=object_metadata_file_size, file_size_to_validate=expected_file_size, ) + logging.info( + "FileHashStore - verify_object: object has been validated for cid: %s", + object_metadata.id, + ) def tag_object(self, pid, cid): logging.debug( @@ -1024,12 +1030,12 @@ def _move_and_get_checksums( pid_checksum = self.get_hex_digest(pid, self.algorithm) if pid_checksum == hex_digests.get(self.algorithm): # If the checksums match, return and log warning - warning_msg = ( + exception_string = ( "FileHashStore - _move_and_get_checksums: File moved" + f" successfully but unexpected issue encountered: {exception_string}", ) - logging.warning(warning_msg) - return + logging.error(exception_string) + raise err else: debug_msg = ( "FileHashStore - _move_and_get_checksums: Permanent file" @@ -1513,6 +1519,9 @@ def _validate_data_to_store(self, data): Args: data (string, path, stream): object to validate + + Returns: + boolean: True if valid. """ if ( not isinstance(data, str) @@ -1532,12 +1541,13 @@ def _validate_data_to_store(self, data): ) logging.error(exception_string) raise TypeError(exception_string) + return True def _validate_algorithms_and_checksum( self, additional_algorithm, checksum, checksum_algorithm ): - """Determines whether calling app has supplied the necessary arguments to validate - an object with a checksum value + """Determines whether caller has supplied the necessary arguments to validate + an object with a checksum value. 
Args: additional_algorithm: value of additional algorithm to calculate @@ -1641,24 +1651,32 @@ def _validate_object( logging.error(exception_string) raise ValueError(exception_string) if checksum_algorithm is not None and checksum is not None: - hex_digest_stored = hex_digests[checksum_algorithm] - if hex_digest_stored != checksum: + if checksum_algorithm not in hex_digests: exception_string = ( - "FileHashStore - _validate_object: Hex digest and checksum" - + f" do not match - file not stored for pid: {pid}. Algorithm:" - + f" {checksum_algorithm}. Checksum provided: {checksum} !=" - + f" HexDigest: {hex_digest_stored}." + f"FileHashStore - _validate_object: checksum_algorithm ({checksum_algorithm})" + + " cannot be found in the hex digests dictionary." ) - if pid is not None: - self.delete(entity, tmp_file_name) - exception_string_for_pid = ( - exception_string + f"Tmp file ({tmp_file_name}) deleted." + logging.error(exception_string) + raise KeyError(exception_string) + else: + hex_digest_stored = hex_digests[checksum_algorithm] + if hex_digest_stored != checksum: + exception_string = ( + "FileHashStore - _validate_object: Hex digest and checksum" + + f" do not match - file not stored for pid: {pid}. Algorithm:" + + f" {checksum_algorithm}. Checksum provided: {checksum} !=" + + f" HexDigest: {hex_digest_stored}." ) - logging.error(exception_string_for_pid) - raise ValueError(exception_string_for_pid) - else: - logging.error(exception_string) - raise ValueError(exception_string) + if pid is not None: + self.delete(entity, tmp_file_name) + exception_string_for_pid = ( + exception_string + f"Tmp file ({tmp_file_name}) deleted." 
+ ) + logging.error(exception_string_for_pid) + raise ValueError(exception_string_for_pid) + else: + logging.error(exception_string) + raise ValueError(exception_string) def _validate_references(self, pid, cid): """Verifies that the supplied pid and pid reference file and content have been diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 59c8b1ac..13ae988b 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -443,6 +443,105 @@ def test_move_and_get_checksums_file_size_raises_error(pids, store): input_stream.close() +def test_validate_object(pids, store): + """Test _validate_object succeeds given good arguments.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + hex_digests = object_metadata.hex_digests + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + # pylint: disable=W0212 + store._validate_object( + None, + checksum, + checksum_algorithm, + None, + hex_digests, + None, + expected_file_size, + expected_file_size, + ) + + +def test_validate_object_incorrect_size(pids, store): + """Test _validate_object throws exception when size is incorrect.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + hex_digests = object_metadata.hex_digests + checksum = hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + with pytest.raises(ValueError): + # pylint: disable=W0212 + store._validate_object( + None, + checksum, + checksum_algorithm, + None, + hex_digests, + None, + 1000, + 2000, + ) + + +def test_validate_object_incorrect_size_with_pid(pids, store): + """Test _validate_object deletes the 
expected tmp file if obj size does not match + and raises an exception.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + hex_digests = object_metadata.hex_digests + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + + objects_tmp_folder = store.objects + "/tmp" + # pylint: disable=W0212 + tmp_file = store._mktmpfile(objects_tmp_folder) + assert os.path.isfile(tmp_file.name) + with pytest.raises(ValueError): + store._validate_object( + "Test_Pid", + checksum, + checksum_algorithm, + None, + hex_digests, + tmp_file.name, + 1000, + expected_file_size, + ) + assert not os.path.isfile(tmp_file.name) + + +def test_validate_object_missing_key_in_hex_digests(pids, store): + """Test _validate_object throws exception when algorithm is not found in hex digests.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = "blake2s" + expected_file_size = object_metadata.obj_size + with pytest.raises(KeyError): + store.verify_object( + object_metadata, checksum, checksum_algorithm, expected_file_size + ) + + def test_write_to_tmp_file_and_get_hex_digests_additional_algo(store): """Test _write...hex_digests returns correct hex digests for additional algorithm.""" test_dir = "tests/testdata/" diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 566849da..8e815f92 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -126,15 +126,6 @@ def test_store_object_pid_empty_spaces(store): store.store_object(" ", path) 
-def test_store_object_pid_none(store): - """Test store object raises error when supplied with 'None' pid.""" - test_dir = "tests/testdata/" - pid = "jtao.1700.1" - path = test_dir + pid - with pytest.raises(ValueError): - store.store_object(None, path) - - def test_store_object_data_incorrect_type_none(store): """Test store object raises error when data is 'None'.""" pid = "jtao.1700.1" diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 3cfda1a9..b4871877 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -112,6 +112,22 @@ def test_tag_object_cid_refs_update(pids, store): assert "dou.test.1" in cid_ref_file_pid +def test_verify_object(pids, store): + """Test verify object succeeds given good arguments.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + store.verify_object( + object_metadata, checksum, checksum_algorithm, expected_file_size + ) + + def test_write_cid_refs_file(pids, store): """Test that write_cid_reference writes a reference file.""" for pid in pids.keys(): From 03f2b44dec89a1ff388fd2543938f0ca4678436b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 20 Nov 2023 10:14:23 -0800 Subject: [PATCH 060/420] Clean up code to improve clarity --- src/hashstore/filehashstore.py | 219 +++++++++++++------------ tests/test_filehashstore.py | 6 +- tests/test_filehashstore_references.py | 32 ++-- tests/test_hashstore.py | 31 ++-- 4 files changed, 148 insertions(+), 140 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 3b7db430..7f6dd806 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ 
-80,6 +80,7 @@ def __init__(self, properties=None): checked_properties[property_name] for property_name in self.property_required_keys ] + # TODO: Ensure that store algorithm in properties is compatible with HashStore # Check to see if a configuration is present in the given store path self.hashstore_configuration_yaml = prop_store_path + "/hashstore.yaml" @@ -418,7 +419,7 @@ def store_object( checksum_algorithm=None, expected_object_size=None, ): - if pid is None and self._validate_data_to_store(data): + if pid is None and self._validate_arg_data(data): logging.debug("FileHashStore - store_object: Request to store data only.") object_metadata = self.store_data_only(data) logging.info( @@ -431,12 +432,12 @@ def store_object( ) # Validate input parameters self._is_string_none_or_empty(pid, "pid", "store_object") - self._validate_data_to_store(data) + self._validate_arg_data(data) self._is_int_and_non_negative(expected_object_size) ( additional_algorithm_checked, checksum_algorithm_checked, - ) = self._validate_algorithms_and_checksum( + ) = self._validate_arg_algorithms_and_checksum( additional_algorithm, checksum, checksum_algorithm ) @@ -515,7 +516,7 @@ def verify_object( object_metadata_hex_digests = object_metadata.hex_digests object_metadata_file_size = object_metadata.obj_size checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) - self._validate_object( + self._validate_arg_object( pid=None, checksum=checksum, checksum_algorithm=checksum_algorithm_checked, @@ -596,7 +597,7 @@ def tag_object(self, pid, cid): shutil.move(cid_tmp_file_path, cid_ref_abs_path) # Ensure that the reference files have been written as expected # If there is an issue, client or user will have to manually review - self._validate_references(pid, cid) + self._verify_hashstore_references(pid, cid) logging.info( "FileHashStore - tag_object: Successfully tagged cid: %s with pid %s", @@ -639,8 +640,8 @@ def store_metadata(self, pid, metadata, format_id=None): ) # Validate 
input parameters self._is_string_none_or_empty(pid, "pid", "store_metadata") - checked_format_id = self._validate_format_id(format_id, "store_metadata") - self._validate_metadata_to_store(metadata) + checked_format_id = self._validate_arg_format_id(format_id, "store_metadata") + self._validate_arg_metadata(metadata) # Wait for the pid to release if it's in use while pid in self.metadata_locked_pids: @@ -713,7 +714,7 @@ def retrieve_metadata(self, pid, format_id=None): pid, ) self._is_string_none_or_empty(pid, "pid", "retrieve_metadata") - checked_format_id = self._validate_format_id(format_id, "retrieve_metadata") + checked_format_id = self._validate_arg_format_id(format_id, "retrieve_metadata") entity = "metadata" metadata_cid = self.computehash(pid + checked_format_id) @@ -788,7 +789,7 @@ def delete_metadata(self, pid, format_id=None): pid, ) self._is_string_none_or_empty(pid, "pid", "delete_metadata") - checked_format_id = self._validate_format_id(format_id, "delete_metadata") + checked_format_id = self._validate_arg_format_id(format_id, "delete_metadata") entity = "metadata" metadata_cid = self.computehash(pid + checked_format_id) @@ -995,7 +996,7 @@ def _move_and_get_checksums( # file and calculate the hex digests because the given checksum could be incorrect. if not os.path.isfile(abs_file_path): # Files are stored once and only once - self._validate_object( + self._validate_arg_object( pid, checksum, checksum_algorithm, @@ -1513,9 +1514,9 @@ def _mktmpmetadata(self, stream): # FileHashStore Utility & Supporting Methods - def _validate_data_to_store(self, data): - """Evaluates a data argument to ensure that it is either a string, path or - stream object before attempting to store it. + def _validate_arg_data(self, data): + """Checks a data argument to ensure that it is either a string, path or stream + object. 
Args: data (string, path, stream): object to validate @@ -1529,7 +1530,7 @@ def _validate_data_to_store(self, data): and not isinstance(data, io.BufferedIOBase) ): exception_string = ( - "FileHashStore - store_object: Data must be a path, string or buffered" + "FileHashStore - _validate_arg_data: Data must be a path, string or buffered" + f" stream type. Data type supplied: {type(data)}" ) logging.error(exception_string) @@ -1537,13 +1538,13 @@ def _validate_data_to_store(self, data): if isinstance(data, str): if data.replace(" ", "") == "": exception_string = ( - "FileHashStore - store_object: Data string cannot be empty." + "FileHashStore - _validate_arg_data: Data string cannot be empty." ) logging.error(exception_string) raise TypeError(exception_string) return True - def _validate_algorithms_and_checksum( + def _validate_arg_algorithms_and_checksum( self, additional_algorithm, checksum, checksum_algorithm ): """Determines whether caller has supplied the necessary arguments to validate @@ -1553,6 +1554,10 @@ def _validate_algorithms_and_checksum( additional_algorithm: value of additional algorithm to calculate checksum (string): value of checksum checksum_algorithm (string): algorithm of checksum + + Returns: + additional_algorithm_checked (string): hashlib compatible string or 'None' + checksum_algorithm_checked (string): hashlib compatible string or 'None' """ additional_algorithm_checked = None if additional_algorithm != self.algorithm and additional_algorithm is not None: @@ -1575,41 +1580,7 @@ def _validate_algorithms_and_checksum( checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) return additional_algorithm_checked, checksum_algorithm_checked - def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): - """Create the final list of hash algorithms to calculate - - Args: - additional_algorithm (string) - checksum_algorithm (string) - - Return: - algorithm_list_to_calculate (set): De-duplicated list of hash algorithms - 
""" - algorithm_list_to_calculate = self.default_algo_list - if checksum_algorithm is not None: - self.clean_algorithm(checksum_algorithm) - if checksum_algorithm in self.other_algo_list: - debug_additional_other_algo_str = ( - f"FileHashStore - _refine_algorithm_list: checksum algo: {checksum_algorithm}" - + " found in other_algo_lists, adding to list of algorithms to calculate." - ) - logging.debug(debug_additional_other_algo_str) - algorithm_list_to_calculate.append(checksum_algorithm) - if additional_algorithm is not None: - self.clean_algorithm(additional_algorithm) - if additional_algorithm in self.other_algo_list: - debug_additional_other_algo_str = ( - f"FileHashStore - _refine_algorithm_list: addit algo: {additional_algorithm}" - + " found in other_algo_lists, adding to list of algorithms to calculate." - ) - logging.debug(debug_additional_other_algo_str) - algorithm_list_to_calculate.append(additional_algorithm) - - # Remove duplicates - algorithm_list_to_calculate = set(algorithm_list_to_calculate) - return algorithm_list_to_calculate - - def _validate_object( + def _validate_arg_object( self, pid, checksum, @@ -1635,7 +1606,7 @@ def _validate_object( if file_size_to_validate is not None and file_size_to_validate > 0: if file_size_to_validate != tmp_file_size: exception_string = ( - "FileHashStore - _validate_object: Object file size calculated: " + "FileHashStore - _validate_arg_object: Object file size calculated: " + f" {tmp_file_size} does not match with expected size:" + f"{file_size_to_validate}." ) @@ -1653,8 +1624,8 @@ def _validate_object( if checksum_algorithm is not None and checksum is not None: if checksum_algorithm not in hex_digests: exception_string = ( - f"FileHashStore - _validate_object: checksum_algorithm ({checksum_algorithm})" - + " cannot be found in the hex digests dictionary." + "FileHashStore - _validate_arg_object: checksum_algorithm" + + f" ({checksum_algorithm}) cannot be found in the hex digests dictionary." 
) logging.error(exception_string) raise KeyError(exception_string) @@ -1662,7 +1633,7 @@ def _validate_object( hex_digest_stored = hex_digests[checksum_algorithm] if hex_digest_stored != checksum: exception_string = ( - "FileHashStore - _validate_object: Hex digest and checksum" + "FileHashStore - _validate_arg_object: Hex digest and checksum" + f" do not match - file not stored for pid: {pid}. Algorithm:" + f" {checksum_algorithm}. Checksum provided: {checksum} !=" + f" HexDigest: {hex_digest_stored}." @@ -1678,7 +1649,57 @@ def _validate_object( logging.error(exception_string) raise ValueError(exception_string) - def _validate_references(self, pid, cid): + def _validate_arg_metadata(self, metadata): + """Evaluates a metadata argument to ensure that it is either a string, path or + stream object before attempting to store it. + + Args: + metadata (string, path, stream): metadata to validate + """ + if isinstance(metadata, str): + if metadata.replace(" ", "") == "": + exception_string = ( + "FileHashStore - store_metadata: Given string path to" + + " metadata cannot be empty." + ) + logging.error(exception_string) + raise TypeError(exception_string) + if ( + not isinstance(metadata, str) + and not isinstance(metadata, Path) + and not isinstance(metadata, io.BufferedIOBase) + ): + exception_string = ( + "FileHashStore - store_metadata: Metadata must be a path or string" + + f" type, data type supplied: {type(metadata)}" + ) + logging.error(exception_string) + raise TypeError(exception_string) + + def _validate_arg_format_id(self, format_id, method): + """Determines the metadata namespace (format_id) to use for storing, + retrieving and deleting metadata. 
+ + Args: + format_id (string): Metadata namespace to review + method (string): Calling method for logging purposes + + Returns: + checked_format_id (string): Valid metadata namespace + """ + checked_format_id = None + if format_id is not None and format_id.replace(" ", "") == "": + exception_string = f"FileHashStore - {method}: Format_id cannot be empty." + logging.error(exception_string) + raise ValueError(exception_string) + elif format_id is None: + # Use default value set by hashstore config + checked_format_id = self.sysmeta_ns + else: + checked_format_id = format_id + return checked_format_id + + def _verify_hashstore_references(self, pid, cid): """Verifies that the supplied pid and pid reference file and content have been written successfully. @@ -1691,14 +1712,14 @@ def _validate_references(self, pid, cid): cid_ref_abs_path = self.get_refs_abs_path("cid", cid) if not os.path.exists(pid_ref_abs_path): exception_string = ( - "FileHashStore - _validate_references: Pid refs file missing: %s", + "FileHashStore - _verify_hashstore_references: Pid refs file missing: %s", pid_ref_abs_path, ) logging.error(exception_string) raise FileNotFoundError(exception_string) if not os.path.exists(cid_ref_abs_path): exception_string = ( - "FileHashStore - _validate_references: Cid refs file missing: %s", + "FileHashStore - _verify_hashstore_references: Cid refs file missing: %s", cid_ref_abs_path, ) logging.error(exception_string) @@ -1708,8 +1729,8 @@ def _validate_references(self, pid, cid): retrieved_cid = self.find_object(pid) if retrieved_cid != cid: exception_string = ( - f"FileHashStore - _validate_references: Pid refs file exists ({pid_ref_abs_path})" - + f" but cid ({cid}) does not match." + "FileHashStore - _verify_hashstore_references: Pid refs file exists" + + f" ({pid_ref_abs_path}) but cid ({cid}) does not match." 
) logging.error(exception_string) raise ValueError(exception_string) @@ -1722,61 +1743,45 @@ def _validate_references(self, pid, cid): pid_found = True if not pid_found: exception_string = ( - f"FileHashStore - _validate_references: Cid refs file exists ({cid_ref_abs_path})" - + f" but pid ({pid}) not found." + "FileHashStore - _verify_hashstore_references: Cid refs file exists" + + f" ({cid_ref_abs_path}) but pid ({pid}) not found." ) logging.error(exception_string) raise ValueError(exception_string) - def _validate_metadata_to_store(self, metadata): - """Evaluates a metadata argument to ensure that it is either a string, path or - stream object before attempting to store it. + def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): + """Create the final list of hash algorithms to calculate. Args: - metadata (string, path, stream): metadata to validate + additional_algorithm (string) + checksum_algorithm (string) + + Return: + algorithm_list_to_calculate (set): De-duplicated list of hash algorithms """ - if isinstance(metadata, str): - if metadata.replace(" ", "") == "": - exception_string = ( - "FileHashStore - store_metadata: Given string path to" - + " metadata cannot be empty." + algorithm_list_to_calculate = self.default_algo_list + if checksum_algorithm is not None: + self.clean_algorithm(checksum_algorithm) + if checksum_algorithm in self.other_algo_list: + debug_additional_other_algo_str = ( + f"FileHashStore - _refine_algorithm_list: checksum algo: {checksum_algorithm}" + + " found in other_algo_lists, adding to list of algorithms to calculate." 
) - logging.error(exception_string) - raise TypeError(exception_string) - if ( - not isinstance(metadata, str) - and not isinstance(metadata, Path) - and not isinstance(metadata, io.BufferedIOBase) - ): - exception_string = ( - "FileHashStore - store_metadata: Metadata must be a path or string" - + f" type, data type supplied: {type(metadata)}" - ) - logging.error(exception_string) - raise TypeError(exception_string) - - def _validate_format_id(self, format_id, method): - """Determines the metadata namespace (format_id) to use for storing, - retrieving and deleting metadata. - - Args: - format_id (string): Metadata namespace to review - method (string): Calling method for logging purposes + logging.debug(debug_additional_other_algo_str) + algorithm_list_to_calculate.append(checksum_algorithm) + if additional_algorithm is not None: + self.clean_algorithm(additional_algorithm) + if additional_algorithm in self.other_algo_list: + debug_additional_other_algo_str = ( + f"FileHashStore - _refine_algorithm_list: addit algo: {additional_algorithm}" + + " found in other_algo_lists, adding to list of algorithms to calculate." + ) + logging.debug(debug_additional_other_algo_str) + algorithm_list_to_calculate.append(additional_algorithm) - Returns: - checked_format_id (string): Valid metadata namespace - """ - checked_format_id = None - if format_id is not None and format_id.replace(" ", "") == "": - exception_string = f"FileHashStore - {method}: Format_id cannot be empty." 
- logging.error(exception_string) - raise ValueError(exception_string) - elif format_id is None: - # Use default value set by hashstore config - checked_format_id = self.sysmeta_ns - else: - checked_format_id = format_id - return checked_format_id + # Remove duplicates + algorithm_list_to_calculate = set(algorithm_list_to_calculate) + return algorithm_list_to_calculate def clean_algorithm(self, algorithm_string): """Format a string and ensure that it is supported and compatible with @@ -1810,7 +1815,7 @@ def clean_algorithm(self, algorithm_string): return cleaned_string def computehash(self, stream, algorithm=None): - """Compute the hash of a file-like object (or string) using :attr:`algorithm` by + """Compute the hash of a file-like object (or string) using the store algorthm by default or with optional algorithm supported. Args: diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 13ae988b..455b39fa 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -456,7 +456,7 @@ def test_validate_object(pids, store): checksum_algorithm = store.algorithm expected_file_size = object_metadata.obj_size # pylint: disable=W0212 - store._validate_object( + store._validate_arg_object( None, checksum, checksum_algorithm, @@ -481,7 +481,7 @@ def test_validate_object_incorrect_size(pids, store): checksum_algorithm = store.algorithm with pytest.raises(ValueError): # pylint: disable=W0212 - store._validate_object( + store._validate_arg_object( None, checksum, checksum_algorithm, @@ -512,7 +512,7 @@ def test_validate_object_incorrect_size_with_pid(pids, store): tmp_file = store._mktmpfile(objects_tmp_folder) assert os.path.isfile(tmp_file.name) with pytest.raises(ValueError): - store._validate_object( + store._validate_arg_object( "Test_Pid", checksum, checksum_algorithm, diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index b4871877..b009f62e 100644 --- a/tests/test_filehashstore_references.py 
+++ b/tests/test_filehashstore_references.py @@ -346,38 +346,38 @@ def test_delete_pid_refs_file_file_not_found(pids, store): store._delete_cid_refs_file(pid_ref_abs_path) -def test_validate_references_pid_refs_file_missing(pids, store): - """Test that validate_references throws exception when pid refs file is missing.""" +def test_verify_hashstore_references_pid_refs_file_missing(pids, store): + """Test _verify_hashstore_references throws exception when pid refs file is missing.""" for pid in pids.keys(): cid = pids[pid]["sha256"] with pytest.raises(FileNotFoundError): - store._validate_references(pid, cid) + store._verify_hashstore_references(pid, cid) -def test_validate_references_pid_refs_incorrect_cid(pids, store): - """Test that validate_references throws exception when pid refs file cid is incorrect.""" +def test_verify_hashstore_references_pid_refs_incorrect_cid(pids, store): + """Test _verify_hashstore_references throws exception when pid refs file cid is incorrect.""" for pid in pids.keys(): cid = pids[pid]["sha256"] pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) store._write_pid_refs_file(pid_ref_abs_path, "bad_cid") with pytest.raises(FileNotFoundError): - store._validate_references(pid, cid) + store._verify_hashstore_references(pid, cid) -def test_validate_references_cid_refs_file_missing(pids, store): - """Test that validate_references throws exception when cid refs file is missing.""" +def test_verify_hashstore_references_cid_refs_file_missing(pids, store): + """Test _verify_hashstore_references throws exception when cid refs file is missing.""" for pid in pids.keys(): cid = pids[pid]["sha256"] pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) store._write_pid_refs_file(pid_ref_abs_path, cid) with pytest.raises(FileNotFoundError): - store._validate_references(pid, cid) + store._verify_hashstore_references(pid, cid) -def 
test_validate_references_cid_refs_file_missing_pid(pids, store): - """Test that validate_references throws exception when cid refs file does not contain +def test_verify_hashstore_references_cid_refs_file_missing_pid(pids, store): + """Test _verify_hashstore_references throws exception when cid refs file does not contain the expected pid.""" for pid in pids.keys(): cid = pids[pid]["sha256"] @@ -388,11 +388,13 @@ def test_validate_references_cid_refs_file_missing_pid(pids, store): store._write_pid_refs_file(pid_ref_abs_path, cid) store._write_cid_refs_file(cid_ref_abs_path, "bad_pid") with pytest.raises(ValueError): - store._validate_references(pid, cid) + store._verify_hashstore_references(pid, cid) -def test_validate_references_cid_refs_file_with_multiple_refs_missing_pid(pids, store): - """Test that validate_references throws exception when cid refs file with multiple +def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pid( + pids, store +): + """Test _verify_hashstore_references throws exception when cid refs file with multiple references does not contain the expected pid.""" for pid in pids.keys(): cid = pids[pid]["sha256"] @@ -410,4 +412,4 @@ def test_validate_references_cid_refs_file_with_multiple_refs_missing_pid(pids, cid_reference_list.append(f"dou.test.{i}") with pytest.raises(ValueError): - store._validate_references(pid, cid) + store._verify_hashstore_references(pid, cid) diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index 953e0fac..b0b57ca8 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -43,7 +43,8 @@ def test_factory_get_hashstore_unsupported_module(factory): def test_factory_get_hashstore_filehashstore_unsupported_algorithm(factory): - """Check factory raises exception with store algorithm value that part of the default list.""" + """Check factory raises exception with store algorithm value that is not part of + the default list.""" module_name = "hashstore.filehashstore" class_name = 
"FileHashStore" @@ -58,20 +59,20 @@ def test_factory_get_hashstore_filehashstore_unsupported_algorithm(factory): factory.get_hashstore(module_name, class_name, properties) -def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory): - """Check factory raises exception with incorrectly formatted algorithm value.""" - module_name = "hashstore.filehashstore" - class_name = "FileHashStore" - - properties = { - "store_path": os.getcwd() + "/metacat/test", - "store_depth": 3, - "store_width": 2, - "store_algorithm": "sha256", - "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", - } - with pytest.raises(ValueError): - factory.get_hashstore(module_name, class_name, properties) +# def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory): +# """Check factory raises exception with incorrectly formatted algorithm value.""" +# module_name = "hashstore.filehashstore" +# class_name = "FileHashStore" + +# properties = { +# "store_path": os.getcwd() + "/metacat/test", +# "store_depth": 3, +# "store_width": 2, +# "store_algorithm": "dou_algo", +# "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", +# } +# with pytest.raises(ValueError): +# factory.get_hashstore(module_name, class_name, properties) def test_objectmetadata(): From 8536f1ea9fafe2ca3c1c9b8d8270cf231bad939d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 20 Nov 2023 13:16:57 -0800 Subject: [PATCH 061/420] Clean up code, review tests and fix minor bugs and revise docstrings and comments --- src/hashstore/filehashstore.py | 18 +- tests/conftest.py | 3 - tests/test_filehashstore.py | 280 ++++++++++++++++------------- tests/test_filehashstore_stream.py | 2 + tests/test_hashstore.py | 28 +-- tests/test_hashstore_client.py | 39 +++- 6 files changed, 216 insertions(+), 154 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 7f6dd806..7dce4653 100644 --- a/src/hashstore/filehashstore.py +++ 
b/src/hashstore/filehashstore.py @@ -80,7 +80,6 @@ def __init__(self, properties=None): checked_properties[property_name] for property_name in self.property_required_keys ] - # TODO: Ensure that store algorithm in properties is compatible with HashStore # Check to see if a configuration is present in the given store path self.hashstore_configuration_yaml = prop_store_path + "/hashstore.yaml" @@ -89,8 +88,6 @@ def __init__(self, properties=None): # If no exceptions thrown, FileHashStore ready for initialization logging.debug("FileHashStore - Initializing, properties verified.") self.root = prop_store_path - if not os.path.exists(self.root): - self.create_path(self.root) self.depth = prop_store_depth self.width = prop_store_width self.sysmeta_ns = prop_store_metadata_namespace @@ -154,7 +151,7 @@ def load_properties(self): # Get hashstore properties hashstore_yaml_dict = {} for key in self.property_required_keys: - if key is not "store_path": + if key != "store_path": hashstore_yaml_dict[key] = yaml_data[key] logging.debug( "FileHashStore - load_properties: Successfully retrieved 'hashstore.yaml' properties." @@ -211,6 +208,10 @@ def write_properties(self, properties): logging.error(exception_string) raise ValueError(exception_string) + # If given store path doesn't exist yet, create it. 
+ if not os.path.exists(self.root): + self.create_path(self.root) + # .yaml file to write hashstore_configuration_yaml = self._build_hashstore_yaml_string( store_depth, @@ -307,7 +308,7 @@ def _verify_hashstore_properties(self, properties, prop_store_path): hashstore_yaml_dict = self.load_properties() for key in self.property_required_keys: # 'store_path' is required to init HashStore but not saved in `hashstore.yaml` - if key is not "store_path": + if key != "store_path": supplied_key = properties[key] if key == "store_depth" or key == "store_width": supplied_key = int(properties[key]) @@ -1071,8 +1072,9 @@ def _write_to_tmp_file_and_get_hex_digests( self, stream, additional_algorithm=None, checksum_algorithm=None ): """Create a named temporary file from a `Stream` object and return its filename - and a dictionary of its algorithms and hex digests. If an additionak and/or checksum - algorithm is provided, it will add the respective hex digest to the dictionary. + and a dictionary of its algorithms and hex digests. If an additional and/or checksum + algorithm is provided, it will add the respective hex digest to the dictionary if + it is supported. Args: stream (io.BufferedReader): Object stream. 
@@ -2058,7 +2060,7 @@ def get_refs_abs_path(self, ref_type, hash_id): ref_file_abs_path (string): Path to the ref file for the given type and pid """ entity = "refs" - if ref_type is "pid": + if ref_type == "pid": hash_id = self.computehash(hash_id, self.algorithm) ref_file_abs_path = self.build_abs_path(entity, hash_id).replace( "/refs/", f"/refs/{ref_type}/" diff --git a/tests/conftest.py b/tests/conftest.py index 9b25c520..54af3542 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -47,7 +47,6 @@ def init_pids(): test_pids = { "doi:10.18739/A2901ZH2M": { "file_size_bytes": 39993, - "object_cid": "0d555ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e", "metadata_cid": "323e0799524cec4c7e14d31289cefd884b563b5c052f154a066de5ec1e477da7", "md5": "db91c910a3202478c8def1071c54aae5", "sha1": "1fe86e3c8043afa4c70857ca983d740ad8501ccd", @@ -58,7 +57,6 @@ def init_pids(): }, "jtao.1700.1": { "file_size_bytes": 8724, - "object_cid": "a8241925740d5dcd719596639e780e0a090c9d55a5d0372b0eaf55ed711d4edf", "metadata_cid": "ddf07952ef28efc099d10d8b682480f7d2da60015f5d8873b6e1ea75b4baf689", "md5": "f4ea2d07db950873462a064937197b0f", "sha1": "3d25436c4490b08a2646e283dada5c60e5c0539d", @@ -69,7 +67,6 @@ def init_pids(): }, "urn:uuid:1b35d0a5-b17a-423b-a2ed-de2b18dc367a": { "file_size_bytes": 18699, - "object_cid": "7f5cc18f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6", "metadata_cid": "9a2e08c666b728e6cbd04d247b9e556df3de5b2ca49f7c5a24868eb27cddbff2", "md5": "e1932fc75ca94de8b64f1d73dc898079", "sha1": "c6d2a69a3f5adaf478ba796c114f57b990cf7ad1", diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 455b39fa..d6ee134f 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1,4 +1,4 @@ -"""Test module for FileHashStore core, utility and supporting methods.""" +"""Test module for FileHashStore init, core, utility and supporting methods.""" import io import os from pathlib import Path @@ -6,6 +6,9 @@ from 
hashstore.filehashstore import FileHashStore +# Tests for HashStore Configuration and Related Methods + + def test_pids_length(pids): """Ensure test harness pids are present.""" assert len(pids) == 3 @@ -19,14 +22,16 @@ def test_init_directories_created(store): assert os.path.exists(store.metadata) assert os.path.exists(store.metadata + "/tmp") assert os.path.exists(store.refs) + assert os.path.exists(store.refs + "/tmp") assert os.path.exists(store.refs + "/pid") assert os.path.exists(store.refs + "/cid") def test_init_existing_store_incorrect_algorithm_format(store): - """Confirm that exception is thrown when store_algorithm is not a DataONE controlled value""" + """Confirm that exception is thrown when store_algorithm is not a DataONE + controlled value.""" properties = { - "store_path": store.root, + "store_path": store.root + "/incorrect_algo_format", "store_depth": 3, "store_width": 2, "store_algorithm": "sha256", @@ -37,7 +42,7 @@ def test_init_existing_store_incorrect_algorithm_format(store): def test_init_existing_store_correct_algorithm_format(store): - """Confirm second instance of HashStore with DataONE controlled value""" + """Confirm second instance of HashStore with DataONE controlled value.""" properties = { "store_path": store.root, "store_depth": 3, @@ -55,7 +60,8 @@ def test_init_write_properties_hashstore_yaml_exists(store): def test_init_with_existing_hashstore_mismatched_config_depth(store): - """Test init with existing HashStore raises ValueError with mismatching properties.""" + """Test init with existing HashStore raises a ValueError when supplied with + mismatching depth.""" properties = { "store_path": store.root, "store_depth": 1, @@ -68,7 +74,8 @@ def test_init_with_existing_hashstore_mismatched_config_depth(store): def test_init_with_existing_hashstore_mismatched_config_width(store): - """Test init with existing HashStore raises ValueError with mismatching properties.""" + """Test init with existing HashStore raises a ValueError when 
supplied with + mismatching width.""" properties = { "store_path": store.root, "store_depth": 3, @@ -81,7 +88,8 @@ def test_init_with_existing_hashstore_mismatched_config_width(store): def test_init_with_existing_hashstore_mismatched_config_algo(store): - """Test init with existing HashStore raises ValueError with mismatching properties.""" + """Test init with existing HashStore raises a ValueError when supplied with + mismatching default algorithm.""" properties = { "store_path": store.root, "store_depth": 3, @@ -94,7 +102,8 @@ def test_init_with_existing_hashstore_mismatched_config_algo(store): def test_init_with_existing_hashstore_mismatched_config_metadata_ns(store): - """Test init with existing HashStore raises ValueError with mismatching properties.""" + """Test init with existing HashStore raises a ValueError when supplied with + mismatching default name space.""" properties = { "store_path": store.root, "store_depth": 3, @@ -185,7 +194,7 @@ def test_validate_properties_key_value_is_none(store): def test_validate_properties_incorrect_type(store): - """Confirm exception raised when key missing in properties.""" + """Confirm exception raised when a bad properties value is given.""" properties = "etc/filehashstore/hashstore.yaml" with pytest.raises(ValueError): # pylint: disable=W0212 @@ -205,8 +214,11 @@ def test_set_default_algorithms_missing_yaml(store, pids): store._set_default_algorithms() +# Tests for FileHashStore Core Methods + + def test_store_and_validate_data_files_path(pids, store): - """Test store_and_validate_data objects with path object.""" + """Test store_and_validate_data objects with path object for the path arg.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -217,7 +229,7 @@ def test_store_and_validate_data_files_path(pids, store): def test_store_and_validate_data_files_string(pids, store): - """Test store_and_validate_data objects with string.""" + """Test store_and_validate_data objects with string for the 
path arg.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -228,7 +240,7 @@ def test_store_and_validate_data_files_string(pids, store): def test_store_and_validate_data_files_stream(pids, store): - """Test store_and_validate_data objects with stream.""" + """Test store_and_validate_data objects with stream for the path arg.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -290,7 +302,7 @@ def test_store_and_validate_data_additional_algorithm(pids, store): def test_store_and_validate_data_with_correct_checksums(pids, store): - """Check store_and_validate_data success with valid checksum and checksum algorithm supplied.""" + """Check store_and_validate_data with valid checksum and checksum algorithm supplied.""" test_dir = "tests/testdata/" for pid in pids.keys(): algo = "sha224" @@ -303,7 +315,7 @@ def test_store_and_validate_data_with_correct_checksums(pids, store): def test_store_and_validate_data_with_incorrect_checksum(pids, store): - """Check store_and_validate_data fails when bad checksum supplied.""" + """Check store_and_validate_data fails when a bad checksum supplied.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -423,8 +435,8 @@ def test_move_and_get_checksums_duplicates_raises_error(pids, store): assert store.count(entity) == 3 -def test_move_and_get_checksums_file_size_raises_error(pids, store): - """Test move and get checksum raises error with incorrect file size""" +def test_move_and_get_checksums_incorrect_file_size(pids, store): + """Test move and get checksum raises error with an incorrect file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): with pytest.raises(ValueError): @@ -443,107 +455,8 @@ def test_move_and_get_checksums_file_size_raises_error(pids, store): input_stream.close() -def test_validate_object(pids, store): - """Test _validate_object succeeds given good arguments.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = 
test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - cid = object_metadata.id - store.tag_object(pid, cid) - hex_digests = object_metadata.hex_digests - checksum = object_metadata.hex_digests.get(store.algorithm) - checksum_algorithm = store.algorithm - expected_file_size = object_metadata.obj_size - # pylint: disable=W0212 - store._validate_arg_object( - None, - checksum, - checksum_algorithm, - None, - hex_digests, - None, - expected_file_size, - expected_file_size, - ) - - -def test_validate_object_incorrect_size(pids, store): - """Test _validate_object throws exception when size is incorrect.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - cid = object_metadata.id - store.tag_object(pid, cid) - hex_digests = object_metadata.hex_digests - checksum = hex_digests.get(store.algorithm) - checksum_algorithm = store.algorithm - with pytest.raises(ValueError): - # pylint: disable=W0212 - store._validate_arg_object( - None, - checksum, - checksum_algorithm, - None, - hex_digests, - None, - 1000, - 2000, - ) - - -def test_validate_object_incorrect_size_with_pid(pids, store): - """Test _validate_object deletes the expected tmp file if obj size does not match - and raises an exception.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - cid = object_metadata.id - store.tag_object(pid, cid) - hex_digests = object_metadata.hex_digests - checksum = object_metadata.hex_digests.get(store.algorithm) - checksum_algorithm = store.algorithm - expected_file_size = object_metadata.obj_size - - objects_tmp_folder = store.objects + "/tmp" - # pylint: disable=W0212 - tmp_file = store._mktmpfile(objects_tmp_folder) - assert os.path.isfile(tmp_file.name) - with pytest.raises(ValueError): - store._validate_arg_object( - "Test_Pid", - checksum, - 
checksum_algorithm, - None, - hex_digests, - tmp_file.name, - 1000, - expected_file_size, - ) - assert not os.path.isfile(tmp_file.name) - - -def test_validate_object_missing_key_in_hex_digests(pids, store): - """Test _validate_object throws exception when algorithm is not found in hex digests.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - cid = object_metadata.id - store.tag_object(pid, cid) - checksum = object_metadata.hex_digests.get(store.algorithm) - checksum_algorithm = "blake2s" - expected_file_size = object_metadata.obj_size - with pytest.raises(KeyError): - store.verify_object( - object_metadata, checksum, checksum_algorithm, expected_file_size - ) - - def test_write_to_tmp_file_and_get_hex_digests_additional_algo(store): - """Test _write...hex_digests returns correct hex digests for additional algorithm.""" + """Test _write...hex_digests returns correct hex digests with an additional algorithm.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -561,7 +474,8 @@ def test_write_to_tmp_file_and_get_hex_digests_additional_algo(store): def test_write_to_tmp_file_and_get_hex_digests_checksum_algo(store): - """Test _write...hex_digests returns correct hex digests for checksum algorithm.""" + """Test _write...hex_digests returns correct hex digests when given a checksum_algorithm + is provided.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -579,7 +493,8 @@ def test_write_to_tmp_file_and_get_hex_digests_checksum_algo(store): def test_write_to_tmp_file_and_get_hex_digests_checksum_and_additional_algo(store): - """Test _write...hex_digests returns correct hex digests for checksum algorithm.""" + """Test _write...hex_digests returns correct hex digests when an additional and + checksum algorithm is provided.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -653,7 +568,7 @@ def 
test_write_to_tmp_file_and_get_hex_digests_hex_digests(pids, store): def test_write_to_tmp_file_and_get_hex_digests_tmpfile_object(pids, store): - """Test _write...hex_digests creates file successfully.""" + """Test _write...hex_digests returns a tmp file successfully.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -665,7 +580,7 @@ def test_write_to_tmp_file_and_get_hex_digests_tmpfile_object(pids, store): def test_write_to_tmp_file_and_get_hex_digests_with_unsupported_algorithm(pids, store): - """Test _write...hex_digests raises error when bad algorithm supplied.""" + """Test _write...hex_digests raises an exception when an unsupported algorithm supplied.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -694,27 +609,27 @@ def test_mktmpfile(store): def test_put_metadata_with_path(pids, store): - """Test put_metadata with path object.""" + """Test put_metadata with path object for the path arg.""" entity = "metadata" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - metadata_cid = store.store_metadata(pid, syspath, format_id) + metadata_cid = store.put_metadata(syspath, pid, format_id) assert store.exists(entity, metadata_cid) assert store.count(entity) == 3 def test_put_metadata_with_string(pids, store): - """Test_put metadata with string.""" + """Test_put metadata with string for the path arg.""" entity = "metadata" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = str(Path(test_dir) / filename) - metadata_cid = store.store_metadata(pid, syspath, format_id) + metadata_cid = store.put_metadata(syspath, pid, format_id) assert store.exists(entity, metadata_cid) assert store.count(entity) == 3 @@ -726,14 +641,13 @@ def 
test_put_metadata_cid(pids, store): for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - metadata_cid = store.store_metadata(pid, syspath, format_id) + metadata_cid = store.put_metadata(syspath, pid, format_id) assert metadata_cid == pids[pid]["metadata_cid"] def test_mktmpmetadata(pids, store): """Test mktmpmetadata creates tmpFile.""" test_dir = "tests/testdata/" - entity = "metadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename @@ -741,7 +655,117 @@ def test_mktmpmetadata(pids, store): # pylint: disable=W0212 tmp_name = store._mktmpmetadata(sys_stream) sys_stream.close() - assert store.exists(entity, tmp_name) + assert os.path.exists(tmp_name) + + +# Tests for FileHashStore Utility & Supporting Methods + + +def test_validate_arg_object(pids, store): + """Test _validate_arg_object succeeds given good arguments.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + hex_digests = object_metadata.hex_digests + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + # pylint: disable=W0212 + store._validate_arg_object( + None, + checksum, + checksum_algorithm, + None, + hex_digests, + None, + expected_file_size, + expected_file_size, + ) + + +def test_validate_arg_object_incorrect_size(pids, store): + """Test _validate_arg_object throws exception when size is incorrect.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + hex_digests = object_metadata.hex_digests + checksum = hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + with 
pytest.raises(ValueError): + # pylint: disable=W0212 + store._validate_arg_object( + None, + checksum, + checksum_algorithm, + None, + hex_digests, + None, + 1000, + 2000, + ) + + +def test_validate_arg_object_incorrect_size_with_pid(pids, store): + """Test _validate_arg_object deletes the expected tmp file if obj size does + not match and raises an exception.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + hex_digests = object_metadata.hex_digests + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + + objects_tmp_folder = store.objects + "/tmp" + # pylint: disable=W0212 + tmp_file = store._mktmpfile(objects_tmp_folder) + assert os.path.isfile(tmp_file.name) + with pytest.raises(ValueError): + store._validate_arg_object( + "Test_Pid", + checksum, + checksum_algorithm, + None, + hex_digests, + tmp_file.name, + 1000, + expected_file_size, + ) + assert not os.path.isfile(tmp_file.name) + + +def test_validate_arg_object_missing_key_in_hex_digests(pids, store): + """Test _validate_arg_object throws exception when algorithm is not found in hex digests.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = "blake2s" + expected_file_size = object_metadata.obj_size + with pytest.raises(KeyError): + # pylint: disable=W0212 + store._validate_arg_object( + None, + checksum, + checksum_algorithm, + None, + object_metadata.hex_digests, + None, + expected_file_size, + expected_file_size, + ) def test_clean_algorithm(store): @@ -849,7 +873,7 @@ def test_open_objects(pids, store): def 
test_delete_by_object_metadata_id(pids, store): - """Check objects are deleted after calling delete with hash address id.""" + """Check objects are deleted after calling delete with object id.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): diff --git a/tests/test_filehashstore_stream.py b/tests/test_filehashstore_stream.py index 94e6c412..29fa4d20 100644 --- a/tests/test_filehashstore_stream.py +++ b/tests/test_filehashstore_stream.py @@ -15,6 +15,7 @@ def test_stream_reads_file(pids): hashobj = hashlib.new("sha256") for data in obj_stream: hashobj.update(data) + obj_stream.close() hex_digest = hashobj.hexdigest() assert pids[pid]["sha256"] == hex_digest @@ -28,6 +29,7 @@ def test_stream_reads_path_object(pids): hashobj = hashlib.new("sha256") for data in obj_stream: hashobj.update(data) + obj_stream.close() hex_digest = hashobj.hexdigest() assert pids[pid]["sha256"] == hex_digest diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index b0b57ca8..e161c967 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -59,20 +59,20 @@ def test_factory_get_hashstore_filehashstore_unsupported_algorithm(factory): factory.get_hashstore(module_name, class_name, properties) -# def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory): -# """Check factory raises exception with incorrectly formatted algorithm value.""" -# module_name = "hashstore.filehashstore" -# class_name = "FileHashStore" - -# properties = { -# "store_path": os.getcwd() + "/metacat/test", -# "store_depth": 3, -# "store_width": 2, -# "store_algorithm": "dou_algo", -# "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", -# } -# with pytest.raises(ValueError): -# factory.get_hashstore(module_name, class_name, properties) +def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory): + """Check factory raises exception with incorrectly formatted algorithm value.""" + module_name = 
"hashstore.filehashstore" + class_name = "FileHashStore" + + properties = { + "store_path": os.getcwd() + "/metacat/test", + "store_depth": 3, + "store_width": 2, + "store_algorithm": "dou_algo", + "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + } + with pytest.raises(ValueError): + factory.get_hashstore(module_name, class_name, properties) def test_objectmetadata(): diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index d7ec6324..ede33f3a 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -41,6 +41,43 @@ def test_create_hashstore(tmp_path): assert os.path.exists(hashstore_client_python_log) +def test_get_checksum(capsys, store, pids): + """Test calculating a hash via HashStore through client.""" + client_directory = os.getcwd() + "/src/hashstore" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) + + client_module_path = f"{client_directory}/client.py" + test_store = store.root + get_checksum_opt = "-getchecksum" + client_pid_arg = f"-pid={pid}" + algo_arg = f"-algo={store.algorithm}" + chs_args = [ + client_module_path, + test_store, + get_checksum_opt, + client_pid_arg, + algo_arg, + ] + + # Add file path of HashStore to sys so modules can be discovered + sys.path.append(client_directory) + # Manually change sys args to simulate command line arguments + sys.argv = chs_args + client.main() + + capsystext = capsys.readouterr().out + expected_output = ( + f"guid/pid: {pid}\n" + + f"algorithm: {store.algorithm}\n" + + f"Checksum/Hex Digest: {pids[pid][store.algorithm]}\n" + ) + assert capsystext == expected_output + + def test_store_object(store, pids): """Test storing objects to HashStore through client.""" client_directory = os.getcwd() + "/src/hashstore" @@ -209,7 +246,7 @@ def test_delete_objects(pids, store): sys.argv = chs_args 
client.main() - assert not store.exists("objects", pids[pid]["object_cid"]) + assert not store.exists("objects", pids[pid][store.algorithm]) def test_delete_metadata(pids, store): From f993fb98ee4264028a38d7dc386acc11d25083d4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 20 Nov 2023 13:27:24 -0800 Subject: [PATCH 062/420] Add pytests for 'verify_object' --- src/hashstore/filehashstore.py | 9 ++-- tests/test_filehashstore_references.py | 62 ++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 5 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 7dce4653..08b38713 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -496,11 +496,6 @@ def verify_object( checksum_algorithm (string): Algorithm of checksum expected_file_size (int): Size of the tmp file """ - # TODO: Write tests - logging.debug( - "FileHashStore - verify_object: Called to verify object with id: %s", - object_metadata.id, - ) self._is_string_none_or_empty(checksum, "checksum", "verify_object") self._is_string_none_or_empty( checksum_algorithm, "checksum_algorithm", "verify_object" @@ -514,6 +509,10 @@ def verify_object( logging.error(exception_string) raise ValueError(exception_string) else: + logging.info( + "FileHashStore - verify_object: Called to verify object with id: %s", + object_metadata.id, + ) object_metadata_hex_digests = object_metadata.hex_digests object_metadata_file_size = object_metadata.obj_size checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index b009f62e..507ee509 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -128,6 +128,68 @@ def test_verify_object(pids, store): ) +def test_verify_object_exception_incorrect_object_metadata_type(pids, store): + """Test verify object raises exception when incorrect object is given to + object_metadata 
arg.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + with pytest.raises(ValueError): + store.verify_object( + "bad_type", checksum, checksum_algorithm, expected_file_size + ) + + +def test_verify_object_exception_incorrect_size(pids, store): + """Test verify object raises exception when incorrect size is supplied.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + with pytest.raises(ValueError): + store.verify_object(object_metadata, checksum, checksum_algorithm, 1000) + + +def test_verify_object_exception_incorrect_checksum(pids, store): + """Test verify object raises exception when incorrect checksum is supplied.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + with pytest.raises(ValueError): + store.verify_object( + object_metadata, "abc123", checksum_algorithm, expected_file_size + ) + + +def test_verify_object_exception_incorrect_checksum_algo(pids, store): + """Test verify object raises exception when incorrect algorithm is supplied.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + checksum = 
object_metadata.hex_digests.get(store.algorithm) + expected_file_size = object_metadata.obj_size + with pytest.raises(ValueError): + store.verify_object(object_metadata, checksum, "md2", expected_file_size) + + def test_write_cid_refs_file(pids, store): """Test that write_cid_reference writes a reference file.""" for pid in pids.keys(): From 687df49b9f1ad9b923be0ecef14c3f061d5a2103 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 20 Nov 2023 13:50:17 -0800 Subject: [PATCH 063/420] Refactor 'store_object' to also tag object when a pid is supplied and revise all pytests --- src/hashstore/filehashstore.py | 4 +++- src/hashstore/hashstore.py | 15 ++++++++------- tests/test_filehashstore.py | 8 -------- tests/test_filehashstore_interface.py | 15 ++++----------- tests/test_filehashstore_references.py | 16 ++++++++-------- tests/test_hashstore_client.py | 9 +++------ 6 files changed, 26 insertions(+), 41 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 08b38713..adce204e 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -421,6 +421,7 @@ def store_object( expected_object_size=None, ): if pid is None and self._validate_arg_data(data): + # If no pid is supplied, store the object only without tagging logging.debug("FileHashStore - store_object: Request to store data only.") object_metadata = self.store_data_only(data) logging.info( @@ -428,6 +429,7 @@ def store_object( object_metadata.id, ) else: + # Else the object will be stored and tagged logging.debug( "FileHashStore - store_object: Request to store object for pid: %s", pid ) @@ -469,7 +471,7 @@ def store_object( checksum_algorithm=checksum_algorithm_checked, file_size_to_validate=expected_object_size, ) - # TODO: Tag object afterwards and fix pytests + self.tag_object(pid, object_metadata.id) logging.info( "FileHashStore - store_object: Successfully stored object for pid: %s", pid, diff --git a/src/hashstore/hashstore.py 
b/src/hashstore/hashstore.py index 606a2496..1fc27ebb 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -28,9 +28,14 @@ def store_object( disk using a given stream. Upon successful storage, the method returns a ObjectMetadata object containing relevant file information, such as the file's id (which can be used to locate the object on disk), the file's size, and a hex digest dict of algorithms - and checksums. `store_object` also ensures that an object is stored only once by - synchronizing multiple calls and rejecting calls to store duplicate objects. Lastly, - it should call `tag_object` to create the references to allow the object to be found. + and checksums. Storing an object with `store_object` also tags an object (creating + references) which allow the object to be discoverable. + + `store_object` also ensures that an object is stored only once by synchronizing multiple + calls and rejecting calls to store duplicate objects. Note, calling `store_object` without + a pid is a possibility, but should only store the object without tagging the object. + It is then the caller's responsibility to finalize the process by calling `tag_object` + after veriftying the correct object is stored. The file's id is determined by calculating the object's content identifier based on the store's default algorithm, which is also used as the permanent address of the file. @@ -49,10 +54,6 @@ def store_object( `store_object` validates the object to ensure it matches the given arguments before moving the file to its permanent address. - Note, calling `store_object` is a possibility, but should only store the object - without calling `tag_object`. It is the caller's responsibility to finalize the - process by calling `tag_object` after veriftying the correct object is stored. - Args: pid (string): Authority-based identifier. data (mixed): String or path to object. 
diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index d6ee134f..ce04ecec 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -667,8 +667,6 @@ def test_validate_arg_object(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(data=path) - cid = object_metadata.id - store.tag_object(pid, cid) hex_digests = object_metadata.hex_digests checksum = object_metadata.hex_digests.get(store.algorithm) checksum_algorithm = store.algorithm @@ -692,8 +690,6 @@ def test_validate_arg_object_incorrect_size(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(data=path) - cid = object_metadata.id - store.tag_object(pid, cid) hex_digests = object_metadata.hex_digests checksum = hex_digests.get(store.algorithm) checksum_algorithm = store.algorithm @@ -718,8 +714,6 @@ def test_validate_arg_object_incorrect_size_with_pid(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(data=path) - cid = object_metadata.id - store.tag_object(pid, cid) hex_digests = object_metadata.hex_digests checksum = object_metadata.hex_digests.get(store.algorithm) checksum_algorithm = store.algorithm @@ -749,8 +743,6 @@ def test_validate_arg_object_missing_key_in_hex_digests(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(data=path) - cid = object_metadata.id - store.tag_object(pid, cid) checksum = object_metadata.hex_digests.get(store.algorithm) checksum_algorithm = "blake2s" expected_file_size = object_metadata.obj_size diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 8e815f92..ec418b9c 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -518,7 +518,6 @@ def test_find_object(pids, store): for pid in pids.keys(): path = 
test_dir + pid.replace("/", "_") object_metadata = store.store_object(pid, path) - store.tag_object(pid, object_metadata.id) cid = store.find_object(pid) assert cid == object_metadata.hex_digests.get("sha256") @@ -703,7 +702,6 @@ def test_retrieve_object(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename object_metadata = store.store_object(pid, path) - store.tag_object(pid, object_metadata.id) store.store_metadata(pid, syspath, format_id) obj_stream = store.retrieve_object(pid) sha256_hex = store.computehash(obj_stream) @@ -801,8 +799,7 @@ def test_delete_objects(pids, store): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - object_metadata = store.store_object(pid, path) - store.tag_object(pid, object_metadata.id) + _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) assert store.count(entity) == 0 @@ -816,8 +813,7 @@ def test_delete_objects_pid_refs_file(pids, store): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - object_metadata = store.store_object(pid, path) - store.tag_object(pid, object_metadata.id) + _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) pid_refs_file_path = store.get_refs_abs_path("pid", pid) @@ -833,9 +829,8 @@ def test_delete_objects_cid_refs_file(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename object_metadata = store.store_object(pid, path) - cid = object_metadata.id - store.tag_object(pid, cid) _metadata_cid = store.store_metadata(pid, syspath, format_id) + cid = object_metadata.id store.delete_object(pid) cid_refs_file_path = store.get_refs_abs_path("cid", cid) assert not os.path.exists(cid_refs_file_path) @@ -851,7 +846,6 @@ def 
test_delete_objects_cid_refs_file_with_pid_refs_remaining(pids, store): syspath = Path(test_dir) / filename object_metadata = store.store_object(pid, path) cid = object_metadata.id - store.tag_object(pid, cid) cid_refs_abs_path = store.get_refs_abs_path("cid", cid) # pylint: disable=W0212 store._update_cid_refs(cid_refs_abs_path, "dou.test.1") @@ -937,8 +931,7 @@ def test_get_hex_digest(store): path = test_dir + pid filename = pid + ".xml" syspath = Path(test_dir) / filename - object_metadata = store.store_object(pid, path) - store.tag_object(pid, object_metadata.id) + _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) sha3_256_hex_digest = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 507ee509..e4974bcc 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -10,7 +10,7 @@ def test_tag_object(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) + object_metadata = store.store_object(None, path) object_tagged = store.tag_object(pid, object_metadata.id) assert object_tagged @@ -20,7 +20,7 @@ def test_tag_object_pid_refs_file(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) + object_metadata = store.store_object(None, path) store.tag_object(pid, object_metadata.id) pid_refs_file_path = store.get_refs_abs_path("pid", pid) assert os.path.exists(pid_refs_file_path) @@ -31,7 +31,7 @@ def test_tag_object_pid_refs_file_exists(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) + object_metadata = store.store_object(None, path) cid = 
object_metadata.id store.tag_object(pid, cid) pid_refs_file_path = store.get_refs_abs_path("pid", pid) @@ -47,7 +47,7 @@ def test_tag_object_pid_refs_file_content(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) + object_metadata = store.store_object(None, path) store.tag_object(pid, object_metadata.id) pid_refs_file_path = store.get_refs_abs_path("pid", pid) with open(pid_refs_file_path, "r", encoding="utf8") as f: @@ -60,7 +60,7 @@ def test_tag_object_cid_refs_file(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) + object_metadata = store.store_object(None, path) cid = object_metadata.id store.tag_object(pid, object_metadata.id) cid_refs_file_path = store.get_refs_abs_path("cid", cid) @@ -72,7 +72,7 @@ def test_tag_object_cid_refs_file_content(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) + object_metadata = store.store_object(None, path) store.tag_object(pid, object_metadata.id) cid_refs_file_path = store.get_refs_abs_path("cid", object_metadata.id) with open(cid_refs_file_path, "r", encoding="utf8") as f: @@ -86,7 +86,7 @@ def test_tag_object_cid_refs_file_exists(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) + object_metadata = store.store_object(None, path) store.tag_object(pid, object_metadata.id) another_cid = "dou.test.1" with pytest.raises(FileExistsError): @@ -101,7 +101,7 @@ def test_tag_object_cid_refs_update(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) + object_metadata = store.store_object(None, path) cid = 
object_metadata.id store.tag_object(pid, cid) store.tag_object("dou.test.1", cid) diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index ede33f3a..1d61fd17 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -47,8 +47,7 @@ def test_get_checksum(capsys, store, pids): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - store.tag_object(pid, object_metadata.id) + store.store_object(pid, path) client_module_path = f"{client_directory}/client.py" test_store = store.root @@ -145,8 +144,7 @@ def test_retrieve_objects(capsys, pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - store.tag_object(pid, object_metadata.id) + store.store_object(pid, path) client_module_path = f"{client_directory}/client.py" test_store = store.root @@ -226,8 +224,7 @@ def test_delete_objects(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - store.tag_object(pid, object_metadata.id) + store.store_object(pid, path) client_module_path = f"{client_directory}/client.py" test_store = store.root From d8fe8620ae5d1e9246c9514dfa563baad5e05e9e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 6 Dec 2023 09:50:32 -0800 Subject: [PATCH 064/420] Clean up 'filehashstore' class for doc strings, typos and syntax formatting --- .gitignore | 1 + src/hashstore/filehashstore.py | 25 +++++++++++-------------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index 09ccd077..c2a663ae 100644 --- a/.gitignore +++ b/.gitignore @@ -131,6 +131,7 @@ venv/ ENV/ env.bak/ venv.bak/ +.idea # Spyder project settings .spyderproject diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 
adce204e..f74a6072 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -201,9 +201,8 @@ def write_properties(self, properties): else: exception_string = ( f"FileHashStore - write_properties: algorithm supplied ({store_algorithm})" - + " cannot be used as default for HashStore. Must be one of:" - + " MD5, SHA-1, SHA-256, SHA-384, SHA-512 which are DataONE" - + " controlled algorithm values" + f" cannot be used as default for HashStore. Must be one of: {', '.join(accepted_store_algorithms)}" + f" which are DataONE controlled algorithm values" ) logging.error(exception_string) raise ValueError(exception_string) @@ -238,7 +237,6 @@ def _build_hashstore_yaml_string( """Build a YAML string representing the configuration for a HashStore. Args: - store_path (str): Path to the HashStore directory. store_depth (int): Depth when sharding an object's hex digest. store_width (int): Width of directories when sharding an object's hex digest. store_algorithm (str): Hash algorithm used for calculating the object's hex digest. @@ -374,7 +372,7 @@ def _validate_properties(self, properties): def _set_default_algorithms(self): """Set the default algorithms to calculate when storing objects.""" - def lookup_algo(algo): + def lookup_algo(algo_to_translate): """Translate DataONE controlled algorithms to python hashlib values: https://dataoneorg.github.io/api-documentation/apis/Types.html#Types.ChecksumAlgorithm """ @@ -385,7 +383,7 @@ def lookup_algo(algo): "SHA-384": "sha384", "SHA-512": "sha512", } - return dataone_algo_translation[algo] + return dataone_algo_translation[algo_to_translate] if not os.path.exists(self.hashstore_configuration_yaml): exception_string = ( @@ -490,7 +488,7 @@ def store_object( def verify_object( self, object_metadata, checksum, checksum_algorithm, expected_file_size ): - """Confirms that a object_metadata's content is equal to the given values. + """Confirms that an object_metadata's content is equal to the given values. 
Args: object_metadata (ObjectMetadata): object_metadata object @@ -1067,7 +1065,7 @@ def _move_and_get_checksums( self.delete(entity, tmp_file_name) raise FileExistsError(exception_string) - return (object_cid, tmp_file_size, hex_digests) + return object_cid, tmp_file_size, hex_digests def _write_to_tmp_file_and_get_hex_digests( self, stream, additional_algorithm=None, checksum_algorithm=None @@ -1079,7 +1077,7 @@ def _write_to_tmp_file_and_get_hex_digests( Args: stream (io.BufferedReader): Object stream. - algorithm (string): Algorithm of additional hex digest to generate + additional_algorithm (string): Algorithm of additional hex digest to generate checksum_algorithm (string): Algorithm of additional checksum algo to generate Returns: @@ -1195,7 +1193,7 @@ def delete_tmp_file(): def _write_cid_refs_file(self, path, pid): """Write the cid reference file in the supplied path to a file. A reference file contains every pid that references a cid each on its own line. This method will - only write into an empty file, and will not write over an an existing one. + only write into an empty file, and will not write over an existing one. Args: path (string): Path of file to be written into @@ -1483,11 +1481,10 @@ def put_metadata(self, metadata, pid, format_id): raise FileNotFoundError(exception_string) def _mktmpmetadata(self, stream): - """Create a named temporary file with `stream` (metadata) and `format_id`. + """Create a named temporary file with `stream` (metadata). Args: stream (io.BufferedReader): Metadata stream. - format_id (string): Format of metadata. Returns: tmp.name (string): Path/name of temporary file created and written into. @@ -1891,8 +1888,8 @@ def compact(items): # This creates a list of `depth` number of tokens with width # `width` from the first part of the id plus the remainder. 
hierarchical_list = compact( - [digest[i * self.width : self.width * (i + 1)] for i in range(self.depth)] - + [digest[self.depth * self.width :]] + [digest[i * self.width: self.width * (i + 1)] for i in range(self.depth)] + + [digest[self.depth * self.width:]] ) return hierarchical_list From caa9d7bf54ebff8058beba3de2d9415d247b44ef Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 6 Dec 2023 09:58:50 -0800 Subject: [PATCH 065/420] Remove redundant method '_validate_arg_metadata' and refactor 'store_metdata()' --- src/hashstore/filehashstore.py | 33 +++------------------------------ 1 file changed, 3 insertions(+), 30 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index f74a6072..deb4e407 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -641,7 +641,7 @@ def store_metadata(self, pid, metadata, format_id=None): # Validate input parameters self._is_string_none_or_empty(pid, "pid", "store_metadata") checked_format_id = self._validate_arg_format_id(format_id, "store_metadata") - self._validate_arg_metadata(metadata) + self._validate_arg_data(metadata) # Wait for the pid to release if it's in use while pid in self.metadata_locked_pids: @@ -1649,33 +1649,6 @@ def _validate_arg_object( logging.error(exception_string) raise ValueError(exception_string) - def _validate_arg_metadata(self, metadata): - """Evaluates a metadata argument to ensure that it is either a string, path or - stream object before attempting to store it. - - Args: - metadata (string, path, stream): metadata to validate - """ - if isinstance(metadata, str): - if metadata.replace(" ", "") == "": - exception_string = ( - "FileHashStore - store_metadata: Given string path to" - + " metadata cannot be empty." 
- ) - logging.error(exception_string) - raise TypeError(exception_string) - if ( - not isinstance(metadata, str) - and not isinstance(metadata, Path) - and not isinstance(metadata, io.BufferedIOBase) - ): - exception_string = ( - "FileHashStore - store_metadata: Metadata must be a path or string" - + f" type, data type supplied: {type(metadata)}" - ) - logging.error(exception_string) - raise TypeError(exception_string) - def _validate_arg_format_id(self, format_id, method): """Determines the metadata namespace (format_id) to use for storing, retrieving and deleting metadata. @@ -1888,8 +1861,8 @@ def compact(items): # This creates a list of `depth` number of tokens with width # `width` from the first part of the id plus the remainder. hierarchical_list = compact( - [digest[i * self.width: self.width * (i + 1)] for i in range(self.depth)] - + [digest[self.depth * self.width:]] + [digest[i * self.width : self.width * (i + 1)] for i in range(self.depth)] + + [digest[self.depth * self.width :]] ) return hierarchical_list From 5cde868fe81e36c4fd5891342b6ea8ef0444d463 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 6 Dec 2023 10:07:19 -0800 Subject: [PATCH 066/420] Refactor '_is_string_none_or_empty' to call .strip() instead of .replace() to account for spaces, tabs and newline characters --- src/hashstore/filehashstore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index deb4e407..58ccad60 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2096,7 +2096,7 @@ def _is_string_none_or_empty(string, arg, method): arg (string): Name of argument to check method (string): Calling method for logging purposes """ - if string is None or string.replace(" ", "") == "": + if string is None or string.strip() == "": exception_string = ( f"FileHashStore - {method}: {arg} cannot be None" + f" or empty, {arg}: {string}." 
From 9db5a26d8dcf2e725781b674fa982c2b1ebb74b6 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 6 Dec 2023 10:17:37 -0800 Subject: [PATCH 067/420] Rename method '_is_string_none_or_empty' to '_validate_string' for accuracy and refactor accordingly --- src/hashstore/filehashstore.py | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 58ccad60..71ede0bc 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -432,7 +432,7 @@ def store_object( "FileHashStore - store_object: Request to store object for pid: %s", pid ) # Validate input parameters - self._is_string_none_or_empty(pid, "pid", "store_object") + self._validate_string(pid, "pid", "store_object") self._validate_arg_data(data) self._is_int_and_non_negative(expected_object_size) ( @@ -496,10 +496,8 @@ def verify_object( checksum_algorithm (string): Algorithm of checksum expected_file_size (int): Size of the tmp file """ - self._is_string_none_or_empty(checksum, "checksum", "verify_object") - self._is_string_none_or_empty( - checksum_algorithm, "checksum_algorithm", "verify_object" - ) + self._validate_string(checksum, "checksum", "verify_object") + self._validate_string(checksum_algorithm, "checksum_algorithm", "verify_object") self._is_int_and_non_negative(expected_file_size) if object_metadata is None or not isinstance(object_metadata, ObjectMetadata): exception_string = ( @@ -537,8 +535,8 @@ def tag_object(self, pid, cid): cid, pid, ) - self._is_string_none_or_empty(pid, "pid", "tag_object") - self._is_string_none_or_empty(cid, "cid", "tag_object") + self._validate_string(pid, "pid", "tag_object") + self._validate_string(cid, "cid", "tag_object") # Wait for the cid to release if it's being tagged while cid in self.reference_locked_cids: logging.debug( @@ -618,7 +616,7 @@ def find_object(self, pid): logging.debug( "FileHashStore - find_object: Request to find 
object for for pid: %s", pid ) - self._is_string_none_or_empty(pid, "pid", "find_object") + self._validate_string(pid, "pid", "find_object") pid_ref_abs_path = self.get_refs_abs_path("pid", pid) if not os.path.exists(pid_ref_abs_path): @@ -639,7 +637,7 @@ def store_metadata(self, pid, metadata, format_id=None): "FileHashStore - store_metadata: Request to store metadata for pid: %s", pid ) # Validate input parameters - self._is_string_none_or_empty(pid, "pid", "store_metadata") + self._validate_string(pid, "pid", "store_metadata") checked_format_id = self._validate_arg_format_id(format_id, "store_metadata") self._validate_arg_data(metadata) @@ -685,7 +683,7 @@ def retrieve_object(self, pid): "FileHashStore - retrieve_object: Request to retrieve object for pid: %s", pid, ) - self._is_string_none_or_empty(pid, "pid", "retrieve_object") + self._validate_string(pid, "pid", "retrieve_object") object_cid = self.find_object(pid) entity = "objects" @@ -713,7 +711,7 @@ def retrieve_metadata(self, pid, format_id=None): "FileHashStore - retrieve_metadata: Request to retrieve metadata for pid: %s", pid, ) - self._is_string_none_or_empty(pid, "pid", "retrieve_metadata") + self._validate_string(pid, "pid", "retrieve_metadata") checked_format_id = self._validate_arg_format_id(format_id, "retrieve_metadata") entity = "metadata" @@ -737,7 +735,7 @@ def delete_object(self, pid): logging.debug( "FileHashStore - delete_object: Request to delete object for pid: %s", pid ) - self._is_string_none_or_empty(pid, "pid", "delete_object") + self._validate_string(pid, "pid", "delete_object") cid = self.find_object(pid) while cid in self.reference_locked_cids: @@ -788,7 +786,7 @@ def delete_metadata(self, pid, format_id=None): "FileHashStore - delete_metadata: Request to delete metadata for pid: %s", pid, ) - self._is_string_none_or_empty(pid, "pid", "delete_metadata") + self._validate_string(pid, "pid", "delete_metadata") checked_format_id = self._validate_arg_format_id(format_id, 
"delete_metadata") entity = "metadata" @@ -806,8 +804,8 @@ def get_hex_digest(self, pid, algorithm): "FileHashStore - get_hex_digest: Request to get hex digest for object with pid: %s", pid, ) - self._is_string_none_or_empty(pid, "pid", "get_hex_digest") - self._is_string_none_or_empty(algorithm, "algorithm", "get_hex_digest") + self._validate_string(pid, "pid", "get_hex_digest") + self._validate_string(algorithm, "algorithm", "get_hex_digest") entity = "objects" algorithm = self.clean_algorithm(algorithm) @@ -1565,13 +1563,13 @@ def _validate_arg_algorithms_and_checksum( additional_algorithm_checked = self.clean_algorithm(additional_algorithm) checksum_algorithm_checked = None if checksum is not None: - self._is_string_none_or_empty( + self._validate_string( checksum_algorithm, "checksum_algorithm", "validate_checksum_args (store_object)", ) if checksum_algorithm is not None: - self._is_string_none_or_empty( + self._validate_string( checksum, "checksum", "validate_checksum_args (store_object)", @@ -2088,7 +2086,7 @@ def _is_int_and_non_negative(file_size): raise ValueError(exception_string) @staticmethod - def _is_string_none_or_empty(string, arg, method): + def _validate_string(string, arg, method): """Checks whether a string is None or empty and throws an exception if so. 
Args: From da78588ebecdce087139ce2229401da2a720744b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 6 Dec 2023 10:27:51 -0800 Subject: [PATCH 068/420] Remove redundant instance check in '_is_int_and_non_negative' method --- src/hashstore/filehashstore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 71ede0bc..656d02c6 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2078,7 +2078,7 @@ def _is_int_and_non_negative(file_size): ) logging.error(exception_string) raise TypeError(exception_string) - if file_size < 1 or not isinstance(file_size, int): + if file_size < 1: exception_string = ( "FileHashStore - _is_int_and_non_negative: size given must be > 0" ) From 2580808d80e0a13c42efb969931629f6cec210d9 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 6 Dec 2023 11:21:40 -0800 Subject: [PATCH 069/420] Revise logging message accuracy in '_validate_arg_algorithms_and_checksum' --- src/hashstore/filehashstore.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 656d02c6..b15dbaae 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1566,13 +1566,13 @@ def _validate_arg_algorithms_and_checksum( self._validate_string( checksum_algorithm, "checksum_algorithm", - "validate_checksum_args (store_object)", + "_validate_arg_algorithms_and_checksum (store_object)", ) if checksum_algorithm is not None: self._validate_string( checksum, "checksum", - "validate_checksum_args (store_object)", + "_validate_arg_algorithms_and_checksum (store_object)", ) # Set checksum_algorithm checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) From 734074771e3860ce4b889a53a4f5ccdf562ad231 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 6 Dec 2023 12:01:53 -0800 Subject: [PATCH 070/420] Add 'verify_object' abstract method to 'HashStore' 
interface --- src/hashstore/hashstore.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 1fc27ebb..ab071551 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -84,6 +84,20 @@ def tag_object(self, pid, cid): """ raise NotImplementedError() + @abstractmethod + def verify_object( + self, object_metadata, checksum, checksum_algorithm, expected_file_size + ): + """Confirms that an object_metadata's content is equal to the given values. + + Args: + object_metadata (ObjectMetadata): object_metadata object + checksum (string): Value of checksum + checksum_algorithm (string): Algorithm of checksum + expected_file_size (int): Size of the tmp file + """ + raise NotImplementedError() + @abstractmethod def find_object(self, pid): """The `find_object` method checks whether an object referenced by a pid exists From f9a96d7497d73ae7a4b6031714f84016f734ce95 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 6 Dec 2023 12:11:34 -0800 Subject: [PATCH 071/420] Clean up code --- src/hashstore/client.py | 2 +- src/hashstore/filehashstore.py | 12 ++---------- src/hashstore/hashstore.py | 11 ++++++----- tests/test_filehashstore_interface.py | 8 ++++---- tests/test_hashstore_client.py | 2 -- 5 files changed, 13 insertions(+), 22 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index c1e2e4b6..dac73fcf 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -392,7 +392,7 @@ def validate_object(self, obj_tuple): obj_db_checksum = obj_tuple[2] with self.hashstore.retrieve_object(pid_guid) as obj_stream: - computed_digest = self.hashstore.computehash(obj_stream, algo) + computed_digest = self.hashstore.get_hex_digest(obj_stream, algo) obj_stream.close() if computed_digest != obj_db_checksum: diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index b15dbaae..6e6c11bb 100644 --- a/src/hashstore/filehashstore.py +++ 
b/src/hashstore/filehashstore.py @@ -488,14 +488,6 @@ def store_object( def verify_object( self, object_metadata, checksum, checksum_algorithm, expected_file_size ): - """Confirms that an object_metadata's content is equal to the given values. - - Args: - object_metadata (ObjectMetadata): object_metadata object - checksum (string): Value of checksum - checksum_algorithm (string): Algorithm of checksum - expected_file_size (int): Size of the tmp file - """ self._validate_string(checksum, "checksum", "verify_object") self._validate_string(checksum_algorithm, "checksum_algorithm", "verify_object") self._is_int_and_non_negative(expected_file_size) @@ -1859,8 +1851,8 @@ def compact(items): # This creates a list of `depth` number of tokens with width # `width` from the first part of the id plus the remainder. hierarchical_list = compact( - [digest[i * self.width : self.width * (i + 1)] for i in range(self.depth)] - + [digest[self.depth * self.width :]] + [digest[i * self.width: self.width * (i + 1)] for i in range(self.depth)] + + [digest[self.depth * self.width:]] ) return hierarchical_list diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index ab071551..d1ff440c 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -2,6 +2,7 @@ from abc import ABC, abstractmethod from collections import namedtuple import importlib.metadata +import importlib.util class HashStore(ABC): @@ -256,12 +257,12 @@ def get_hashstore(module_name, class_name, properties=None): class ObjectMetadata(namedtuple("ObjectMetadata", ["id", "obj_size", "hex_digests"])): - """File address containing file's path on disk and its content hash ID. + """Represents metadata associated with an object. - Args: - ab_id (str): Hash ID (hexdigest) of file contents. - obj_size (bytes): Size of the object - hex_digests (dict, optional): A list of hex digests to validate objects + Attributes: + id (str): A unique identifier for the object (Hash ID, hex digest). 
+ obj_size (bytes): The size of the object in bytes. + hex_digests (list, optional): A list of hex digests to validate objects (md5, sha1, sha256, sha384, sha512) """ diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index ec418b9c..c06c23d1 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -398,10 +398,10 @@ def test_store_object_duplicates_threads(pids, store): file_exists_error_flag = False - def store_object_wrapper(pid, path): + def store_object_wrapper(obj_pid, obj_path): nonlocal file_exists_error_flag try: - store.store_object(pid, path) # Call store_object inside the thread + store.store_object(obj_pid, obj_path) # Call store_object inside the thread except FileExistsError: file_exists_error_flag = True @@ -444,10 +444,10 @@ def test_store_object_interrupt_process(store): interrupt_flag = False - def store_object_wrapper(pid, path): + def store_object_wrapper(obj_pid, path): print(store.root) while not interrupt_flag: - store.store_object(pid, path) # Call store_object inside the thread + store.store_object(obj_pid, path) # Call store_object inside the thread # Create/start the thread thread = threading.Thread(target=store_object_wrapper, args=(pid, file_path)) diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 1d61fd17..96c9ad45 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -82,7 +82,6 @@ def test_store_object(store, pids): client_directory = os.getcwd() + "/src/hashstore" test_dir = "tests/testdata/" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") client_module_path = f"{client_directory}/client.py" test_store = store.root store_object_opt = "-storeobject" @@ -111,7 +110,6 @@ def test_store_metadata(store, pids): test_dir = "tests/testdata/" namespace = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") filename = 
pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename client_module_path = f"{client_directory}/client.py" From baf374a5ea86e2583508b8a083b1d04910b7b1ef Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 29 Dec 2023 14:51:50 -0800 Subject: [PATCH 072/420] Refactor 'filehashstore' module to use reStructuredText for sphinx-autodoc --- src/hashstore/filehashstore.py | 480 +++++++++++++++------------------ 1 file changed, 217 insertions(+), 263 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 6e6c11bb..cec12da5 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -16,7 +16,7 @@ class FileHashStore(HashStore): - """FileHashStore is a content addressable file manager based on Derrick + """FileHashStore is a content-addressable file manager based on Derrick Gilland's 'hashfs' library. It supports the storage of objects on disk using a content identifier to address files. @@ -26,13 +26,12 @@ class FileHashStore(HashStore): store path directory. Properties must always be supplied to ensure consistent usage of FileHashStore once configured. - Args: - properties (dict): A python dictionary with the following keys (and values): - store_path (str): Path to the HashStore directory. - store_depth (int): Depth when sharding an object's hex digest. - store_width (int): Width of directories when sharding an object's hex digest. - store_algorithm (str): Hash algorithm used for calculating the object's hex digest. - store_metadata_namespace (str): Namespace for the HashStore's system metadata. + :param dict properties: A Python dictionary with the following keys (and values): + - store_path (str): Path to the HashStore directory. + - store_depth (int): Depth when sharding an object's hex digest. + - store_width (int): Width of directories when sharding an object's hex digest. + - store_algorithm (str): Hash algorithm used for calculating the object's hex digest. 
+ - store_metadata_namespace (str): Namespace for the HashStore's system metadata. """ # Property (hashstore configuration) requirements @@ -130,12 +129,12 @@ def __init__(self, properties=None): def load_properties(self): """Get and return the contents of the current HashStore configuration. - Returns: - hashstore_yaml_dict (dict): HashStore properties with the following keys (and values): - store_depth (int): Depth when sharding an object's hex digest. - store_width (int): Width of directories when sharding an object's hex digest. - store_algorithm (str): Hash algorithm used for calculating the object's hex digest. - store_metadata_namespace (str): Namespace for the HashStore's system metadata. + :return: HashStore properties with the following keys (and values): + - ``store_depth`` (int): Depth when sharding an object's hex digest. + - ``store_width`` (int): Width of directories when sharding an object's hex digest. + - ``store_algorithm`` (str): Hash algo used for calculating the object's hex digest. + - ``store_metadata_namespace`` (str): Namespace for the HashStore's system metadata. + :rtype: dict """ if not os.path.exists(self.hashstore_configuration_yaml): exception_string = ( @@ -162,12 +161,12 @@ def write_properties(self, properties): """Writes 'hashstore.yaml' to FileHashStore's root directory with the respective properties object supplied. - Args: - properties (dict): A python dictionary with the following keys (and values): - store_depth (int): Depth when sharding an object's hex digest. - store_width (int): Width of directories when sharding an object's hex digest. - store_algorithm (str): Hash algorithm used for calculating the object's hex digest. - store_metadata_namespace (str): Namespace for the HashStore's system metadata. + :param properties: A Python dictionary with the following keys (and values): + - ``store_depth`` (int): Depth when sharding an object's hex digest. 
+ - ``store_width`` (int): Width of directories when sharding an object's hex digest. + - ``store_algorithm`` (str): Hash algo used for calculating the object's hex digest. + - ``store_metadata_namespace`` (str): Namespace for the HashStore's system metadata. + :type properties: dict """ # If hashstore.yaml already exists, must throw exception and proceed with caution if os.path.exists(self.hashstore_configuration_yaml): @@ -201,7 +200,8 @@ def write_properties(self, properties): else: exception_string = ( f"FileHashStore - write_properties: algorithm supplied ({store_algorithm})" - f" cannot be used as default for HashStore. Must be one of: {', '.join(accepted_store_algorithms)}" + f" cannot be used as default for HashStore. Must be one of: " + + f"{', '.join(accepted_store_algorithms)}" f" which are DataONE controlled algorithm values" ) logging.error(exception_string) @@ -236,15 +236,13 @@ def _build_hashstore_yaml_string( ): """Build a YAML string representing the configuration for a HashStore. - Args: - store_depth (int): Depth when sharding an object's hex digest. - store_width (int): Width of directories when sharding an object's hex digest. - store_algorithm (str): Hash algorithm used for calculating the object's hex digest. - store_metadata_namespace (str): Namespace for the HashStore's system metadata. + :param int store_depth: Depth when sharding an object's hex digest. + :param int store_width: Width of directories when sharding an object's hex digest. + :param str store_algorithm: Hash algorithm used for calculating the object's hex digest. + :param str store_metadata_namespace: Namespace for the HashStore's system metadata. - Returns: - hashstore_configuration_yaml (str): A YAML string representing the configuration for - a HashStore. + :return: A YAML string representing the configuration for a HashStore. 
+ :rtype: str """ hashstore_configuration_yaml = f""" # Default configuration variables for HashStore @@ -293,9 +291,8 @@ def _verify_hashstore_properties(self, properties, prop_store_path): look to see if any directories/files exist in the given store path and throw an exception if any file or directory is found. - Args: - properties (dict): HashStore properties - prop_store_path (string): Store path to check + :param dict properties: HashStore properties. + :param str prop_store_path: Store path to check. """ if os.path.exists(self.hashstore_configuration_yaml): logging.debug( @@ -334,15 +331,13 @@ def _validate_properties(self, properties): """Validate a properties dictionary by checking if it contains all the required keys and non-None values. - Args: - properties (dict): Dictionary containing filehashstore properties. + :param dict properties: Dictionary containing filehashstore properties. - Raises: - KeyError: If key is missing from the required keys. - ValueError: If value is missing for a required key. + :raises KeyError: If key is missing from the required keys. + :raises ValueError: If value is missing for a required key. - Returns: - properties (dict): The given properties object (that has been validated). + :return: The given properties object (that has been validated). + :rtype: dict """ if not isinstance(properties, dict): exception_string = ( @@ -830,24 +825,21 @@ def store_and_validate_data( checksum_algorithm=None, file_size_to_validate=None, ): - """Store contents of `file` on disk using, validate the object's parameters if - provided and tag/reference the object. - - Args: - pid (string): Authority-based identifier. \n - file (mixed): Readable object or path to file. \n - extension (str, optional): Optional extension to append to file - when saving. \n - additional_algorithm (str, optional): Optional algorithm value to include - when returning hex digests. 
\n - checksum (str, optional): Optional checksum to validate object - against hex digest before moving to permanent location. \n - checksum_algorithm (str, optional): Algorithm value of given checksum. \n - file_size_to_validate (bytes, optional): Expected size of object - - Returns: - object_metadata (ObjectMetadata): object that contains the object id, - object file size and hex digest dictionary. + """Store contents of `file` on disk, validate the object's parameters if provided, + and tag/reference the object. + + :param str pid: Authority-based identifier. + :param mixed file: Readable object or path to file. + :param str extension: Optional extension to append to file when saving. + :param str additional_algorithm: Optional algorithm value to include when returning + hex digests. + :param str checksum: Optional checksum to validate object against hex digest before moving + to permanent location. + :param str checksum_algorithm: Algorithm value of the given checksum. + :param bytes file_size_to_validate: Expected size of the object. + + :return: ObjectMetadata - object that contains the object id, object file size, + and hex digest dictionary. """ stream = Stream(file) @@ -877,20 +869,17 @@ def store_and_validate_data( return object_metadata def store_data_only(self, data): - """Store an object to HashStore and return the id and a hex digest + """Store an object to HashStore and return the ID and a hex digest dictionary of the default algorithms. This method does not validate the - object and writes directly to /objects after the hex digests are calculated. + object and writes directly to `/objects` after the hex digests are calculated. - Args: - data (mixed): String or path to object. + :param mixed data: String or path to object. - Raises: - IOError: If object fails to store - FileExistsError: If file already exists + :raises IOError: If the object fails to store. + :raises FileExistsError: If the file already exists. 
- Returns: - object_metadata (ObjectMetadata): object that contains the object id, - object file size and hex digest dictionary. + :return: ObjectMetadata - object that contains the object ID, object file + size, and hex digest dictionary. """ logging.debug( "FileHashStore - store_object: Request to store data object only." @@ -941,24 +930,22 @@ def _move_and_get_checksums( extension appended. The copy process uses a temporary file to store the initial contents and returns a dictionary of algorithms and their hex digest values. If the file already exists, the method will immediately - raise an exception. If an algorithm and checksum is provided, it will proceed to + raise an exception. If an algorithm and checksum are provided, it will proceed to validate the object (and delete the tmpFile if the hex digest stored does not match what is provided). - Args: - pid (string): authority-based identifier. \n - stream (io.BufferedReader): object stream. \n - extension (str, optional): Optional extension to append to file - when saving. \n - additional_algorithm (str, optional): Optional algorithm value to include - when returning hex digests. \n - checksum (str, optional): Optional checksum to validate object - against hex digest before moving to permanent location. \n - checksum_algorithm (str, optional): Algorithm value of given checksum. \n - file_size_to_validate (bytes, optional): Expected size of object - - Returns: - object_metadata (tuple): object id, object file size and hex digest dictionary. + :param str pid: Authority-based identifier. + :param io.BufferedReader stream: Object stream. + :param str extension: Optional extension to append to the file + when saving. + :param str additional_algorithm: Optional algorithm value to include + when returning hex digests. + :param str checksum: Optional checksum to validate the object + against hex digest before moving to the permanent location. + :param str checksum_algorithm: Algorithm value of the given checksum. 
+ :param bytes file_size_to_validate: Expected size of the object. + + :return: tuple - Object ID, object file size, and hex digest dictionary. """ debug_msg = ( "FileHashStore - _move_and_get_checksums: Creating temp" @@ -1065,15 +1052,13 @@ def _write_to_tmp_file_and_get_hex_digests( algorithm is provided, it will add the respective hex digest to the dictionary if it is supported. - Args: - stream (io.BufferedReader): Object stream. - additional_algorithm (string): Algorithm of additional hex digest to generate - checksum_algorithm (string): Algorithm of additional checksum algo to generate + :param io.BufferedReader stream: Object stream. + :param str additional_algorithm: Algorithm of additional hex digest to generate. + :param str checksum_algorithm: Algorithm of additional checksum algo to generate. - Returns: - hex_digest_dict, tmp.name (tuple pack): - hex_digest_dict (dictionary): Algorithms and their hex digests. - tmp.name: Name of temporary file created and written into. + :return: tuple - hex_digest_dict, tmp.name + - hex_digest_dict (dict): Algorithms and their hex digests. + - tmp.name (str): Name of the temporary file created and written into. """ # Review additional hash object to digest and create new list algorithm_list_to_calculate = self._refine_algorithm_list( @@ -1156,11 +1141,9 @@ def _write_to_tmp_file_and_get_hex_digests( def _mktmpfile(self, path): """Create a temporary file at the given path ready to be written. - Args: - path (string): Path to the file location + :param str path: Path to the file location. - Returns: - tmp (file object): object with file-like interface + :return: file object - object with a file-like interface. """ tmp = NamedTemporaryFile(dir=path, delete=False) @@ -1181,13 +1164,12 @@ def delete_tmp_file(): return tmp def _write_cid_refs_file(self, path, pid): - """Write the cid reference file in the supplied path to a file. A reference file - contains every pid that references a cid each on its own line. 
This method will - only write into an empty file, and will not write over an existing one. + """Write the CID reference file in the supplied path to a file. A reference file + contains every PID that references a CID, each on its own line. This method will + only write into an empty file and will not overwrite an existing one. - Args: - path (string): Path of file to be written into - pid (string): Authority-based or persistent identifier of object + :param str path: Path of the file to be written into. + :param str pid: Authority-based or persistent identifier of the object. """ logging.debug( "FileHashStore - write_cid_refs_file: Writing pid (%s) into file: %s", @@ -1222,11 +1204,10 @@ def _write_cid_refs_file(self, path, pid): raise err def _update_cid_refs(self, cid_ref_abs_path, pid): - """Update an existing cid reference file with the given pid. + """Update an existing CID reference file with the given PID. - Args: - cid_ref_abs_path (string): Absolute path to the cid ref file - pid (string): Authority-based or persistent identifier of object + :param str cid_ref_abs_path: Absolute path to the CID reference file. + :param str pid: Authority-based or persistent identifier of the object. """ logging.debug( "FileHashStore - update_cid_refs: Adding pid (%s) into cid reference file: %s", @@ -1269,11 +1250,10 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): raise err def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): - """Delete a pid from a cid reference file. + """Delete a PID from a CID reference file. - Args: - cid_ref_abs_path (string): Absolute path to the cid ref file - pid (string): Authority-based or persistent identifier of object + :param str cid_ref_abs_path: Absolute path to the CID reference file. + :param str pid: Authority-based or persistent identifier of the object. 
""" logging.debug( "FileHashStore - _delete_cid_refs_pid: Deleting pid (%s) from cid reference file: %s", @@ -1311,13 +1291,12 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): raise err def _delete_cid_refs_file(self, cid_ref_abs_path): - """Delete a cid reference file. There must be no references remaining. + """Delete a CID reference file. There must be no references remaining. - Args: - cid_ref_abs_path (string): Absolute path to the cid ref file + :param str cid_ref_abs_path: Absolute path to the CID reference file. - Returns: - boolean: True if deleted, False if not + :return: True if deleted, False if not. + :rtype: bool """ logging.debug( "FileHashStore - _delete_cid_refs_file: Deleting reference file: %s", @@ -1352,12 +1331,11 @@ def _delete_cid_refs_file(self, cid_ref_abs_path): raise err def _write_pid_refs_file(self, path, cid): - """Write the pid reference file in the supplied path for the given cid (content - identifier). A reference file for a pid contains the cid that it references. + """Write the PID reference file in the supplied path for the given CID (content + identifier). A reference file for a PID contains the CID that it references. - Args: - path (string): Path of file to be written into - cid (string): Content identifier + :param str path: Path of the file to be written into. + :param str cid: Content identifier. """ logging.debug( "FileHashStore - _write_pid_refs_file: Writing cid (%s) into file: %s", @@ -1383,10 +1361,9 @@ def _write_pid_refs_file(self, path, cid): raise err def _delete_pid_refs_file(self, pid_ref_abs_path): - """Delete a pid reference file. + """Delete a PID reference file. - Args: - pid_ref_abs_path (string): Absolute path to the pid ref file + :param str pid_ref_abs_path: Absolute path to the PID reference file. 
""" logging.debug( "FileHashStore - _delete_pid_refs_file: Deleting reference file: %s", @@ -1414,15 +1391,14 @@ def _delete_pid_refs_file(self, pid_ref_abs_path): def put_metadata(self, metadata, pid, format_id): """Store contents of metadata to `[self.root]/metadata` using the hash of the - given pid and format_id as the permanent address. + given PID and format ID as the permanent address. - Args: - pid (string): Authority-based identifier. - format_id (string): Metadata format. - metadata (mixed): String or path to metadata document. + :param str pid: Authority-based identifier. + :param str format_id: Metadata format. + :param mixed metadata: String or path to metadata document. - Returns: - metadata_cid (string): Address of the metadata document. + :return: Address of the metadata document. + :rtype: str """ logging.debug( "FileHashStore - put_metadata: Request to put metadata for pid: %s", pid @@ -1473,11 +1449,10 @@ def put_metadata(self, metadata, pid, format_id): def _mktmpmetadata(self, stream): """Create a named temporary file with `stream` (metadata). - Args: - stream (io.BufferedReader): Metadata stream. + :param io.BufferedReader stream: Metadata stream. - Returns: - tmp.name (string): Path/name of temporary file created and written into. + :return: Path/name of temporary file created and written into. + :rtype: str """ # Create temporary file in .../{store_path}/tmp tmp_root_path = self.get_store_path("metadata") / "tmp" @@ -1505,14 +1480,14 @@ def _mktmpmetadata(self, stream): # FileHashStore Utility & Supporting Methods def _validate_arg_data(self, data): - """Checks a data argument to ensure that it is either a string, path or stream + """Checks a data argument to ensure that it is either a string, path, or stream object. - Args: - data (string, path, stream): object to validate + :param data: Object to validate (string, path, or stream). + :type data: str, os.PathLike, io.BufferedReader - Returns: - boolean: True if valid. 
+ :return: True if valid. + :rtype: bool """ if ( not isinstance(data, str) @@ -1537,17 +1512,19 @@ def _validate_arg_data(self, data): def _validate_arg_algorithms_and_checksum( self, additional_algorithm, checksum, checksum_algorithm ): - """Determines whether caller has supplied the necessary arguments to validate + """Determines whether the caller has supplied the necessary arguments to validate an object with a checksum value. - Args: - additional_algorithm: value of additional algorithm to calculate - checksum (string): value of checksum - checksum_algorithm (string): algorithm of checksum + :param additional_algorithm: Value of the additional algorithm to calculate. + :type additional_algorithm: str or None + :param checksum: Value of the checksum. + :type checksum: str or None + :param checksum_algorithm: Algorithm of the checksum. + :type checksum_algorithm: str or None - Returns: - additional_algorithm_checked (string): hashlib compatible string or 'None' - checksum_algorithm_checked (string): hashlib compatible string or 'None' + :return: Hashlib-compatible string or 'None' for additional_algorithm and + checksum_algorithm. + :rtype: str """ additional_algorithm_checked = None if additional_algorithm != self.algorithm and additional_algorithm is not None: @@ -1581,17 +1558,16 @@ def _validate_arg_object( tmp_file_size, file_size_to_validate, ): - """Evaluates an object's integrity and throws exception if there is a mismatch. - - Args: - pid (string): For logging purposes - checksum (string): Value of checksum to check - checksum_algorithm (string): Algorithm of checksum - entity (string): Type of object ('objects' or 'metadata') - hex_digests (dictionary): Dictionary of hex digests to parse - tmp_file_name (string): Name of tmp file - tmp_file_size (int): Size of the tmp file - file_size_to_validate (int): Expected size of the object + """Evaluates an object's integrity and raises an exception if there is a mismatch. 
+ + :param str pid: For logging purposes. + :param str checksum: Value of the checksum to check. + :param str checksum_algorithm: Algorithm of the checksum. + :param str entity: Type of object ('objects' or 'metadata'). + :param dict hex_digests: Dictionary of hex digests to parse. + :param str tmp_file_name: Name of the temporary file. + :param int tmp_file_size: Size of the temporary file. + :param int file_size_to_validate: Expected size of the object. """ if file_size_to_validate is not None and file_size_to_validate > 0: if file_size_to_validate != tmp_file_size: @@ -1641,14 +1617,13 @@ def _validate_arg_object( def _validate_arg_format_id(self, format_id, method): """Determines the metadata namespace (format_id) to use for storing, - retrieving and deleting metadata. + retrieving, and deleting metadata. - Args: - format_id (string): Metadata namespace to review - method (string): Calling method for logging purposes + :param str format_id: Metadata namespace to review. + :param str method: Calling method for logging purposes. - Returns: - checked_format_id (string): Valid metadata namespace + :return: Valid metadata namespace. + :rtype: str """ checked_format_id = None if format_id is not None and format_id.replace(" ", "") == "": @@ -1666,9 +1641,8 @@ def _verify_hashstore_references(self, pid, cid): """Verifies that the supplied pid and pid reference file and content have been written successfully. - Args: - pid (string): Authority-based or persistent identifier - cid (string): Content identifier + :param str pid: Authority-based or persistent identifier. + :param str cid: Content identifier. """ # Check that reference files were created pid_ref_abs_path = self.get_refs_abs_path("pid", pid) @@ -1715,12 +1689,11 @@ def _verify_hashstore_references(self, pid, cid): def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): """Create the final list of hash algorithms to calculate. 
- Args: - additional_algorithm (string) - checksum_algorithm (string) + :param str additional_algorithm: Additional algorithm. + :param str checksum_algorithm: Checksum algorithm. - Return: - algorithm_list_to_calculate (set): De-duplicated list of hash algorithms + :return: De-duplicated list of hash algorithms. + :rtype: set """ algorithm_list_to_calculate = self.default_algo_list if checksum_algorithm is not None: @@ -1748,13 +1721,12 @@ def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): def clean_algorithm(self, algorithm_string): """Format a string and ensure that it is supported and compatible with - the python hashlib library. + the Python `hashlib` library. - Args: - algorithm_string (string): Algorithm to validate. + :param str algorithm_string: Algorithm to validate. - Returns: - cleaned_string (string): `hashlib` supported algorithm string. + :return: `hashlib` supported algorithm string. + :rtype: str """ count = 0 for char in algorithm_string: @@ -1778,16 +1750,15 @@ def clean_algorithm(self, algorithm_string): return cleaned_string def computehash(self, stream, algorithm=None): - """Compute the hash of a file-like object (or string) using the store algorthm by - default or with optional algorithm supported. + """Compute the hash of a file-like object (or string) using the store algorithm by + default or with an optional supported algorithm. - Args: - stream (mixed): A buffered stream (io.BufferedReader) of an object. A string is - also acceptable as they are a sequence of characters (Python only).\n - algorithm (string): Algorithm of hex digest to generate. + :param mixed stream: A buffered stream (`io.BufferedReader`) of an object. A string is + also acceptable as they are a sequence of characters (Python only). + :param str algorithm: Algorithm of hex digest to generate. - Returns: - hex_digest (string): Hex digest. + :return: Hex digest. 
+ :rtype: str """ if algorithm is None: hashobj = hashlib.new(self.algorithm) @@ -1802,8 +1773,7 @@ def computehash(self, stream, algorithm=None): def get_store_path(self, entity): """Return a path object of the root directory of the store. - Args: - entity (str): Desired entity type: "objects" or "metadata" + :param str entity: Desired entity type: "objects" or "metadata" """ if entity == "objects": return Path(self.objects) @@ -1819,29 +1789,26 @@ def get_store_path(self, entity): def exists(self, entity, file): """Check whether a given file id or path exists on disk. - Args: - entity (str): Desired entity type (ex. "objects", "metadata"). \n - file (str): The name of the file to check. - - Returns: - file_exists (bool): True if the file exists. + :param str entity: Desired entity type (e.g., "objects", "metadata"). + :param str file: The name of the file to check. + :return: True if the file exists. + :rtype: bool """ file_exists = bool(self.get_real_path(entity, file)) return file_exists def shard(self, digest): """Generates a list given a digest of `self.depth` number of tokens with width - `self.width` from the first part of the digest plus the remainder. + `self.width` from the first part of the digest plus the remainder. Example: ['0d', '55', '5e', 'd77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e'] - Args: - digest (str): The string to be divided into tokens. + :param str digest: The string to be divided into tokens. - Returns: - hierarchical_list (list): A list containing the tokens of fixed width. + :return: A list containing the tokens of fixed width. + :rtype: list """ def compact(items): @@ -1851,8 +1818,8 @@ def compact(items): # This creates a list of `depth` number of tokens with width # `width` from the first part of the id plus the remainder. 
hierarchical_list = compact( - [digest[i * self.width: self.width * (i + 1)] for i in range(self.depth)] - + [digest[self.depth * self.width:]] + [digest[i * self.width : self.width * (i + 1)] for i in range(self.depth)] + + [digest[self.depth * self.width :]] ) return hierarchical_list @@ -1861,13 +1828,12 @@ def open(self, entity, file, mode="rb"): """Return open buffer object from given id or path. Caller is responsible for closing the stream. - Args: - entity (str): Desired entity type (ex. "objects", "metadata"). \n - file (str): Address ID or path of file. \n - mode (str, optional): Mode to open file in. Defaults to 'rb'. + :param str entity: Desired entity type (ex. "objects", "metadata"). + :param str file: Address ID or path of file. + :param str mode: Mode to open file in. Defaults to 'rb'. - Returns: - buffer (io.BufferedReader): An `io` stream dependent on the `mode`. + :return: An `io` stream dependent on the `mode`. + :rtype: io.BufferedReader """ realpath = self.get_real_path(entity, file) if realpath is None: @@ -1882,9 +1848,8 @@ def delete(self, entity, file): """Delete file using id or path. Remove any empty directories after deleting. No exception is raised if file doesn't exist. - Args: - entity (str): Desired entity type (ex. "objects", "metadata"). \n - file (str): Address ID or path of file. + :param str entity: Desired entity type (ex. "objects", "metadata"). + :param str file: Address ID or path of file. """ realpath = self.get_real_path(entity, file) if realpath is None: @@ -1902,8 +1867,7 @@ def _remove_empty(self, subpath): proceeding "up" through directory tree until reaching the `root` folder. - Args: - subpath (str, path): Name of directory. + :param str subpath: Name of directory. """ # Don't attempt to remove any folders if subpath is not a # subdirectory of the root directory. @@ -1919,11 +1883,10 @@ def _remove_empty(self, subpath): def _has_subdir(self, path): """Return whether `path` is a subdirectory of the `root` directory. 
- Args: - path (str, path): Name of path. + :param str path: Name of path. - Returns: - is_subdir (boolean): `True` if subdirectory. + :return: `True` if subdirectory. + :rtype: bool """ # Append os.sep so that paths like /usr/var2/log doesn't match /usr/var. root_path = os.path.realpath(self.root) + os.sep @@ -1934,11 +1897,8 @@ def _has_subdir(self, path): def create_path(self, path): """Physically create the folder path (and all intermediate ones) on disk. - Args: - path (str): The path to create. - - Raises: - AssertionError (exception): If the path already exists but is not a directory. + :param str path: The path to create. + :raises AssertionError: If the path already exists but is not a directory. """ try: os.makedirs(path, self.dmode) @@ -1946,17 +1906,16 @@ def create_path(self, path): assert os.path.isdir(path), f"expected {path} to be a directory" def get_real_path(self, entity, file): - """Attempt to determine the real path of a file id or path through + """Attempt to determine the real path of a file ID or path through successive checking of candidate paths. If the real path is stored with an extension, the path is considered a match if the basename matches - the expected file path of the id. + the expected file path of the ID. - Args: - entity (str): desired entity type (ex. "objects", "metadata"). \n - file (string): Name of file. + :param str entity: Desired entity type (ex. "objects", "metadata"). + :param str file: Name of the file. - Returns: - exists (boolean): Whether file is found or not. + :return: Whether the file is found or not. + :rtype: bool """ # Check for absolute path. if os.path.isfile(file): @@ -1985,15 +1944,14 @@ def get_real_path(self, entity, file): return None def build_abs_path(self, entity, hash_id, extension=""): - """Build the absolute file path for a given hash id with an optional file extension. + """Build the absolute file path for a given hash ID with an optional file extension. 
- Args: - entity (str): Desired entity type (ex. "objects", "metadata"). \n - hash_id (str): A hash id to build a file path for. \n - extension (str): An optional file extension to append to the file path. + :param str entity: Desired entity type (ex. "objects", "metadata"). + :param str hash_id: A hash ID to build a file path for. + :param str extension: An optional file extension to append to the file path. - Returns: - absolute_path (str): An absolute file path for the specified hash id. + :return: An absolute file path for the specified hash ID. + :rtype: str """ paths = self.shard(hash_id) root_dir = self.get_store_path(entity) @@ -2007,18 +1965,18 @@ def build_abs_path(self, entity, hash_id, extension=""): return absolute_path def get_refs_abs_path(self, ref_type, hash_id): - """Get the absolute path to the reference file for the given ref_type. If a - 'pid' is provided, this method will calculate the pid's hash based on the store - algorithm, and return the expected address of the pid reference file. If a - 'cid' is provided, this method will return the expected address by sharding the - cid based on HashStore's configuration. - - Args: - ref_type (string): 'pid' or 'cid' - hash_id (string): Authority-based, persistent or hash identifier - - Returns: - ref_file_abs_path (string): Path to the ref file for the given type and pid + """Get the absolute path to the reference file for the given ref_type. + + If a 'pid' is provided, this method will calculate the pid's hash based on the store + algorithm and return the expected address of the pid reference file. If a 'cid' is + provided, this method will return the expected address by sharding the cid based on + HashStore's configuration. + + :param str ref_type: 'pid' or 'cid' + :param str hash_id: Authority-based, persistent, or hash identifier + + :return: Path to the reference file for the given type and ID. 
+ :rtype: str """ entity = "refs" if ref_type == "pid": @@ -2029,13 +1987,12 @@ def get_refs_abs_path(self, ref_type, hash_id): return ref_file_abs_path def count(self, entity): - """Return count of the number of files in the `root` directory. + """Return the count of the number of files in the `root` directory. - Args: - entity (str): Desired entity type (ex. "objects", "metadata"). + :param str entity: Desired entity type (ex. "objects", "metadata"). - Returns: - count (int): Number of files in the directory. + :return: Number of files in the directory. + :rtype: int """ count = 0 directory_to_count = "" @@ -2057,10 +2014,10 @@ def count(self, entity): @staticmethod def _is_int_and_non_negative(file_size): - """Checks whether a given argument is an integer and > 0 and throws exception if not. + """Check whether a given argument is an integer and greater than 0; + throw an exception if not. - Args: - file_size (int): file size to check + :param int file_size: File size to check. """ if file_size is not None: if not isinstance(file_size, int): @@ -2079,12 +2036,11 @@ def _is_int_and_non_negative(file_size): @staticmethod def _validate_string(string, arg, method): - """Checks whether a string is None or empty and throws an exception if so. + """Check whether a string is None or empty; throw an exception if so. - Args: - string (string): Value to check - arg (string): Name of argument to check - method (string): Calling method for logging purposes + :param str string: Value to check. + :param str arg: Name of the argument to check. + :param str method: Calling method for logging purposes. """ if string is None or string.strip() == "": exception_string = ( @@ -2096,13 +2052,11 @@ def _validate_string(string, arg, method): @staticmethod def _to_bytes(text): - """Convert text to sequence of bytes using utf-8 encoding. - - Args: - text (str): String to convert. + """Convert text to a sequence of bytes using utf-8 encoding. 
-        Returns:
-            text (bytes): Bytes with utf-8 encoding.
+        :param str text: String to convert.
+        :return: Bytes with utf-8 encoding.
+        :rtype: bytes
         """
         if not isinstance(text, bytes):
             text = bytes(text, "utf8")

From f5bf47ef3077d72f7425a8c614dd44b681a303e4 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Fri, 29 Dec 2023 15:04:39 -0800
Subject: [PATCH 073/420] Refactor 'client' module to use reStructuredText for
 sphinx-autodoc

---
 src/hashstore/client.py | 112 ++++++++++++++++++----------------------
 1 file changed, 51 insertions(+), 61 deletions(-)

diff --git a/src/hashstore/client.py b/src/hashstore/client.py
index dac73fcf..e2f4e5ef 100644
--- a/src/hashstore/client.py
+++ b/src/hashstore/client.py
@@ -189,12 +189,12 @@ def __init__(self):
     def load_store_properties(self, hashstore_yaml):
         """Get and return the contents of the current HashStore config file.
 
-        Returns:
-            hashstore_yaml_dict (dict): HashStore properties with the following keys (and values):
-            store_depth (int): Depth when sharding an object's hex digest.
-            store_width (int): Width of directories when sharding an object's hex digest.
-            store_algorithm (str): Hash algorithm used for calculating the object's hex digest.
-            store_metadata_namespace (str): Namespace for the HashStore's system metadata.
+        :return: HashStore properties with the following keys (and values):
+            - store_depth (int): Depth when sharding an object's hex digest.
+            - store_width (int): Width of directories when sharding an object's hex digest.
+            - store_algorithm (str): Hash algorithm used for calculating the object's hex digest.
+            - store_metadata_namespace (str): Namespace for the HashStore's system metadata.
+        :rtype: dict
         """
         property_required_keys = [
             "store_depth",
@@ -234,11 +234,11 @@ class HashStoreClient:
     MET_TYPE = "metadata"
 
     def __init__(self, properties, testflag=None):
-        """Initialize HashStore and MetacatDB
+        """Initialize HashStore and MetacatDB.
 
-        Args:
-            properties: See FileHashStore for dictionary example
-            testflag (str): "knbvm" to initialize MetacatDB
+        :param dict properties: HashStore properties; see FileHashStore for an
+            example of the required dictionary keys and values.
+        :param str testflag: "knbvm" to initialize MetacatDB.
         """
         factory = HashStoreFactory()
@@ -258,12 +258,11 @@ def __init__(self, properties, testflag=None):
 
     # Methods relating to testing HashStore with knbvm (test.arcticdata.io)
     def store_to_hashstore_from_list(self, origin_dir, obj_type, num):
-        """Store objects in a given directory into HashStore
+        """Store objects in a given directory into HashStore.
 
-        Args:
-            origin_dir (str): Directory to convert
-            obj_type (str): 'object' or 'metadata'
-            num (int): Number of files to store
+        :param str origin_dir: Directory to convert.
+        :param str obj_type: Type of objects ('object' or 'metadata').
+        :param int num: Number of files to store.
         """
         info_msg = f"HashStore Client - Begin storing {obj_type} objects."
         logging.info(info_msg)
@@ -310,8 +309,7 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num):
     def try_store_object(self, obj_tuple):
         """Store an object to HashStore and log exceptions as warning.
 
-        Args:
-            obj_tuple: See HashStore store_object signature for details.
+        :param obj_tuple: See HashStore store_object signature for details.
         """
         try:
             self.hashstore.store_object(*obj_tuple)
@@ -321,10 +319,10 @@ def try_store_object(self, obj_tuple):
             print(so_exception)
 
     def try_store_metadata(self, obj_tuple):
-        """Store an object to HashStore and log exceptions as warning.
+        """Store a metadata document to HashStore and log exceptions as warning.
 
         Args:
-            obj_tuple: See HashStore store_object signature for details.
+            obj_tuple: See HashStore store_metadata signature for details.
""" try: self.hashstore.store_metadata(*obj_tuple) @@ -336,10 +334,9 @@ def try_store_metadata(self, obj_tuple): def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): """Retrieve objects or metadata from a Hashstore and validate the content. - Args: - origin_dir (str): Directory to convert - obj_type (str): 'object' or 'metadata' - num (int): Number of files to store + :param str origin_dir: Directory to convert. + :param str obj_type: Type of objects ('object' or 'metadata'). + :param int num: Number of files to store. """ info_msg = ( f"HashStore Client - Begin retrieving and validating {obj_type} objects." @@ -384,8 +381,7 @@ def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): def validate_object(self, obj_tuple): """Retrieves an object from HashStore and validates its checksum. - Args: - obj_tuple: pid_guid, obj_checksum_algo, obj_checksum + :param obj_tuple: Tuple containing pid_guid, obj_checksum_algo, obj_checksum. """ pid_guid = obj_tuple[0] algo = obj_tuple[1] @@ -407,10 +403,9 @@ def validate_object(self, obj_tuple): return def validate_metadata(self, obj_tuple): - """Retrieves a metadata from HashStore and validates its checksum + """Retrieves a metadata from HashStore and validates its checksum. - Args: - obj_tuple: pid_guid, format_id, obj_checksum, obj_algorithm + :param obj_tuple: Tuple containing pid_guid, format_id, obj_checksum, obj_algorithm. """ pid_guid = obj_tuple[0] namespace = obj_tuple[1] @@ -433,11 +428,11 @@ def validate_metadata(self, obj_tuple): return def delete_objects_from_list(self, origin_dir, obj_type, num): - """Store objects in a given directory into HashStore - Args: - origin_dir (str): Directory to convert - obj_type (str): 'object' or 'metadata' - num (int): Number of files to store + """Deletes objects in a given directory into HashStore. + + :param str origin_dir: Directory to convert. + :param str obj_type: Type of objects ('object' or 'metadata'). 
+ :param int num: Number of files to store. """ info_msg = f"HashStore Client - Begin deleting {obj_type} objects." logging.info(info_msg) @@ -482,10 +477,9 @@ def delete_objects_from_list(self, origin_dir, obj_type, num): logging.info(content) def try_delete_object(self, obj_pid): - """Delete an object to HashStore and log exceptions as warning. + """Delete an object from HashStore and log exceptions as a warning. - Args: - obj_pid (str): Pid of object to delete + :param str obj_pid: PID of the object to delete. """ try: self.hashstore.delete_object(obj_pid) @@ -495,10 +489,9 @@ def try_delete_object(self, obj_pid): print(do_exception) def try_delete_metadata(self, obj_tuple): - """Delete an object to HashStore and log exceptions as warning. + """Delete an object from HashStore and log exceptions as a warning. - Args: - obj_tuple: pid_guid, format_id (namespace) + :param obj_tuple: Tuple containing the PID and format ID (namespace). """ pid_guid = obj_tuple[0] namespace = obj_tuple[1] @@ -544,11 +537,10 @@ def __init__(self, hashstore_path, hashstore): self.db_yaml_dict[key] = checked_property def get_object_metadata_list(self, origin_directory, num): - """Query the metacat db for the full obj and metadata list and order by guid. + """Query the Metacat database for the full object and metadata list, ordered by GUID. - Args: - origin_directory (string): 'var/metacat/data' or 'var/metacat/documents' - num (int): Number of rows to retrieve from metacat db + :param str origin_directory: 'var/metacat/data' or 'var/metacat/documents'. + :param int num: Number of rows to retrieve from the Metacat database. """ # Create a connection to the database db_user = self.db_yaml_dict["db_user"] @@ -610,15 +602,14 @@ def get_object_metadata_list(self, origin_directory, num): def refine_list_for_objects(self, metacat_obj_list, action): """Refine a list of objects by checking for file existence and removing duplicates. 
- Args: - metacat_obj_list (List): List of tuple objects representing rows from metacat db - action (string): "store", "retrieve" or "delete". - "store" will create a list of objects to store that do not exist in HashStore. - "retrieve" will create a list of objects that exist in HashStore. - "delete" will create a list of object pids - - Returns: - refined_object_list (List): List of tuple objects based on "action" + :param List metacat_obj_list: List of tuple objects representing rows from Metacat database. + :param str action: Action to perform. Options: "store", "retrieve", or "delete". + - "store": Create a list of objects to store that do not exist in HashStore. + - "retrieve": Create a list of objects that exist in HashStore. + - "delete": Create a list of object PIDs to delete. + + :return: Refined list of tuple objects based on the specified action. + :rtype: List """ refined_object_list = [] for tuple_item in metacat_obj_list: @@ -663,15 +654,14 @@ def refine_list_for_objects(self, metacat_obj_list, action): def refine_list_for_metadata(self, metacat_obj_list, action): """Refine a list of metadata by checking for file existence and removing duplicates. - Args: - metacat_obj_list (List): List of tuple objects representing rows from metacat db - action (string): "store", "retrieve" or "delete". - "store" will create a list of metadata to store that do not exist in HashStore. - "retrieve" will create a list of metadata that exist in HashStore. - "delete" will create a list of metadata pids with their format_ids - - Returns: - refined_object_list (List): List of tuple metadata based on "action" + :param List metacat_obj_list: List of tuple objects representing rows from metacat db. + :param str action: Action to perform - "store", "retrieve", or "delete". + - "store": Create a list of metadata to store that do not exist in HashStore. + - "retrieve": Create a list of metadata that exist in HashStore. 
+ - "delete": Create a list of metadata pids with their format_ids. + + :return: List of tuple metadata based on the specified action. + :rtype: List """ refined_metadata_list = [] for tuple_item in metacat_obj_list: From 97ea4e805a8d11468793068dc318dfe91e3a3b02 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 29 Dec 2023 15:19:12 -0800 Subject: [PATCH 074/420] Refactor 'hashstore' module to use reStructuredText for sphinx-autodoc --- src/hashstore/hashstore.py | 293 +++++++++++++++++++------------------ 1 file changed, 147 insertions(+), 146 deletions(-) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index d1ff440c..37e228d8 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -25,63 +25,59 @@ def store_object( checksum_algorithm, expected_object_size, ): - """The `store_object` method is responsible for the atomic storage of objects to - disk using a given stream. Upon successful storage, the method returns a ObjectMetadata - object containing relevant file information, such as the file's id (which can be - used to locate the object on disk), the file's size, and a hex digest dict of algorithms - and checksums. Storing an object with `store_object` also tags an object (creating - references) which allow the object to be discoverable. - - `store_object` also ensures that an object is stored only once by synchronizing multiple - calls and rejecting calls to store duplicate objects. Note, calling `store_object` without - a pid is a possibility, but should only store the object without tagging the object. - It is then the caller's responsibility to finalize the process by calling `tag_object` - after veriftying the correct object is stored. - - The file's id is determined by calculating the object's content identifier based on - the store's default algorithm, which is also used as the permanent address of the file. 
- The file's identifier is then sharded using the store's configured depth and width, - delimited by '/' and concatenated to produce the final permanent address - and is stored in the `/store_directory/objects/` directory. - - By default, the hex digest map includes the following hash algorithms: - md5, sha1, sha256, sha384, sha512 - which are the most commonly used algorithms in - dataset submissions to DataONE and the Arctic Data Center. If an additional algorithm - is provided, the `store_object` method checks if it is supported and adds it to the - hex digests dict along with its corresponding hex digest. An algorithm is considered - "supported" if it is recognized as a valid hash algorithm in the `hashlib` library. - - Similarly, if a file size and/or checksum & checksum_algorithm value are provided, - `store_object` validates the object to ensure it matches the given arguments - before moving the file to its permanent address. - - Args: - pid (string): Authority-based identifier. - data (mixed): String or path to object. - additional_algorithm (string): Additional hex digest to include. - checksum (string): Checksum to validate against. - checksum_algorithm (string): Algorithm of supplied checksum. - expected_object_size (int): Size of object to verify - - Returns: - object_metadata (ObjectMetadata): Object that contains the permanent address, - file size and hex digest dictionary. + """Atomic storage of objects to disk using a given stream. + + The `store_object` method ensures atomic storage of objects to disk. Upon successful + storage, it returns an ObjectMetadata object containing relevant file information, + such as the file's id (used to locate the object on disk), the file's size, and a hex digest + dictionary of algorithms and checksums. The method also tags the object, creating references + for discoverability. 
+ + `store_object` ensures that an object is stored only once by synchronizing multiple calls + and rejecting attempts to store duplicate objects. If called without a pid, it stores the + object without tagging, and it becomes the caller's responsibility to finalize the process + by calling `tag_object` after verifying the correct object is stored. + + The file's id is determined by calculating the object's content identifier based on the + store's default algorithm, which is also the permanent address of the file. The file's + identifier is then sharded using the store's configured depth and width, delimited by '/', + and concatenated to produce the final permanent address. This address is stored in the + `/store_directory/objects/` directory. + + By default, the hex digest map includes common hash algorithms (md5, sha1, sha256, sha384, + sha512). If an additional algorithm is provided, the method checks if it is supported and + adds it to the hex digests dictionary along with its corresponding hex digest. An algorithm + is considered "supported" if it is recognized as a valid hash algorithm in the `hashlib` + library. + + If file size and/or checksum & checksum_algorithm values are provided, `store_object` + validates the object to ensure it matches the given arguments before moving the file to + its permanent address. + + :param str pid: Authority-based identifier. + :param mixed data: String or path to the object. + :param str additional_algorithm: Additional hex digest to include. + :param str checksum: Checksum to validate against. + :param str checksum_algorithm: Algorithm of the supplied checksum. + :param int expected_object_size: Size of the object to verify. + + :return: ObjectMetadata - Object containing the permanent address, file size, and + hex digest dictionary. """ raise NotImplementedError() @abstractmethod def tag_object(self, pid, cid): - """The `tag_object` method creates references that allow objects stored in HashStore - to be discoverable. 
Retrieving, deleting or calculating a hex digest of an object is - based on a pid argument; and to proceed, we must be able to find the object associated - with the pid. + """Create references to make objects discoverable in HashStore. - Args: - pid (string): Authority-based or persistent identifier of object - cid (string): Content identifier of object + The `tag_object` method enables operations such as retrieving, deleting, or calculating + a hex digest based on the provided pid argument. To perform these actions, it's crucial + to locate the object associated with the given pid. - Returns: - boolean: `True` upon successful tagging. + :param str pid: Authority-based or persistent identifier of the object. + :param str cid: Content identifier of the object. + + :return: bool - `True` upon successful tagging. """ raise NotImplementedError() @@ -89,156 +85,158 @@ def tag_object(self, pid, cid): def verify_object( self, object_metadata, checksum, checksum_algorithm, expected_file_size ): - """Confirms that an object_metadata's content is equal to the given values. + """Confirm equality of content in an ObjectMetadata. + + The `verify_object` method verifies that the content in the provided `object_metadata` + matches the specified values. + + :param ObjectMetadata object_metadata: ObjectMetadata object. + :param str checksum: Value of the checksum. + :param str checksum_algorithm: Algorithm of the checksum. + :param int expected_file_size: Size of the temporary file. - Args: - object_metadata (ObjectMetadata): object_metadata object - checksum (string): Value of checksum - checksum_algorithm (string): Algorithm of checksum - expected_file_size (int): Size of the tmp file + :return: None """ raise NotImplementedError() @abstractmethod def find_object(self, pid): - """The `find_object` method checks whether an object referenced by a pid exists - and returns the content identifier. + """Check if an object referenced by a pid exists and retrieve its content identifier. 
- Args: - pid (string): Authority-based or persistent identifier of object + The `find_object` method validates the existence of an object based on the provided + pid and returns the associated content identifier. - Returns: - cid (string): Content identifier of the object + :param str pid: Authority-based or persistent identifier of the object. + + :return: str - Content identifier of the object. """ raise NotImplementedError() @abstractmethod def store_metadata(self, pid, metadata, format_id): - """The `store_metadata` method is responsible for adding and/or updating metadata - (ex. `sysmeta`) to disk using a given path/stream, a persistent identifier `pid` - and a metadata `format_id`. The metadata object's permanent address, which is - determined by calculating the SHA-256 hex digest of the provided `pid` + `format_id`. - - Upon successful storage of metadata, `store_metadata` returns a string that - represents the file's permanent address. Lastly, the metadata objects are stored - in parallel to objects in the `/store_directory/metadata/` directory. - - Args: - pid (string): Authority-based identifier. - format_id (string): Metadata format - metadata (mixed): String or path to metadata document. - - Returns: - metadata_cid (string): Address of the metadata document. + """Add or update metadata, such as `sysmeta`, to disk using the given path/stream. + + The `store_metadata` method uses a persistent identifier `pid` and a metadata `format_id` + to determine the permanent address of the metadata object. The permanent address is + calculated by obtaining the SHA-256 hex digest of the concatenation of `pid` & `format_id`. + + Upon successful storage of metadata, the method returns a string representing the file's + permanent address. Metadata objects are stored in parallel to objects in the + `/store_directory/metadata/` directory. + + :param str pid: Authority-based identifier. + :param mixed metadata: String or path to the metadata document. 
+ :param str format_id: Metadata format. + + :return: str - Address of the metadata document. """ raise NotImplementedError() @abstractmethod def retrieve_object(self, pid): - """The `retrieve_object` method retrieves an object from disk using a given - persistent identifier (pid). If the object exists, the method will open and return - a buffered object stream ready to read from. + """Retrieve an object from disk using a persistent identifier (pid). + + The `retrieve_object` method opens and returns a buffered object stream ready for reading + if the object associated with the provided `pid` exists on disk. - Args: - pid (string): Authority-based identifier. + :param str pid: Authority-based identifier. - Returns: - obj_stream (io.BufferedReader): A buffered stream of a data object. + :return: io.BufferedReader - Buffered stream of the data object. """ raise NotImplementedError() @abstractmethod def retrieve_metadata(self, pid, format_id): - """The 'retrieve_metadata' method retrieves the metadata object from disk using - a given persistent identifier (pid) and metadata namespace (format_id). - If the object exists (determined by calculating the metadata object's permanent - address using the SHA-256 hash of the given pid+format_id), the method will open - and return a buffered metadata stream ready to read from. - - Args: - pid (string): Authority-based identifier - format_id (string): Metadata format - - Returns: - metadata_stream (io.BufferedReader): A buffered stream of a metadata object. + """Retrieve the metadata object from disk using a persistent identifier (pid) + and metadata namespace (format_id). + + The `retrieve_metadata` method calculates the metadata object's permanent address + by hashing the concatenation of the given `pid` and `format_id`. If the object + exists, the method opens and returns a buffered metadata stream ready for reading. + + :param str pid: Authority-based identifier. + :param str format_id: Metadata format. 
+ + :return: io.BufferedReader - Buffered stream of the metadata object. """ raise NotImplementedError() @abstractmethod def delete_object(self, pid): - """The 'delete_object' method deletes an object permanently from disk using a - given persistent identifier. + """Delete an object permanently from disk using a persistent identifier (pid). + + The `delete_object` method removes the object associated with the provided `pid` from + disk, resulting in the permanent deletion of the object. - Args: - pid (string): Authority-based identifier. + :param str pid: Authority-based identifier. - Returns: - boolean: `True` upon successful deletion. + :return: bool - `True` upon successful deletion. """ raise NotImplementedError() @abstractmethod def delete_metadata(self, pid, format_id): - """The 'delete_metadata' method deletes a metadata document permanently - from disk using a given persistent identifier and format_id. + """Delete a metadata document permanently from disk using a persistent identifier (pid) + and metadata namespace (format_id). - Args: - pid (string): Authority-based identifier - format_id (string): Metadata format + The `delete_metadata` method removes the metadata document associated with the provided + `pid` and `format_id` from disk, resulting in its permanent deletion. - Returns: - boolean: `True` upon successful deletion. + :param str pid: Authority-based identifier. + :param str format_id: Metadata format. + + :return: bool - `True` upon successful deletion. """ raise NotImplementedError() @abstractmethod def get_hex_digest(self, pid, algorithm): - """The 'get_hex_digest' method calculates the hex digest of an object that exists + """Calculate the hex digest of an object in HashStore. + + The `get_hex_digest` method calculates the hex digest of an object that exists in HashStore using a given persistent identifier and hash algorithm. - Args: - pid (string): Authority-based identifier. - algorithm (string): Algorithm of hex digest to generate. 
+ :param str pid: Authority-based identifier. + :param str algorithm: Algorithm of hex digest to generate. - Returns: - hex_digest (string): Hex digest of the object. + :return: str - Hex digest of the object. """ raise NotImplementedError() class HashStoreFactory: - """A factory class for creating `HashStore`-like objects (classes - that implement the 'HashStore' abstract methods) + """A factory class for creating `HashStore`-like objects. - This factory class provides a method to retrieve a `HashStore` object - based on a given module (ex. "hashstore.filehashstore.filehashstore") - and class name (ex. "FileHashStore"). + The `HashStoreFactory` class serves as a factory for creating `HashStore`-like objects, + which are classes that implement the 'HashStore' abstract methods. + + This factory class provides a method to retrieve a `HashStore` object based on a given module + (e.g., "hashstore.filehashstore.filehashstore") and class name (e.g., "FileHashStore"). """ @staticmethod def get_hashstore(module_name, class_name, properties=None): """Get a `HashStore`-like object based on the specified `module_name` and `class_name`. - Args: - module_name (str): Name of package (ex. "hashstore.filehashstore") \n - class_name (str): Name of class in the given module (ex. "FileHashStore") \n - properties (dict, optional): Desired HashStore properties, if 'None', default values - will be used. 
\n - Example Properties Dictionary: - { - "store_path": "var/metacat",\n - "store_depth": 3,\n - "store_width": 2,\n - "store_algorithm": "sha256",\n - "store_sysmeta_namespace": "http://ns.dataone.org/service/types/v2.0"\n - } - - Returns: - HashStore: A hash store object based on the given `module_name` and `class_name` - - Raises: - ModuleNotFoundError: If module is not found - AttributeError: If class does not exist within the module + The `get_hashstore` method retrieves a `HashStore`-like object based on the provided + `module_name` and `class_name`, with optional custom properties. + + :param str module_name: Name of the package (e.g., "hashstore.filehashstore"). + :param str class_name: Name of the class in the given module (e.g., "FileHashStore"). + :param dict properties: Desired HashStore properties (optional). If `None`, default values + will be used. Example Properties Dictionary: + { + "store_path": "var/metacat", + "store_depth": 3, + "store_width": 2, + "store_algorithm": "sha256", + "store_sysmeta_namespace": "http://ns.dataone.org/service/types/v2.0" + } + + :return: HashStore - A hash store object based on the given `module_name` and `class_name`. + + :raises ModuleNotFoundError: If the module is not found. + :raises AttributeError: If the class does not exist within the module. """ # Validate module if importlib.util.find_spec(module_name) is None: @@ -259,11 +257,14 @@ def get_hashstore(module_name, class_name, properties=None): class ObjectMetadata(namedtuple("ObjectMetadata", ["id", "obj_size", "hex_digests"])): """Represents metadata associated with an object. - Attributes: - id (str): A unique identifier for the object (Hash ID, hex digest). - obj_size (bytes): The size of the object in bytes. 
- hex_digests (list, optional): A list of hex digests to validate objects - (md5, sha1, sha256, sha384, sha512) + The `ObjectMetadata` class represents metadata associated with an object, + including a unique identifier (`id`), the size of the object in bytes (`obj_size`), + and an optional list of hex digests (`hex_digests`) to validate objects. + + :param str id: A unique identifier for the object (Hash ID, hex digest). + :param bytes obj_size: The size of the object in bytes. + :param list hex_digests: A list of hex digests to validate objects + (md5, sha1, sha256, sha384, sha512) (optional). """ # Default value to prevent dangerous default value From c4f4329bafafd9a608c1c1ca27e23a4c3eb65d18 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 29 Dec 2023 15:27:02 -0800 Subject: [PATCH 075/420] Rename 'client' module to 'hashstoreclient' and update tests --- .../{client.py => hashstoreclient.py} | 0 tests/test_hashstore_client.py | 18 +++++++++--------- 2 files changed, 9 insertions(+), 9 deletions(-) rename src/hashstore/{client.py => hashstoreclient.py} (100%) diff --git a/src/hashstore/client.py b/src/hashstore/hashstoreclient.py similarity index 100% rename from src/hashstore/client.py rename to src/hashstore/hashstoreclient.py diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 96c9ad45..8f176452 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -2,7 +2,7 @@ import sys import os from pathlib import Path -from hashstore import client +from hashstore import hashstoreclient def test_create_hashstore(tmp_path): @@ -29,7 +29,7 @@ def test_create_hashstore(tmp_path): sys.path.append(client_directory) # Manually change sys args to simulate command line arguments sys.argv = chs_args - client.main() + hashstoreclient.main() hashstore_yaml = Path(client_test_store + "/hashstore.yaml") hashstore_object_path = Path(client_test_store + "/objects") @@ -66,7 +66,7 @@ def test_get_checksum(capsys, store, pids): 
sys.path.append(client_directory) # Manually change sys args to simulate command line arguments sys.argv = chs_args - client.main() + hashstoreclient.main() capsystext = capsys.readouterr().out expected_output = ( @@ -99,7 +99,7 @@ def test_store_object(store, pids): sys.path.append(client_directory) # Manually change sys args to simulate command line arguments sys.argv = chs_args - client.main() + hashstoreclient.main() assert store.exists("objects", pids[pid][store.algorithm]) @@ -131,7 +131,7 @@ def test_store_metadata(store, pids): sys.path.append(client_directory) # Manually change sys args to simulate command line arguments sys.argv = chs_args - client.main() + hashstoreclient.main() assert store.exists("metadata", pids[pid]["metadata_cid"]) @@ -159,7 +159,7 @@ def test_retrieve_objects(capsys, pids, store): sys.path.append(client_directory) # Manually change sys args to simulate command line arguments sys.argv = chs_args - client.main() + hashstoreclient.main() object_stream = store.retrieve_object(pid) object_content = ( @@ -201,7 +201,7 @@ def test_retrieve_metadata(capsys, pids, store): sys.path.append(client_directory) # Manually change sys args to simulate command line arguments sys.argv = chs_args - client.main() + hashstoreclient.main() metadata_stream = store.retrieve_metadata(pid, namespace) metadata_content = ( @@ -239,7 +239,7 @@ def test_delete_objects(pids, store): sys.path.append(client_directory) # Manually change sys args to simulate command line arguments sys.argv = chs_args - client.main() + hashstoreclient.main() assert not store.exists("objects", pids[pid][store.algorithm]) @@ -271,6 +271,6 @@ def test_delete_metadata(pids, store): sys.path.append(client_directory) # Manually change sys args to simulate command line arguments sys.argv = chs_args - client.main() + hashstoreclient.main() assert not store.exists("metadata", pids[pid]["metadata_cid"]) From 255673e51a9b2f64e05e552ad25de2e18ae7ae78 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: 
Fri, 29 Dec 2023 15:36:08 -0800 Subject: [PATCH 076/420] Update README.md client section and add new 'HashStore Overview' section --- README.md | 156 ++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 134 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 0ddaac61..68a93a0a 100644 --- a/README.md +++ b/README.md @@ -16,20 +16,33 @@ DataONE in general, and HashStore in particular, are open source, community proj Documentation is a work in progress, and can be found on the [Metacat repository](https://github.com/NCEAS/metacat/blob/feature-1436-storage-and-indexing/docs/user/metacat/source/storage-subsystem.rst#physical-file-layout) as part of the storage redesign planning. Future updates will include documentation here as the package matures. -## Development build +## HashStore Overview -HashStore is a python package, and built using the [Python Poetry](https://python-poetry.org) build tool. +HashStore is a content-addressable file management system that utilizes the content identifier of an object to address files. The system stores both objects, references (refs) and metadata in its respective directories and provides an API for interacting with the store. HashStore storage classes (like `FileHashStore`) must implement the HashStore interface to ensure the expected usage of HashStore. -To install `hashstore` locally, create a virtual environment for python 3.9+, -install poetry, and then install or build the package with `poetry install` or `poetry build`, respectively. +###### Public API Methods +- store_object +- verify_object +- tag_object +- find_object +- store_metadata +- retrieve_object +- retrieve_metadata +- delete_object +- delete_metadata +- get_hex_digest + +For details, please see the HashStore interface (HashStore.java) -To run tests, navigate to the root directory and run `pytest -s`. 
The test suite contains tests that -take a longer time to run (relating to the storage of large files) - to execute all tests, run -`pytest --run-slow`. To see detailed -## Usage Example +###### How do I create a HashStore? -To view more details about the Public API - see 'hashstore.py` interface documentation +To create or interact with a HashStore, instantiate a HashStore object with the following set of properties: +- store_path +- store_depth +- store_width +- store_algorithm +- store_metadata_namespace ```py from hashstore import HashStoreFactory @@ -54,49 +67,148 @@ my_store = factory.get_hashstore(module_name, class_name, properties) # Store objects (.../[hashstore_path]/objects/) pid = "j.tao.1700.1" object = "/path/to/your/object.data" -hash_address = my_store.store_object(pid, object) -object_cid = hash_address.id +object_metadata = my_store.store_object(pid, object) +object_cid = object_metadata.id # Store metadata (.../[hashstore_path]/metadata/) # By default, storing metadata will use the given properties namespace `format_id` pid = "j.tao.1700.1" sysmeta = "/path/to/your/sysmeta/document.xml" metadata_cid = my_store.store_metadata(pid, sysmeta) -``` -If you want to store other types of metadata, add an additional `format_id`. -```py +# If you want to store other types of metadata, add an additional `format_id`. pid = "j.tao.1700.1" metadata = "/path/to/your/metadata/document.json" format_id = "http://custom.metadata.com/json/type/v1.0" metadata_cid = my_store.store_metadata(pid, metadata, format_id) + +# ... +``` + +###### Working with objects (store, retrieve, delete) + +In HashStore, objects are first saved as temporary files while their content identifiers are calculated. Once the default hash algorithm list and their hashes are generated, objects are stored in their permanent location using the store's algorithm's corresponding hash value, the store depth and the store width. 
Lastly, reference files are created for the object so that they can be found and retrieved given an identifier (ex. persistent identifier (pid)). Note: Objects are also stored once and only once. + +By calling the various interface methods for `store_object`, the calling app/client can validate, store and tag an object simultaneously if the relevant data is available. In the absence of an identfiier (ex. persistent identifier (pid)), `store_object` can be called to solely store an object. The client is then expected to call `verify_object` when the relevant metadata is available to confirm that the object has been stored as expected. And to finalize the process (to make the object discoverable), the client calls `tag_object``. In summary, there are two expected paths to store an object: + +```py +# All-in-one process which stores, validates and tags an object +objectMetadata objInfo = store_object(InputStream, pid, additionalAlgorithm, checksum, checksumAlgorithm, objSize) + +# Manual Process +# Store object +obj_metadata = store_object(InputStream) +# Validate object, throws exceptions if there is a mismatch and deletes the associated file +verify_object(objInfo, checksum, checksumAlgorithn, objSize) +# Tag object, makes the object discoverable (find, retrieve, delete) +tag_object(pid, cid) +``` + +**How do I retrieve an object if I have the pid?** +- To retrieve an object, call the Public API method `retrieve_object` which opens a stream to the object if it exists. + +**How do I find an object or check that it exists if I have the pid?** +- To find the location of the object, call the Public API method `find_object` which will return the content identifier (cid) of the object. +- This cid can then be used to locate the object on disk by following HashStore's store configuration. 
+ +**How do I delete an object if I have the pid?** +- To delete an object, call the Public API method `delete_object` which will delete the object and its associated references and reference files where relevant. +- Note, `delete_object` and `tag_object` calls are synchronized on their content identifier values so that the shared reference files are not unintentionally modified concurrently. An object that is in the process of being deleted should not be tagged, and vice versa. These calls have been implemented to occur sequentially to improve clarity in the event of an unexpected conflict or issue. + + +###### Working with metadata (store, retrieve, delete) + +HashStore's '/metadata' directory holds all metadata for objects stored in HashStore. To differentiate between metadata documents for a given object, HashStore includes the 'format_id' (format or namespace of the metadata) when generating the address of the metadata document to store (the hash of the 'pid' + 'format_id'). By default, calling `store_metadata` will use HashStore's default metadata namespace as the 'format_id' when storing metadata. Should the calling app wish to store multiple metadata files about an object, the client app is expected to provide a 'format_id' that represents an object format for the metadata type (ex. `store_metadata(stream, pid, format_id)`). + +**How do I retrieve a metadata file?** +- To find a metadata object, call the Public API method `retrieve_metadata` which returns a stream to the metadata file that's been stored with the default metadata namespace if it exists. +- If there are multiple metadata objects, a 'format_id' must be specified when calling `retrieve_metadata` (ex. `retrieve_metadata(pid, format_id)`) + +**How do I delete a metadata file?** +- Like `retrieve_metadata`, call the Public API method `delete_metadata` which will delete the metadata object associated with the given pid. 
+- If there are multiple metadata objects, a 'format_id' must be specified when calling `delete_metadata` to ensure the expected metadata object is deleted. + + +###### What are HashStore reference files? + +HashStore assumes that every object to store has a respective identifier. This identifier is then used when storing, retrieving and deleting an object. In order to facilitate this process, we create two types of reference files: +- pid (persistent identifier) reference files +- cid (content identifier) reference files + +These reference files are implemented in HashStore underneath the hood with no expectation for modification from the calling app/client. The one and only exception to this process when the calling client/app does not have an identifier, and solely stores an objects raw bytes in HashStore (calling `store_object(stream)`). + +**'pid' Reference Files** +- Pid (persistent identifier) reference files are created when storing an object with an identifier. +- Pid reference files are located in HashStores '/refs/pid' directory +- If an identifier is not available at the time of storing an object, the calling app/client must create this association between a pid and the object it represents by calling `tag_object` separately. +- Each pid reference file contains a string that represents the content identifier of the object it references +- Like how objects are stored once and only once, there is also only one pid reference file for each object. + +**'cid' Reference Files** +- Cid (content identifier) reference files are created at the same time as pid reference files when storing an object with an identifier. +- Cid reference files are located in HashStore's '/refs/cid' directory +- A cid reference file is a list of all the pids that reference a cid, delimited by a new line ("\n") character + + +###### What does HashStore look like? + +``` +# Example layout in HashStore with a single file stored along with its metadata and reference files. 
+# This uses a store depth of 3, with a width of 2 and "SHA-256" as its default store algorithm +## Notes: +## - Objects are stored using their content identifier as the file address +## - The reference file for each pid contains a single cid +## - The reference file for each cid contains multiple pids each on its own line + +.../metacat/hashstore/ +└─ objects + └─ /d5/95/3b/d802fa74edea72eb941...00d154a727ed7c2 +└─ metadata + └─ /15/8d/7e/55c36a810d7c14479c9...b20d7df66768b04 +└─ refs + └─ pid/0d/55/5e/d77052d7e166017f779...7230bcf7abcef65e + └─ cid/d5/95/3b/d802fa74edea72eb941...00d154a727ed7c2 +hashstore.yaml ``` +## Development build + +HashStore is a python package, and built using the [Python Poetry](https://python-poetry.org) build tool. + +To install `hashstore` locally, create a virtual environment for python 3.9+, +install poetry, and then install or build the package with `poetry install` or `poetry build`, respectively. + +To run tests, navigate to the root directory and run `pytest -s`. The test suite contains tests that +take a longer time to run (relating to the storage of large files) - to execute all tests, run +`pytest --run-slow`. 
To see detailed test output, run `pytest` with the `-v` (verbose) flag.
-pid=content_identifier # Delete a metadata file -$ python './src/hashstore/client.py' /path/to/store/ -deletemetadata -pid=content_identifier -formatid=http://ns.dataone.org/service/types/v2.0 +$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -deletemetadata -pid=content_identifier -formatid=http://ns.dataone.org/service/types/v2.0 ``` ## License From f704389d4675307109deb3f106f78e639153ecaa Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 29 Dec 2023 15:39:17 -0800 Subject: [PATCH 077/420] Refactor delete method to not remove empty directories --- src/hashstore/filehashstore.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index cec12da5..108a406e 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1857,10 +1857,12 @@ def delete(self, entity, file): try: os.remove(realpath) - except OSError: - pass - else: - self._remove_empty(os.path.dirname(realpath)) + except OSError as err: + exception_string = ( + f"FileHashStore - delete(): Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise err def _remove_empty(self, subpath): """Successively remove all empty folders starting with `subpath` and From a9fd5bf5d7c6b16ce443e4ee759ab461ed03727f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 30 Dec 2023 11:54:11 -0800 Subject: [PATCH 078/420] Fix bug in 'tag_object' where a pid refs file isn't created when updating a cid refs file that exists --- src/hashstore/filehashstore.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 108a406e..37e08fbf 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -541,29 +541,42 @@ def tag_object(self, pid, cid): try: pid_ref_abs_path = self.get_refs_abs_path("pid", pid) cid_ref_abs_path = self.get_refs_abs_path("cid", cid) + # 
Ensure refs tmp folder exists + tmp_root_path = self.get_store_path("refs") / "tmp" + if os.path.exists(tmp_root_path) is False: + self.create_path(tmp_root_path) + + # Proceed to tagging process if os.path.exists(pid_ref_abs_path): + print("Path exists:\n") + print(pid_ref_abs_path) # A pid reference file can only contain one cid exception_string = ( - "FileHashStore - write_pid_refs_file: pid ref file already exists for %s", - pid_ref_abs_path, + "FileHashStore - write_pid_refs_file: pid ref file already exists for" + + pid_ref_abs_path ) logging.error(exception_string) raise FileExistsError(exception_string) elif os.path.exists(cid_ref_abs_path): + # Create the pid refs file + pid_tmp_file = self._mktmpfile(tmp_root_path) + pid_tmp_file_path = pid_tmp_file.name + self._write_pid_refs_file(pid_tmp_file_path, cid) + # Create path for pid ref file in '.../refs/pid' + self.create_path(os.path.dirname(pid_ref_abs_path)) + shutil.move(pid_tmp_file_path, pid_ref_abs_path) # Update cid ref files if it already exists self._update_cid_refs(cid_ref_abs_path, pid) + # Verify refs file content + self._verify_hashstore_references(pid, cid) logging.info( "FileHashStore - tag_object: Successfully updated cid: %s with pid: %s", cid, pid, ) + return True else: # All ref files begin as tmp files and get moved sequentially at once - # Ensure refs tmp folder exists - tmp_root_path = self.get_store_path("refs") / "tmp" - if os.path.exists(tmp_root_path) is False: - self.create_path(tmp_root_path) - # Then write pid_refs_file content into tmp file pid_tmp_file = self._mktmpfile(tmp_root_path) pid_tmp_file_path = pid_tmp_file.name From 44c555bb1e9dd0b2c806fcb53231b33540748c70 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 30 Dec 2023 13:01:23 -0800 Subject: [PATCH 079/420] Refactor '_update_cid_refs' to log warning and not throw exception if pid already exists in cid refs file --- src/hashstore/filehashstore.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 37e08fbf..517886b3 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1240,11 +1240,12 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): for _, line in enumerate(f, start=1): value = line.strip() if pid == value: - err_msg = ( + warning_msg = ( f"FileHashStore - update_cid_refs: pid ({pid}) already reference in" + f" cid reference file: {cid_ref_abs_path} " ) - raise ValueError(err_msg) + logging.warning(warning_msg) + return with open(cid_ref_abs_path, "a+", encoding="utf8") as cid_ref_file: fcntl.flock(cid_ref_file, fcntl.LOCK_EX) From 66a9990f3a05adbf21d8592f6f93707c84df9f47 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 30 Dec 2023 13:02:25 -0800 Subject: [PATCH 080/420] Add new pytests for updates to 'tag_object' method --- tests/test_filehashstore_references.py | 79 +++++++++++++++++++++----- 1 file changed, 65 insertions(+), 14 deletions(-) diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index e4974bcc..e3f81398 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -96,20 +96,70 @@ def test_tag_object_cid_refs_file_exists(pids, store): assert not os.path.exists(second_cid_hash) -def test_tag_object_cid_refs_update(pids, store): +def test_tag_object_cid_refs_update_cid_refs_updated(store): """Test tag object updates a cid reference file that already exists.""" test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(None, path) - cid = object_metadata.id - store.tag_object(pid, cid) - store.tag_object("dou.test.1", cid) - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - with open(cid_ref_abs_path, "r", encoding="utf8") as f: - cid_ref_file_pid = f.read() + pid = "jtao.1700.1" + path = test_dir + pid.replace("/", "_") + # Store data only + object_metadata = 
store.store_object(None, path) + cid = object_metadata.id + # Tag object + store.tag_object(pid, cid) + # Tag the cid with another pid + additional_pid = "dou.test.1" + store.tag_object(additional_pid, cid) + + # Read cid file to confirm cid refs file contains the additional pid + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + value = line.strip() + assert value == pid or value == additional_pid + + +def test_tag_object_cid_refs_update_pid_refs_created(store): + """Test tag object creates a pid reference file when called to tag an object + that already exists.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid.replace("/", "_") + # Store data only + object_metadata = store.store_object(None, path) + cid = object_metadata.id + # Tag object + store.tag_object(pid, cid) + # Tag the cid with another pid + additional_pid = "dou.test.1" + store.tag_object(additional_pid, cid) + + pid_refs_file_path = store.get_refs_abs_path("pid", additional_pid) + assert os.path.exists(pid_refs_file_path) + + +def test_tag_object_cid_refs_update_pid_found_but_file_missing(store): + """Test that tag_object creates a missing pid refs file that somehow disappeared + when called to tag a cid that already contains the pid.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(None, path) + store.tag_object(pid, object_metadata.id) + cid = object_metadata.id + # Manually update the cid refs, pid refs file missing at this point + additional_pid = "dou.test.1" + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store._update_cid_refs(cid_ref_abs_path, additional_pid) + + # Confirm the pid refs file is missing + pid_refs_file_path = store.get_refs_abs_path("pid", additional_pid) + assert not os.path.exists(pid_refs_file_path) - assert "dou.test.1" in cid_ref_file_pid + # 
Call tag_object, this should create the missing pid refs file + store.tag_object(additional_pid, cid) + + # Confirm it has been created + assert os.path.exists(pid_refs_file_path) def test_verify_object(pids, store): @@ -278,14 +328,15 @@ def test_update_cid_refs_content_multiple(pids, store): def test_update_cid_refs_content_pid_exists(pids, store): - """Test that update_cid_ref throws exception if pid already exists.""" + """Test that update_cid_ref does not throw exception if pid already exists + and proceeds to complete the tagging process (verify_object)""" for pid in pids.keys(): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) store._write_cid_refs_file(cid_ref_abs_path, pid) - with pytest.raises(ValueError): - store._update_cid_refs(cid_ref_abs_path, pid) + # Exception should not be thrown + store._update_cid_refs(cid_ref_abs_path, pid) def test_update_cid_refs_content_cid_refs_does_not_exist(pids, store): From 13a8402dc501e55b2f6be76efd9b23ba65a62a51 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 30 Dec 2023 14:52:28 -0800 Subject: [PATCH 081/420] Refactor '_validate_arg_object' method which is called by 'verify_object' to delete object in question if determined to be invalid --- src/hashstore/filehashstore.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 517886b3..f5f4144c 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -505,7 +505,7 @@ def verify_object( pid=None, checksum=checksum, checksum_algorithm=checksum_algorithm_checked, - entity=None, + entity="objects", hex_digests=object_metadata_hex_digests, tmp_file_name=None, tmp_file_size=object_metadata_file_size, @@ -1572,7 +1572,8 @@ def _validate_arg_object( tmp_file_size, file_size_to_validate, ): - """Evaluates an object's integrity and raises an exception if there is a mismatch. 
+ """Evaluates an object's integrity - if there is a mismatch, deletes the object + in question and raises an exception. :param str pid: For logging purposes. :param str checksum: Value of the checksum to check. @@ -1619,6 +1620,7 @@ def _validate_arg_object( + f" HexDigest: {hex_digest_stored}." ) if pid is not None: + # Delete the tmp file self.delete(entity, tmp_file_name) exception_string_for_pid = ( exception_string + f"Tmp file ({tmp_file_name}) deleted." @@ -1626,6 +1628,10 @@ def _validate_arg_object( logging.error(exception_string_for_pid) raise ValueError(exception_string_for_pid) else: + # Delete the object + cid = hex_digests[self.algorithm] + cid_abs_path = self.get_refs_abs_path("cid", cid) + self.delete(entity, cid_abs_path) logging.error(exception_string) raise ValueError(exception_string) From 696a22b50295429946bf9d4e0ec3c9290febbc9b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 30 Dec 2023 14:53:17 -0800 Subject: [PATCH 082/420] Update pytests for changes to validating an object process --- tests/test_filehashstore_references.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index e3f81398..dfbe8183 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -168,8 +168,6 @@ def test_verify_object(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(data=path) - cid = object_metadata.id - store.tag_object(pid, cid) checksum = object_metadata.hex_digests.get(store.algorithm) checksum_algorithm = store.algorithm expected_file_size = object_metadata.obj_size @@ -185,8 +183,6 @@ def test_verify_object_exception_incorrect_object_metadata_type(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(data=path) - cid = object_metadata.id - store.tag_object(pid, cid) checksum = 
object_metadata.hex_digests.get(store.algorithm) checksum_algorithm = store.algorithm expected_file_size = object_metadata.obj_size @@ -202,13 +198,16 @@ def test_verify_object_exception_incorrect_size(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(data=path) - cid = object_metadata.id - store.tag_object(pid, cid) checksum = object_metadata.hex_digests.get(store.algorithm) checksum_algorithm = store.algorithm with pytest.raises(ValueError): store.verify_object(object_metadata, checksum, checksum_algorithm, 1000) + cid = object_metadata.id + cid = object_metadata.hex_digests[store.algorithm] + cid_abs_path = store.get_refs_abs_path("cid", cid) + assert not os.path.exists(cid_abs_path) + def test_verify_object_exception_incorrect_checksum(pids, store): """Test verify object raises exception when incorrect checksum is supplied.""" @@ -225,6 +224,11 @@ def test_verify_object_exception_incorrect_checksum(pids, store): object_metadata, "abc123", checksum_algorithm, expected_file_size ) + cid = object_metadata.id + cid = object_metadata.hex_digests[store.algorithm] + cid_abs_path = store.get_refs_abs_path("cid", cid) + assert not os.path.exists(cid_abs_path) + def test_verify_object_exception_incorrect_checksum_algo(pids, store): """Test verify object raises exception when incorrect algorithm is supplied.""" @@ -232,8 +236,6 @@ def test_verify_object_exception_incorrect_checksum_algo(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(data=path) - cid = object_metadata.id - store.tag_object(pid, cid) checksum = object_metadata.hex_digests.get(store.algorithm) expected_file_size = object_metadata.obj_size with pytest.raises(ValueError): From 382c838dc179b7046a8983fb856fedac53e1a9f2 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 30 Dec 2023 15:02:48 -0800 Subject: [PATCH 083/420] Update README.md --- README.md | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 68a93a0a..10f4d56a 100644 --- a/README.md +++ b/README.md @@ -89,7 +89,7 @@ metadata_cid = my_store.store_metadata(pid, metadata, format_id) In HashStore, objects are first saved as temporary files while their content identifiers are calculated. Once the default hash algorithm list and their hashes are generated, objects are stored in their permanent location using the store's algorithm's corresponding hash value, the store depth and the store width. Lastly, reference files are created for the object so that they can be found and retrieved given an identifier (ex. persistent identifier (pid)). Note: Objects are also stored once and only once. -By calling the various interface methods for `store_object`, the calling app/client can validate, store and tag an object simultaneously if the relevant data is available. In the absence of an identfiier (ex. persistent identifier (pid)), `store_object` can be called to solely store an object. The client is then expected to call `verify_object` when the relevant metadata is available to confirm that the object has been stored as expected. And to finalize the process (to make the object discoverable), the client calls `tag_object``. In summary, there are two expected paths to store an object: +By calling the various interface methods for `store_object`, the calling app/client can validate, store and tag an object simultaneously if the relevant data is available. In the absence of an identfiier (ex. persistent identifier (pid)), `store_object` can be called to solely store an object. The client is then expected to call `verify_object` when the relevant metadata is available to confirm that the object has been stored as expected. If the object is determined to be invalid (via `verify_object`), it will be deleted. Lastly, to finalize this process of storing an object (to make the object discoverable), the client calls `tag_object`. 
In summary, there are two expected paths to store an object: ```py # All-in-one process which stores, validates and tags an object From d681fbf5e78637f694742409e71f7b51e53279da Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 2 Jan 2024 13:59:32 -0800 Subject: [PATCH 084/420] Remove old pattern of checking for objects or metadata existence before refining the 'metacat_obj_list' for refine_list methods --- src/hashstore/hashstoreclient.py | 94 +++++++++++--------------------- 1 file changed, 33 insertions(+), 61 deletions(-) diff --git a/src/hashstore/hashstoreclient.py b/src/hashstore/hashstoreclient.py index e2f4e5ef..2665c574 100644 --- a/src/hashstore/hashstoreclient.py +++ b/src/hashstore/hashstoreclient.py @@ -264,10 +264,11 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num): :param str obj_type: Type of objects ('object' or 'metadata'). :param int num: Number of files to store. """ - info_msg = f"HashStore Client - Begin storing {obj_type} objects." + info_msg = f"HashStoreClient - Begin storing {obj_type} objects." 
logging.info(info_msg) # Object and Metadata list metacat_obj_list = self.metacatdb.get_object_metadata_list(origin_dir, num) + logging.info(info_msg) # Get list of objects to store from metacat db if obj_type == self.OBJ_TYPE: @@ -619,35 +620,25 @@ def refine_list_for_objects(self, metacat_obj_list, action): item_checksum_algorithm = tuple_item[4] if os.path.exists(filepath_docid_rev): if action == "store": - # If the file has already been stored, skip it - if not self.hashstore.exists( - "objects", self.hashstore.get_sha256_hex_digest(pid_guid) - ): - # This tuple is formed to match 'HashStore' store_object's signature - # Which is '.starmap()'ed when called - store_object_tuple_item = ( - pid_guid, - filepath_docid_rev, - None, - item_checksum, - item_checksum_algorithm, - ) - refined_object_list.append(store_object_tuple_item) + # This tuple is formed to match 'HashStore' store_object's signature + # Which is '.starmap()'ed when called + store_object_tuple_item = ( + pid_guid, + filepath_docid_rev, + None, + item_checksum, + item_checksum_algorithm, + ) + refined_object_list.append(store_object_tuple_item) if action == "retrieve": - if self.hashstore.exists( - "objects", self.hashstore.get_sha256_hex_digest(pid_guid) - ): - retrieve_object_tuple_item = ( - pid_guid, - item_checksum_algorithm, - item_checksum, - ) - refined_object_list.append(retrieve_object_tuple_item) + retrieve_object_tuple_item = ( + pid_guid, + item_checksum_algorithm, + item_checksum, + ) + refined_object_list.append(retrieve_object_tuple_item) if action == "delete": - if self.hashstore.exists( - "objects", self.hashstore.get_sha256_hex_digest(pid_guid) - ): - refined_object_list.append(pid_guid) + refined_object_list.append(pid_guid) return refined_object_list @@ -672,41 +663,22 @@ def refine_list_for_metadata(self, metacat_obj_list, action): item_checksum_algorithm = tuple_item[4] if os.path.exists(filepath_docid_rev): if action == "store": - # If the file has already been stored, skip 
it - if not self.hashstore.exists( - "metadata", - self.hashstore.get_sha256_hex_digest( - pid_guid + metadata_namespace - ), - ): - tuple_item = (pid_guid, filepath_docid_rev, metadata_namespace) - refined_metadata_list.append(tuple_item) + tuple_item = (pid_guid, filepath_docid_rev, metadata_namespace) + refined_metadata_list.append(tuple_item) if action == "retrieve": - if self.hashstore.exists( - "metadata", - self.hashstore.get_sha256_hex_digest( - pid_guid + metadata_namespace - ), - ): - tuple_item = ( - pid_guid, - metadata_namespace, - item_checksum, - item_checksum_algorithm, - ) - refined_metadata_list.append(tuple_item) + tuple_item = ( + pid_guid, + metadata_namespace, + item_checksum, + item_checksum_algorithm, + ) + refined_metadata_list.append(tuple_item) if action == "delete": - if self.hashstore.exists( - "metadata", - self.hashstore.get_sha256_hex_digest( - pid_guid + metadata_namespace - ), - ): - tuple_item = ( - pid_guid, - metadata_namespace, - ) - refined_metadata_list.append(tuple_item) + tuple_item = ( + pid_guid, + metadata_namespace, + ) + refined_metadata_list.append(tuple_item) return refined_metadata_list From ed36b2a970a8d57ad91cd43d0e7a5116a83b74e7 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 2 Jan 2024 15:07:14 -0800 Subject: [PATCH 085/420] Refactor '_delete_cid_refs_file' to return correct False boolean rather than raise an OSError when called to delete a file that still contains references, and update pytests --- src/hashstore/filehashstore.py | 4 ++-- tests/test_filehashstore_interface.py | 11 +++-------- tests/test_filehashstore_references.py | 4 ++-- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index f5f4144c..ec450935 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1330,8 +1330,8 @@ def _delete_cid_refs_file(self, cid_ref_abs_path): "FileHashStore - _delete_cid_refs_file: Failed to delete cid reference 
file." + f" File is not empty: {cid_ref_abs_path} " ) - logging.error(err_msg) - raise OSError(err_msg) + logging.warning(err_msg) + return False else: os.remove(cid_ref_abs_path) return True diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index c06c23d1..32cf661a 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -839,21 +839,16 @@ def test_delete_objects_cid_refs_file(pids, store): def test_delete_objects_cid_refs_file_with_pid_refs_remaining(pids, store): """Test delete_object does not delete the cid refs file that still contains ref.""" test_dir = "tests/testdata/" - format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename object_metadata = store.store_object(pid, path) cid = object_metadata.id cid_refs_abs_path = store.get_refs_abs_path("cid", cid) # pylint: disable=W0212 store._update_cid_refs(cid_refs_abs_path, "dou.test.1") - _metadata_cid = store.store_metadata(pid, syspath, format_id) - with pytest.raises(OSError): - store.delete_object(pid) - cid_refs_file_path = store.get_refs_abs_path("cid", cid) - assert os.path.exists(cid_refs_file_path) + store.delete_object(pid) + cid_refs_file_path = store.get_refs_abs_path("cid", cid) + assert os.path.exists(cid_refs_file_path) def test_delete_object_pid_empty(store): diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index dfbe8183..c4cb4788 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -404,8 +404,8 @@ def test_delete_cid_refs_file_file_not_empty(pids, store): cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) store._write_cid_refs_file(cid_ref_abs_path, pid) - with pytest.raises(OSError): - 
store._delete_cid_refs_file(cid_ref_abs_path) + is_cid_refs_file_deleted = store._delete_cid_refs_file(cid_ref_abs_path) + assert not is_cid_refs_file_deleted def test_delete_cid_refs_file_file_not_found(pids, store): From 6aab6887a72121e4eef54cb5b993ce4336d9347a Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 2 Jan 2024 17:27:10 -0800 Subject: [PATCH 086/420] Add .close() calls to all methods which opens files to be safe --- src/hashstore/filehashstore.py | 40 ++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index ec450935..88682395 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -144,8 +144,11 @@ def load_properties(self): logging.critical(exception_string) raise FileNotFoundError(exception_string) # Open file - with open(self.hashstore_configuration_yaml, "r", encoding="utf-8") as file: - yaml_data = yaml.safe_load(file) + with open( + self.hashstore_configuration_yaml, "r", encoding="utf-8" + ) as hs_yaml_file: + yaml_data = yaml.safe_load(hs_yaml_file) + hs_yaml_file.close() # Get hashstore properties hashstore_yaml_dict = {} @@ -221,8 +224,9 @@ def write_properties(self, properties): # Write 'hashstore.yaml' with open( self.hashstore_configuration_yaml, "w", encoding="utf-8" - ) as hashstore_yaml: - hashstore_yaml.write(hashstore_configuration_yaml) + ) as hs_yaml_file: + hs_yaml_file.write(hashstore_configuration_yaml) + hs_yaml_file.close() logging.debug( "FileHashStore - write_properties: Configuration file written to: %s", @@ -387,8 +391,11 @@ def lookup_algo(algo_to_translate): ) logging.critical(exception_string) raise FileNotFoundError(exception_string) - with open(self.hashstore_configuration_yaml, "r", encoding="utf-8") as file: - yaml_data = yaml.safe_load(file) + with open( + self.hashstore_configuration_yaml, "r", encoding="utf-8" + ) as hs_yaml_file: + yaml_data = yaml.safe_load(hs_yaml_file) + 
hs_yaml_file.close() # Set default store algorithm self.algorithm = lookup_algo(yaml_data["store_algorithm"]) @@ -627,8 +634,9 @@ def find_object(self, pid): raise FileNotFoundError(err_msg) else: # Read the file to get the cid from the pid reference - with open(pid_ref_abs_path, "r", encoding="utf8") as f: - pid_refs_cid = f.read() + with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: + pid_refs_cid = pid_ref_file.read() + pid_ref_file.close() return pid_refs_cid @@ -1102,6 +1110,7 @@ def _write_to_tmp_file_and_get_hex_digests( tmp_file.write(self._to_bytes(data)) for hash_algorithm in hash_algorithms: hash_algorithm.update(self._to_bytes(data)) + tmp_file.close() logging.debug( "FileHashStore - _write_to_tmp_file_and_get_hex_digests: Object stream" + " successfully written to tmp file: %s", @@ -1206,6 +1215,7 @@ def _write_cid_refs_file(self, path, pid): # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) + cid_ref_file.close() return except Exception as err: @@ -1236,8 +1246,8 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): raise FileNotFoundError(exception_string) try: - with open(cid_ref_abs_path, "r", encoding="utf8") as f: - for _, line in enumerate(f, start=1): + with open(cid_ref_abs_path, "r", encoding="utf8") as cid_ref_file: + for _, line in enumerate(cid_ref_file, start=1): value = line.strip() if pid == value: warning_msg = ( @@ -1245,6 +1255,7 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): + f" cid reference file: {cid_ref_abs_path} " ) logging.warning(warning_msg) + cid_ref_file.close() return with open(cid_ref_abs_path, "a+", encoding="utf8") as cid_ref_file: @@ -1253,6 +1264,7 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) + cid_ref_file.close() return 
except Exception as err: @@ -1280,6 +1292,7 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): fcntl.flock(cid_ref_file, fcntl.LOCK_EX) # Read the ref file to see if the pid is already referencing the cid cid_ref_file_content = cid_ref_file.read() + cid_ref_file.close() if pid not in cid_ref_file_content: err_msg = ( @@ -1294,6 +1307,7 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) + cid_ref_file.close() return except Exception as err: @@ -1364,6 +1378,7 @@ def _write_pid_refs_file(self, path, cid): # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) + pid_ref_file.close() return except Exception as err: @@ -1693,11 +1708,12 @@ def _verify_hashstore_references(self, pid, cid): raise ValueError(exception_string) # Then the pid pid_found = False - with open(cid_ref_abs_path, "r", encoding="utf8") as f: - for _, line in enumerate(f, start=1): + with open(cid_ref_abs_path, "r", encoding="utf8") as cid_ref_file: + for _, line in enumerate(cid_ref_file, start=1): value = line.strip() if value == pid: pid_found = True + cid_ref_file.close() if not pid_found: exception_string = ( "FileHashStore - _verify_hashstore_references: Cid refs file exists" From 6cbe55361df391d51fdf6d4981be58431dcb09ba Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 3 Jan 2024 14:56:42 -0800 Subject: [PATCH 087/420] Refactor '_mktmpfile' method to minimize file descriptor OSErrors by implementing a while-try loop --- src/hashstore/filehashstore.py | 55 ++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 88682395..e5ca77f5 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ 
-1160,30 +1160,53 @@ def _write_to_tmp_file_and_get_hex_digests( ) logging.error(exception_string) - def _mktmpfile(self, path): + def _mktmpfile(self, path, max_retry=10, wait_delay=60): """Create a temporary file at the given path ready to be written. :param str path: Path to the file location. :return: file object - object with a file-like interface. """ - tmp = NamedTemporaryFile(dir=path, delete=False) - - # Delete tmp file if python interpreter crashes or thread is interrupted - def delete_tmp_file(): - if os.path.exists(tmp.name): - os.remove(tmp.name) + # HashStore's primary purpose of storing objects can require the usage of + # many file descriptors in concurrency/parallel settings, causing the limit + # allowed by an OS to be quickly reached (and preventing files from being stored) + # We will re-try generating the temp file to minimize non-critical errors. + # + # To see the limit on Linux, run the command `ulimit -n` + attempt = 1 + while attempt <= max_retry: + try: + tmp = NamedTemporaryFile(dir=path, delete=False) - atexit.register(delete_tmp_file) + # Delete tmp file if python interpreter crashes or thread is interrupted + def delete_tmp_file(): + if os.path.exists(tmp.name): + os.remove(tmp.name) - # Ensure tmp file is created with desired permissions - if self.fmode is not None: - oldmask = os.umask(0) - try: - os.chmod(tmp.name, self.fmode) - finally: - os.umask(oldmask) - return tmp + atexit.register(delete_tmp_file) + + # Ensure tmp file is created with desired permissions + if self.fmode is not None: + oldmask = os.umask(0) + try: + os.chmod(tmp.name, self.fmode) + finally: + os.umask(oldmask) + return tmp + except OSError as ose: + if "Too many open files" in str(ose): + warn_string = ( + f"FileHashStore - mktmpfile: {ose}." + + " Retrying after {wait_delay} seconds." 
+ ) + logging.warning(warn_string) + attempt += 1 + time.sleep(wait_delay) + else: + # If the error is not related to "Too many open files", raise it + exception_string = f"FileHashStore - mktmpfile: {ose}" + logging.error(exception_string) + raise ose def _write_cid_refs_file(self, path, pid): """Write the CID reference file in the supplied path to a file. A reference file From 8959d175a3ba645a7c3d992b39bdb297c497d1bb Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 3 Jan 2024 17:29:42 -0800 Subject: [PATCH 088/420] Refactor 'store_object' process to handle storing duplicate objects but with different pid references (instead of throwing an exception) and add new pytests --- src/hashstore/filehashstore.py | 35 ++++++++++--- tests/test_filehashstore.py | 7 ++- tests/test_filehashstore_interface.py | 72 +++++++++++++++++++++++++-- 3 files changed, 102 insertions(+), 12 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index e5ca77f5..a618f4a9 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1054,14 +1054,29 @@ def _move_and_get_checksums( logging.error("FileHashStore - _move_and_get_checksums: %s", err_msg) raise else: - # Else delete temporary file - exception_string = ( - "FileHashStore - _move_and_get_checksums: Object already exists at:" - + f" {abs_file_path}, deleting temporary file." 
- ) - logging.error(exception_string) - self.delete(entity, tmp_file_name) - raise FileExistsError(exception_string) + # If the file exists, determine if the object is what the client states it to be + try: + self._validate_arg_object( + pid, + checksum, + checksum_algorithm, + entity, + hex_digests, + tmp_file_name, + tmp_file_size, + file_size_to_validate, + ) + except Exception as ge: + # If any exception is thrown during validation, + exception_string = ( + "FileHashStore - _move_and_get_checksums: Object exists but cannot be verified" + + f" (validation error): {abs_file_path}, deleting temporary file. Error: {ge}" + ) + logging.error(exception_string) + raise FileExistsError from ge + finally: + # Delete the temporary file, it already exists so it is redundant + self.delete(entity, tmp_file_name) return object_cid, tmp_file_size, hex_digests @@ -2061,6 +2076,10 @@ def count(self, entity): directory_to_count = self.objects elif entity == "metadata": directory_to_count = self.metadata + elif entity == "pid": + directory_to_count = self.refs + "/pid" + elif entity == "cid": + directory_to_count = self.refs + "/cid" else: raise ValueError( f"entity: {entity} does not exist. Do you mean 'objects' or 'metadata'?" 
diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index ce04ecec..fe309245 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -430,7 +430,12 @@ def test_move_and_get_checksums_duplicates_raises_error(pids, store): input_stream = io.open(path, "rb") with pytest.raises(FileExistsError): # pylint: disable=W0212 - store._move_and_get_checksums(pid, input_stream) + store._move_and_get_checksums( + pid, + input_stream, + checksum="nonmatchingchecksum", + checksum_algorithm="sha256", + ) input_stream.close() assert store.count(entity) == 3 diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 32cf661a..6f418412 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -331,8 +331,72 @@ def test_store_object_checksum_incorrect_checksum(store): ) -def test_store_object_duplicate_raises_error(pids, store): - """Test store duplicate object throws FileExistsError.""" +def test_store_object_duplicate_does_not_store_duplicate(store): + """Test that storing duplicate object does not store object twice.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + entity = "objects" + # Store first blob + _object_metadata_one = store.store_object(pid, path) + # Store second blob + pid_that_refs_existing_cid = "dou.test.1" + _object_metadata_two = store.store_object(pid_that_refs_existing_cid, path) + # Confirm only one object exists and the tmp file created is deleted + assert store.count(entity) == 1 + + +def test_store_object_duplicate_references_files(pids, store): + """Test that storing duplicate object but different pid creates the expected + amount of reference files.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + # Store with first pid + _object_metadata_one = store.store_object(pid, path) + # Store with second pid + pid_two = "dou.test.1" + _object_metadata_two = store.store_object(pid_two, 
path) + # Store with third pid + pid_three = "dou.test.2" + _object_metadata_three = store.store_object(pid_three, path) + # Confirm that there are 3 pid reference files + assert store.count("pid") == 3 + # Confirm that there are 1 cid reference files + assert store.count("cid") == 1 + # Confirm the content of the cid refence files + cid_ref_abs_path = store.get_refs_abs_path("cid", pids[pid][store.algorithm]) + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + value = line.strip() + assert value == pid or value == pid_two or value == pid_three + + +def test_store_object_duplicate_references_content(pids, store): + """Test that storing duplicate object but different pid creates the expected + amount of reference files.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + # Store with first pid + store.store_object(pid, path) + # Store with second pid + pid_two = "dou.test.1" + store.store_object(pid_two, path) + # Store with third pid + pid_three = "dou.test.2" + store.store_object(pid_three, path) + # Confirm the content of the cid refence files + cid_ref_abs_path = store.get_refs_abs_path("cid", pids[pid][store.algorithm]) + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + value = line.strip() + assert value == pid or value == pid_two or value == pid_three + + +def test_store_object_duplicate_raises_error_with_bad_validation_data(pids, store): + """Test store duplicate object throws FileExistsError when object exists + but the data to validate against is incorrect.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -341,7 +405,9 @@ def test_store_object_duplicate_raises_error(pids, store): _object_metadata_one = store.store_object(pid, path) # Store second blob with pytest.raises(FileExistsError): - _object_metadata_two = store.store_object(pid, path) + _object_metadata_two = store.store_object( + pid, path, 
checksum="nonmatchingchecksum", checksum_algorithm="sha256" + ) assert store.count(entity) == 1 assert store.exists(entity, pids[pid][store.algorithm]) From a0fc7d079f08a53aa2c0ebe588dbac08658504e1 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 4 Jan 2024 09:18:57 -0800 Subject: [PATCH 089/420] Revise and add logging statements --- src/hashstore/filehashstore.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index a618f4a9..8cc42e44 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -476,6 +476,13 @@ def store_object( "FileHashStore - store_object: Successfully stored object for pid: %s", pid, ) + except Exception as err: + exception_string = ( + "FileHashStore - store_object: failed to store object." + + f" Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise err finally: # Release pid with self.object_lock: @@ -931,7 +938,7 @@ def store_data_only(self, data): # pylint: disable=W0718 except Exception as err: exception_string = ( - "FileHashStore - store_object: failed to store object." + "FileHashStore - store_object (store_data_only): failed to store object." + f" Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) @@ -1067,7 +1074,7 @@ def _move_and_get_checksums( file_size_to_validate, ) except Exception as ge: - # If any exception is thrown during validation, + # If any exception is thrown during validation, exception_string = ( "FileHashStore - _move_and_get_checksums: Object exists but cannot be verified" + f" (validation error): {abs_file_path}, deleting temporary file. 
Error: {ge}" From f8e168b4a4264fc36c97b36cd01d0b66f7ec6479 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 4 Jan 2024 10:31:51 -0800 Subject: [PATCH 090/420] Revert changes to '_mktmpfile' by removing while-try code --- src/hashstore/filehashstore.py | 57 ++++++++++------------------------ 1 file changed, 17 insertions(+), 40 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 8cc42e44..d5c054cc 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -478,7 +478,7 @@ def store_object( ) except Exception as err: exception_string = ( - "FileHashStore - store_object: failed to store object." + f"FileHashStore - store_object: failed to store object for pid: {pid}." + f" Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) @@ -1182,53 +1182,30 @@ def _write_to_tmp_file_and_get_hex_digests( ) logging.error(exception_string) - def _mktmpfile(self, path, max_retry=10, wait_delay=60): + def _mktmpfile(self, path): """Create a temporary file at the given path ready to be written. :param str path: Path to the file location. :return: file object - object with a file-like interface. """ - # HashStore's primary purpose of storing objects can require the usage of - # many file descriptors in concurrency/parallel settings, causing the limit - # allowed by an OS to be quickly reached (and preventing files from being stored) - # We will re-try generating the temp file to minimize non-critical errors. 
- # - # To see the limit on Linux, run the command `ulimit -n` - attempt = 1 - while attempt <= max_retry: - try: - tmp = NamedTemporaryFile(dir=path, delete=False) + tmp = NamedTemporaryFile(dir=path, delete=False) - # Delete tmp file if python interpreter crashes or thread is interrupted - def delete_tmp_file(): - if os.path.exists(tmp.name): - os.remove(tmp.name) + # Delete tmp file if python interpreter crashes or thread is interrupted + def delete_tmp_file(): + if os.path.exists(tmp.name): + os.remove(tmp.name) - atexit.register(delete_tmp_file) - - # Ensure tmp file is created with desired permissions - if self.fmode is not None: - oldmask = os.umask(0) - try: - os.chmod(tmp.name, self.fmode) - finally: - os.umask(oldmask) - return tmp - except OSError as ose: - if "Too many open files" in str(ose): - warn_string = ( - f"FileHashStore - mktmpfile: {ose}." - + " Retrying after {wait_delay} seconds." - ) - logging.warning(warn_string) - attempt += 1 - time.sleep(wait_delay) - else: - # If the error is not related to "Too many open files", raise it - exception_string = f"FileHashStore - mktmpfile: {ose}" - logging.error(exception_string) - raise ose + atexit.register(delete_tmp_file) + + # Ensure tmp file is created with desired permissions + if self.fmode is not None: + oldmask = os.umask(0) + try: + os.chmod(tmp.name, self.fmode) + finally: + os.umask(oldmask) + return tmp def _write_cid_refs_file(self, path, pid): """Write the CID reference file in the supplied path to a file. 
A reference file From ac9c8b912893dc69e7e9d3826ab9ab822ef927db Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 4 Jan 2024 11:06:53 -0800 Subject: [PATCH 091/420] Refactor 'with open(...)' statements to explicitly call '.close()' in finally blocks --- src/hashstore/filehashstore.py | 101 ++++++++++++++++++++------------- 1 file changed, 61 insertions(+), 40 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index d5c054cc..f0c536e4 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -143,11 +143,14 @@ def load_properties(self): ) logging.critical(exception_string) raise FileNotFoundError(exception_string) - # Open file - with open( - self.hashstore_configuration_yaml, "r", encoding="utf-8" - ) as hs_yaml_file: - yaml_data = yaml.safe_load(hs_yaml_file) + + try: + # Open file + with open( + self.hashstore_configuration_yaml, "r", encoding="utf-8" + ) as hs_yaml_file: + yaml_data = yaml.safe_load(hs_yaml_file) + finally: hs_yaml_file.close() # Get hashstore properties @@ -222,16 +225,18 @@ def write_properties(self, properties): store_metadata_namespace, ) # Write 'hashstore.yaml' - with open( - self.hashstore_configuration_yaml, "w", encoding="utf-8" - ) as hs_yaml_file: - hs_yaml_file.write(hashstore_configuration_yaml) + try: + with open( + self.hashstore_configuration_yaml, "w", encoding="utf-8" + ) as hs_yaml_file: + hs_yaml_file.write(hashstore_configuration_yaml) + logging.debug( + "FileHashStore - write_properties: Configuration file written to: %s", + self.hashstore_configuration_yaml, + ) + finally: hs_yaml_file.close() - logging.debug( - "FileHashStore - write_properties: Configuration file written to: %s", - self.hashstore_configuration_yaml, - ) return @staticmethod @@ -391,10 +396,13 @@ def lookup_algo(algo_to_translate): ) logging.critical(exception_string) raise FileNotFoundError(exception_string) - with open( - self.hashstore_configuration_yaml, "r", encoding="utf-8" - ) as 
hs_yaml_file: - yaml_data = yaml.safe_load(hs_yaml_file) + + try: + with open( + self.hashstore_configuration_yaml, "r", encoding="utf-8" + ) as hs_yaml_file: + yaml_data = yaml.safe_load(hs_yaml_file) + finally: hs_yaml_file.close() # Set default store algorithm @@ -641,8 +649,10 @@ def find_object(self, pid): raise FileNotFoundError(err_msg) else: # Read the file to get the cid from the pid reference - with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: - pid_refs_cid = pid_ref_file.read() + try: + with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: + pid_refs_cid = pid_ref_file.read() + finally: pid_ref_file.close() return pid_refs_cid @@ -1237,7 +1247,6 @@ def _write_cid_refs_file(self, path, pid): # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) - cid_ref_file.close() return except Exception as err: @@ -1248,6 +1257,9 @@ def _write_cid_refs_file(self, path, pid): logging.error(exception_string) raise err + finally: + cid_ref_file.close() + def _update_cid_refs(self, cid_ref_abs_path, pid): """Update an existing CID reference file with the given PID. 
@@ -1268,7 +1280,8 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): raise FileNotFoundError(exception_string) try: - with open(cid_ref_abs_path, "r", encoding="utf8") as cid_ref_file: + with open(cid_ref_abs_path, "a+", encoding="utf8") as cid_ref_file: + fcntl.flock(cid_ref_file, fcntl.LOCK_EX) for _, line in enumerate(cid_ref_file, start=1): value = line.strip() if pid == value: @@ -1280,13 +1293,11 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): cid_ref_file.close() return - with open(cid_ref_abs_path, "a+", encoding="utf8") as cid_ref_file: fcntl.flock(cid_ref_file, fcntl.LOCK_EX) cid_ref_file.write(pid + "\n") # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) - cid_ref_file.close() return except Exception as err: @@ -1297,6 +1308,9 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): logging.error(exception_string) raise err + finally: + cid_ref_file.close() + def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): """Delete a PID from a CID reference file. 
@@ -1308,13 +1322,11 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): pid, cid_ref_abs_path, ) - try: - with open(cid_ref_abs_path, "r", encoding="utf8") as cid_ref_file: + with open(cid_ref_abs_path, "r+", encoding="utf8") as cid_ref_file: fcntl.flock(cid_ref_file, fcntl.LOCK_EX) # Read the ref file to see if the pid is already referencing the cid cid_ref_file_content = cid_ref_file.read() - cid_ref_file.close() if pid not in cid_ref_file_content: err_msg = ( @@ -1322,14 +1334,15 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): + f" cid reference file: {cid_ref_abs_path} " ) raise ValueError(err_msg) - - with open(cid_ref_abs_path, "w", encoding="utf8") as cid_ref_file: - fcntl.flock(cid_ref_file, fcntl.LOCK_EX) - cid_ref_file.write(cid_ref_file_content.replace(pid + "\n", "")) - # The context manager will take care of releasing the lock - # But the code to explicitly release the lock if desired is below - # fcntl.flock(f, fcntl.LOCK_UN) - cid_ref_file.close() + else: + # Move the file pointer to the beginning for writing + cid_ref_file.seek(0) + cid_ref_file.write(cid_ref_file_content.replace(pid + "\n", "")) + # Ensure file ends where the new content ends + cid_ref_file.truncate() + # The context manager will take care of releasing the lock + # But the code to explicitly release the lock if desired is below + # fcntl.flock(f, fcntl.LOCK_UN) return except Exception as err: @@ -1340,6 +1353,9 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): logging.error(exception_string) raise err + finally: + cid_ref_file.close() + def _delete_cid_refs_file(self, cid_ref_abs_path): """Delete a CID reference file. There must be no references remaining. 
@@ -1400,7 +1416,6 @@ def _write_pid_refs_file(self, path, cid): # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) - pid_ref_file.close() return except Exception as err: @@ -1411,6 +1426,9 @@ def _write_pid_refs_file(self, path, cid): logging.error(exception_string) raise err + finally: + pid_ref_file.close() + def _delete_pid_refs_file(self, pid_ref_abs_path): """Delete a PID reference file. @@ -1730,12 +1748,15 @@ def _verify_hashstore_references(self, pid, cid): raise ValueError(exception_string) # Then the pid pid_found = False - with open(cid_ref_abs_path, "r", encoding="utf8") as cid_ref_file: - for _, line in enumerate(cid_ref_file, start=1): - value = line.strip() - if value == pid: - pid_found = True + try: + with open(cid_ref_abs_path, "r", encoding="utf8") as cid_ref_file: + for _, line in enumerate(cid_ref_file, start=1): + value = line.strip() + if value == pid: + pid_found = True + finally: cid_ref_file.close() + if not pid_found: exception_string = ( "FileHashStore - _verify_hashstore_references: Cid refs file exists" From 23f4841fcd189d8b1a47c7753550963afe96887d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 5 Jan 2024 09:09:46 -0800 Subject: [PATCH 092/420] Refactor '_write_cid_refs_file' to return a tmp file with the expected content and update pytests --- src/hashstore/filehashstore.py | 31 +++--- tests/test_filehashstore_references.py | 129 +++++++++---------------- 2 files changed, 59 insertions(+), 101 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index f0c536e4..14de7bab 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -603,10 +603,8 @@ def tag_object(self, pid, cid): pid_tmp_file = self._mktmpfile(tmp_root_path) pid_tmp_file_path = pid_tmp_file.name self._write_pid_refs_file(pid_tmp_file_path, cid) - # Then write cid_refs_file content into another tmp file 
- cid_tmp_file = self._mktmpfile(tmp_root_path) - cid_tmp_file_path = cid_tmp_file.name - self._write_cid_refs_file(cid_tmp_file_path, pid) + # Then get a cid_refs_file + cid_tmp_file_path = self._write_cid_refs_file(tmp_root_path, pid) # Create path for pid ref file in '.../refs/pid' self.create_path(os.path.dirname(pid_ref_abs_path)) @@ -1224,30 +1222,26 @@ def _write_cid_refs_file(self, path, pid): :param str path: Path of the file to be written into. :param str pid: Authority-based or persistent identifier of the object. + + :return: cid_tmp_file_path - Path to the cid tmp file """ logging.debug( "FileHashStore - write_cid_refs_file: Writing pid (%s) into file: %s", pid, path, ) - - if os.path.isfile(path): - if os.path.getsize(path) != 0: - err_msg = ( - "FileHashStore - _write_cid_refs_file: Failed to write cid reference file." - + f" File is not empty: {path} " - ) - logging.error(err_msg) - raise OSError(err_msg) + cid_tmp_file = self._mktmpfile(path) + cid_tmp_file_path = cid_tmp_file.name try: - with open(path, "w", encoding="utf8") as cid_ref_file: - fcntl.flock(cid_ref_file, fcntl.LOCK_EX) - cid_ref_file.write(pid + "\n") + with open(cid_tmp_file_path, "w", encoding="utf8") as tmp_cid_ref_file: + fcntl.flock(tmp_cid_ref_file, fcntl.LOCK_EX) + tmp_cid_ref_file.write(pid + "\n") # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) - return + tmp_cid_ref_file.close() + return cid_tmp_file_path except Exception as err: exception_string = ( @@ -1257,9 +1251,6 @@ def _write_cid_refs_file(self, path, pid): logging.error(exception_string) raise err - finally: - cid_ref_file.close() - def _update_cid_refs(self, cid_ref_abs_path, pid): """Update an existing CID reference file with the given PID. 
diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index c4cb4788..c5be4021 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -1,5 +1,6 @@ """Test module for FileHashStore's reference system to tag stored objects.""" import os +import shutil import pytest # pylint: disable=W0212 @@ -244,63 +245,31 @@ def test_verify_object_exception_incorrect_checksum_algo(pids, store): def test_write_cid_refs_file(pids, store): """Test that write_cid_reference writes a reference file.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store._write_cid_refs_file(cid_ref_abs_path, pid) - assert os.path.exists(cid_ref_abs_path) + tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, "test_pid") + assert os.path.exists(tmp_cid_refs_file) def test_write_cid_refs_file_content(pids, store): """Test that write_cid_ref_file writes the expected content.""" for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store._write_cid_refs_file(cid_ref_abs_path, pid) - - with open(cid_ref_abs_path, "r", encoding="utf8") as f: + tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) + with open(tmp_cid_refs_file, "r", encoding="utf8") as f: cid_ref_file_pid = f.read() assert pid == cid_ref_file_pid.strip() -def test_write_cid_refs_file_into_empty_file(pids, store): - """Test that write_cid_reference writes an empty file.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - with open(cid_ref_abs_path, "w", encoding="utf8"): - pass - 
store._write_cid_refs_file(cid_ref_abs_path, pid) - assert os.path.exists(cid_ref_abs_path) - - -def test_write_cid_refs_file_file_not_empty(pids, store): - """Test that write_cid_reference does not overwrite an existing file.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store._write_cid_refs_file(cid_ref_abs_path, pid) - with pytest.raises(OSError): - store._write_cid_refs_file(cid_ref_abs_path, "other_pid") - - def test_update_cid_refs_content(pids, store): """Test that update_cid_ref updates the ref file as expected.""" for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store._write_cid_refs_file(cid_ref_abs_path, pid) - + tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) pid_other = "dou.test.1" - store._update_cid_refs(cid_ref_abs_path, pid_other) + store._update_cid_refs(tmp_cid_refs_file, pid_other) - with open(cid_ref_abs_path, "r", encoding="utf8") as f: + with open(tmp_cid_refs_file, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): value = line.strip() assert value == pid or value == pid_other @@ -309,18 +278,16 @@ def test_update_cid_refs_content(pids, store): def test_update_cid_refs_content_multiple(pids, store): """Test that update_cid_refs adds multiple references successfully.""" for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store._write_cid_refs_file(cid_ref_abs_path, pid) + tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) cid_reference_list = [pid] for i in range(0, 5): - store._update_cid_refs(cid_ref_abs_path, f"dou.test.{i}") + 
store._update_cid_refs(tmp_cid_refs_file, f"dou.test.{i}") cid_reference_list.append(f"dou.test.{i}") line_count = 0 - with open(cid_ref_abs_path, "r", encoding="utf8") as f: + with open(tmp_cid_refs_file, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): line_count += 1 value = line.strip() @@ -333,12 +300,10 @@ def test_update_cid_refs_content_pid_exists(pids, store): """Test that update_cid_ref does not throw exception if pid already exists and proceeds to complete the tagging process (verify_object)""" for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store._write_cid_refs_file(cid_ref_abs_path, pid) + tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) # Exception should not be thrown - store._update_cid_refs(cid_ref_abs_path, pid) + store._update_cid_refs(tmp_cid_refs_file, pid) def test_update_cid_refs_content_cid_refs_does_not_exist(pids, store): @@ -353,16 +318,14 @@ def test_update_cid_refs_content_cid_refs_does_not_exist(pids, store): def test_delete_cid_refs_pid(pids, store): """Test that delete_cid_refs_pid deletes the given pid from the ref file.""" for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store._write_cid_refs_file(cid_ref_abs_path, pid) + tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) pid_other = "dou.test.1" - store._update_cid_refs(cid_ref_abs_path, pid_other) - store._delete_cid_refs_pid(cid_ref_abs_path, pid) + store._update_cid_refs(tmp_cid_refs_file, pid_other) + store._delete_cid_refs_pid(tmp_cid_refs_file, pid) - with open(cid_ref_abs_path, "r", encoding="utf8") as f: + with open(tmp_cid_refs_file, "r", encoding="utf8") as f: for _, line in 
enumerate(f, start=1): value = line.strip() print(value) @@ -372,39 +335,34 @@ def test_delete_cid_refs_pid(pids, store): def test_delete_cid_refs_pid_pid_not_found(pids, store): """Test that delete_cid_refs_pid raises exception when pid not found.""" for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store._write_cid_refs_file(cid_ref_abs_path, pid) + tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) pid_other = "dou.test.1" - store._update_cid_refs(cid_ref_abs_path, pid_other) + store._update_cid_refs(tmp_cid_refs_file, pid_other) with pytest.raises(ValueError): - store._delete_cid_refs_pid(cid_ref_abs_path, "dou.not.found.1") + store._delete_cid_refs_pid(tmp_cid_refs_file, "dou.not.found.1") def test_delete_cid_refs_pid_file(pids, store): """Test that delete_cid_refs_file deletes a reference file.""" for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store._write_cid_refs_file(cid_ref_abs_path, pid) - store._delete_cid_refs_pid(cid_ref_abs_path, pid) - cid_refs_deleted = store._delete_cid_refs_file(cid_ref_abs_path) + tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) + # First remove the pid + store._delete_cid_refs_pid(tmp_cid_refs_file, pid) + cid_refs_deleted = store._delete_cid_refs_file(tmp_cid_refs_file) assert cid_refs_deleted - assert not os.path.exists(cid_ref_abs_path) + assert not os.path.exists(tmp_cid_refs_file) def test_delete_cid_refs_file_file_not_empty(pids, store): """Test that delete_cid_refs_file raises an exception when refs file is not empty.""" for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - 
store.create_path(os.path.dirname(cid_ref_abs_path)) - store._write_cid_refs_file(cid_ref_abs_path, pid) - is_cid_refs_file_deleted = store._delete_cid_refs_file(cid_ref_abs_path) + tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) + is_cid_refs_file_deleted = store._delete_cid_refs_file(tmp_cid_refs_file) assert not is_cid_refs_file_deleted @@ -496,12 +454,17 @@ def test_verify_hashstore_references_cid_refs_file_missing_pid(pids, store): the expected pid.""" for pid in pids.keys(): cid = pids[pid]["sha256"] + # Get a tmp cid refs file and write the wrong pid into it + tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, "bad pid") + # Move it to its permanent address cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) + shutil.move(tmp_cid_refs_file, cid_ref_abs_path) + # Now write the pid ref, both cid and pid refs must be present pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) store._write_pid_refs_file(pid_ref_abs_path, cid) - store._write_cid_refs_file(cid_ref_abs_path, "bad_pid") with pytest.raises(ValueError): store._verify_hashstore_references(pid, cid) @@ -513,13 +476,17 @@ def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pi references does not contain the expected pid.""" for pid in pids.keys(): cid = pids[pid]["sha256"] + # Get a tmp cid refs file and write the wrong pid into it + tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, "bad pid") + # Move it to its permanent address cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) + shutil.move(tmp_cid_refs_file, cid_ref_abs_path) + # Now write the pid ref pid_ref_abs_path = store.get_refs_abs_path("pid", pid) 
store.create_path(os.path.dirname(pid_ref_abs_path)) store._write_pid_refs_file(pid_ref_abs_path, cid) - store.create_path(os.path.dirname(pid_ref_abs_path)) - store._write_cid_refs_file(cid_ref_abs_path, "bad_pid") cid_reference_list = [pid] for i in range(0, 5): From 5fcfdb863c386efa6974de1d02e86113278b2377 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 5 Jan 2024 09:28:29 -0800 Subject: [PATCH 093/420] Refactor '_write_pid_refs_file' to return a tmp file with the expected content and update pytests --- src/hashstore/filehashstore.py | 35 ++++++--------- tests/test_filehashstore_references.py | 61 +++++++++++++++----------- 2 files changed, 49 insertions(+), 47 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 14de7bab..2cbe8b0d 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -581,10 +581,7 @@ def tag_object(self, pid, cid): raise FileExistsError(exception_string) elif os.path.exists(cid_ref_abs_path): # Create the pid refs file - pid_tmp_file = self._mktmpfile(tmp_root_path) - pid_tmp_file_path = pid_tmp_file.name - self._write_pid_refs_file(pid_tmp_file_path, cid) - # Create path for pid ref file in '.../refs/pid' + pid_tmp_file_path = self._write_pid_refs_file(tmp_root_path, cid) self.create_path(os.path.dirname(pid_ref_abs_path)) shutil.move(pid_tmp_file_path, pid_ref_abs_path) # Update cid ref files if it already exists @@ -599,16 +596,11 @@ def tag_object(self, pid, cid): return True else: # All ref files begin as tmp files and get moved sequentially at once - # Then write pid_refs_file content into tmp file - pid_tmp_file = self._mktmpfile(tmp_root_path) - pid_tmp_file_path = pid_tmp_file.name - self._write_pid_refs_file(pid_tmp_file_path, cid) - # Then get a cid_refs_file + # Get tmp files with the expected cid and pid refs content + pid_tmp_file_path = self._write_pid_refs_file(tmp_root_path, cid) cid_tmp_file_path = self._write_cid_refs_file(tmp_root_path, pid) - - # 
Create path for pid ref file in '.../refs/pid' + # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' self.create_path(os.path.dirname(pid_ref_abs_path)) - # Create path for cid ref file in '.../refs/cid' self.create_path(os.path.dirname(cid_ref_abs_path)) # Move both files shutil.move(pid_tmp_file_path, pid_ref_abs_path) @@ -1230,10 +1222,9 @@ def _write_cid_refs_file(self, path, pid): pid, path, ) - cid_tmp_file = self._mktmpfile(path) - cid_tmp_file_path = cid_tmp_file.name - try: + cid_tmp_file = self._mktmpfile(path) + cid_tmp_file_path = cid_tmp_file.name with open(cid_tmp_file_path, "w", encoding="utf8") as tmp_cid_ref_file: fcntl.flock(tmp_cid_ref_file, fcntl.LOCK_EX) tmp_cid_ref_file.write(pid + "\n") @@ -1399,27 +1390,26 @@ def _write_pid_refs_file(self, path, cid): cid, path, ) - try: - with open(path, "w", encoding="utf8") as pid_ref_file: + pid_tmp_file = self._mktmpfile(path) + pid_tmp_file_path = pid_tmp_file.name + with open(pid_tmp_file_path, "w", encoding="utf8") as pid_ref_file: fcntl.flock(pid_ref_file, fcntl.LOCK_EX) pid_ref_file.write(cid) # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) - return + pid_ref_file.close() + return pid_tmp_file_path except Exception as err: exception_string = ( f"FileHashStore - _write_pid_refs_file: failed to write cid ({cid})" - + f" into path: {path}. Unexpected {err=}, {type(err)=}" + + f" in : {path}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) raise err - finally: - pid_ref_file.close() - def _delete_pid_refs_file(self, pid_ref_abs_path): """Delete a PID reference file. 
@@ -1745,6 +1735,7 @@ def _verify_hashstore_references(self, pid, cid): value = line.strip() if value == pid: pid_found = True + cid_ref_file.close() finally: cid_ref_file.close() diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index c5be4021..aec5ca81 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -337,7 +337,6 @@ def test_delete_cid_refs_pid_pid_not_found(pids, store): for pid in pids.keys(): tmp_root_path = store.get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) - pid_other = "dou.test.1" store._update_cid_refs(tmp_cid_refs_file, pid_other) with pytest.raises(ValueError): @@ -379,21 +378,18 @@ def test_write_pid_refs_file(pids, store): """Test that write_pid_refs_file writes a reference file.""" for pid in pids.keys(): cid = pids[pid]["sha256"] - pid_ref_abs_path = store.get_refs_abs_path("pid", pid) - store.create_path(os.path.dirname(pid_ref_abs_path)) - store._write_pid_refs_file(pid_ref_abs_path, cid) - assert os.path.exists(pid_ref_abs_path) + tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) + assert os.path.exists(tmp_pid_refs_file) def test_write_pid_refs_file_content(pids, store): """Test that write_pid_refs_file writes the expected content.""" for pid in pids.keys(): cid = pids[pid]["sha256"] - pid_ref_abs_path = store.get_refs_abs_path("pid", pid) - store.create_path(os.path.dirname(pid_ref_abs_path)) - store._write_pid_refs_file(pid_ref_abs_path, cid) - - with open(pid_ref_abs_path, "r", encoding="utf8") as f: + tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) + with open(tmp_pid_refs_file, "r", encoding="utf8") as f: pid_refs_cid = f.read() assert cid == pid_refs_cid @@ -403,12 +399,11 @@ def test_delete_pid_refs_file(pids, store): """Test that 
delete_pid_refs_file deletes a reference file.""" for pid in pids.keys(): cid = pids[pid]["sha256"] - pid_ref_abs_path = store.get_refs_abs_path("pid", pid) - store.create_path(os.path.dirname(pid_ref_abs_path)) - store._write_pid_refs_file(pid_ref_abs_path, cid) - store._delete_pid_refs_file(pid_ref_abs_path) + tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) + store._delete_pid_refs_file(tmp_pid_refs_file) - assert not os.path.exists(pid_ref_abs_path) + assert not os.path.exists(tmp_pid_refs_file) def test_delete_pid_refs_file_file_not_found(pids, store): @@ -431,10 +426,20 @@ def test_verify_hashstore_references_pid_refs_incorrect_cid(pids, store): """Test _verify_hashstore_references throws exception when pid refs file cid is incorrect.""" for pid in pids.keys(): cid = pids[pid]["sha256"] + # Write the cid refs file and move it where it needs to be + tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + shutil.move(tmp_cid_refs_file, cid_ref_abs_path) + # Write the pid refs file and move it where it needs to be with a bad cid pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) - store._write_pid_refs_file(pid_ref_abs_path, "bad_cid") - with pytest.raises(FileNotFoundError): + tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, "bad_cid") + shutil.move(tmp_pid_refs_file, pid_ref_abs_path) + + with pytest.raises(ValueError): store._verify_hashstore_references(pid, cid) @@ -444,7 +449,10 @@ def test_verify_hashstore_references_cid_refs_file_missing(pids, store): cid = pids[pid]["sha256"] pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) - 
store._write_pid_refs_file(pid_ref_abs_path, cid) + tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, "bad_cid") + shutil.move(tmp_pid_refs_file, pid_ref_abs_path) + with pytest.raises(FileNotFoundError): store._verify_hashstore_references(pid, cid) @@ -457,14 +465,16 @@ def test_verify_hashstore_references_cid_refs_file_missing_pid(pids, store): # Get a tmp cid refs file and write the wrong pid into it tmp_root_path = store.get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, "bad pid") - # Move it to its permanent address cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) - # Now write the pid ref, both cid and pid refs must be present + # Now write the pid refs file, both cid and pid refs must be present pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) - store._write_pid_refs_file(pid_ref_abs_path, cid) + tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) + shutil.move(tmp_pid_refs_file, pid_ref_abs_path) + with pytest.raises(ValueError): store._verify_hashstore_references(pid, cid) @@ -476,17 +486,18 @@ def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pi references does not contain the expected pid.""" for pid in pids.keys(): cid = pids[pid]["sha256"] - # Get a tmp cid refs file and write the wrong pid into it + # Write the wrong pid into a cid refs file and move it where it needs to be tmp_root_path = store.get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, "bad pid") - # Move it to its permanent address cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, 
cid_ref_abs_path) - # Now write the pid ref + # Now write the pid refs with expected values pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) - store._write_pid_refs_file(pid_ref_abs_path, cid) + tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) + shutil.move(tmp_pid_refs_file, pid_ref_abs_path) cid_reference_list = [pid] for i in range(0, 5): From e988b257e7b98d0842dc13d49d6402ef9067a9f8 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 5 Jan 2024 09:36:40 -0800 Subject: [PATCH 094/420] Revert try-finally blocks and clean up code --- src/hashstore/filehashstore.py | 78 ++++++++++---------------- tests/test_filehashstore_references.py | 2 +- 2 files changed, 32 insertions(+), 48 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 2cbe8b0d..cfb08d3d 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -144,13 +144,11 @@ def load_properties(self): logging.critical(exception_string) raise FileNotFoundError(exception_string) - try: - # Open file - with open( - self.hashstore_configuration_yaml, "r", encoding="utf-8" - ) as hs_yaml_file: - yaml_data = yaml.safe_load(hs_yaml_file) - finally: + # Open file + with open( + self.hashstore_configuration_yaml, "r", encoding="utf-8" + ) as hs_yaml_file: + yaml_data = yaml.safe_load(hs_yaml_file) hs_yaml_file.close() # Get hashstore properties @@ -225,18 +223,16 @@ def write_properties(self, properties): store_metadata_namespace, ) # Write 'hashstore.yaml' - try: - with open( - self.hashstore_configuration_yaml, "w", encoding="utf-8" - ) as hs_yaml_file: - hs_yaml_file.write(hashstore_configuration_yaml) - logging.debug( - "FileHashStore - write_properties: Configuration file written to: %s", - self.hashstore_configuration_yaml, - ) - finally: + with open( + self.hashstore_configuration_yaml, "w", encoding="utf-8" + ) as 
hs_yaml_file: + hs_yaml_file.write(hashstore_configuration_yaml) hs_yaml_file.close() + logging.debug( + "FileHashStore - write_properties: Configuration file written to: %s", + self.hashstore_configuration_yaml, + ) return @staticmethod @@ -397,12 +393,10 @@ def lookup_algo(algo_to_translate): logging.critical(exception_string) raise FileNotFoundError(exception_string) - try: - with open( - self.hashstore_configuration_yaml, "r", encoding="utf-8" - ) as hs_yaml_file: - yaml_data = yaml.safe_load(hs_yaml_file) - finally: + with open( + self.hashstore_configuration_yaml, "r", encoding="utf-8" + ) as hs_yaml_file: + yaml_data = yaml.safe_load(hs_yaml_file) hs_yaml_file.close() # Set default store algorithm @@ -570,8 +564,6 @@ def tag_object(self, pid, cid): # Proceed to tagging process if os.path.exists(pid_ref_abs_path): - print("Path exists:\n") - print(pid_ref_abs_path) # A pid reference file can only contain one cid exception_string = ( "FileHashStore - write_pid_refs_file: pid ref file already exists for" @@ -579,14 +571,14 @@ def tag_object(self, pid, cid): ) logging.error(exception_string) raise FileExistsError(exception_string) + elif os.path.exists(cid_ref_abs_path): # Create the pid refs file pid_tmp_file_path = self._write_pid_refs_file(tmp_root_path, cid) self.create_path(os.path.dirname(pid_ref_abs_path)) shutil.move(pid_tmp_file_path, pid_ref_abs_path) - # Update cid ref files if it already exists + # Update cid ref files as it already exists self._update_cid_refs(cid_ref_abs_path, pid) - # Verify refs file content self._verify_hashstore_references(pid, cid) logging.info( "FileHashStore - tag_object: Successfully updated cid: %s with pid: %s", @@ -594,6 +586,7 @@ def tag_object(self, pid, cid): pid, ) return True + else: # All ref files begin as tmp files and get moved sequentially at once # Get tmp files with the expected cid and pid refs content @@ -639,10 +632,8 @@ def find_object(self, pid): raise FileNotFoundError(err_msg) else: # Read the file to 
get the cid from the pid reference - try: - with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: - pid_refs_cid = pid_ref_file.read() - finally: + with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: + pid_refs_cid = pid_ref_file.read() pid_ref_file.close() return pid_refs_cid @@ -770,8 +761,7 @@ def delete_object(self, pid): # Remove pid from cid reference file cid_ref_abs_path = self.get_refs_abs_path("cid", cid) self._delete_cid_refs_pid(cid_ref_abs_path, pid) - # Delete cid reference file - # If the file is not empty, it will not be deleted. + # Delete cid reference file, if the file is not empty, it will not be deleted. cid_refs_deleted = self._delete_cid_refs_file(cid_ref_abs_path) # Delete pid reference file pid_ref_abs_path = self.get_refs_abs_path("pid", pid) @@ -1277,6 +1267,7 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): fcntl.flock(cid_ref_file, fcntl.LOCK_EX) cid_ref_file.write(pid + "\n") + cid_ref_file.close() # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) @@ -1290,9 +1281,6 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): logging.error(exception_string) raise err - finally: - cid_ref_file.close() - def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): """Delete a PID from a CID reference file. 
@@ -1311,6 +1299,7 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): cid_ref_file_content = cid_ref_file.read() if pid not in cid_ref_file_content: + cid_ref_file.close() err_msg = ( f"FileHashStore - _delete_cid_refs_pid: pid ({pid}) does not exist in" + f" cid reference file: {cid_ref_abs_path} " @@ -1325,6 +1314,7 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) + cid_ref_file.close() return except Exception as err: @@ -1335,9 +1325,6 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): logging.error(exception_string) raise err - finally: - cid_ref_file.close() - def _delete_cid_refs_file(self, cid_ref_abs_path): """Delete a CID reference file. There must be no references remaining. @@ -1729,14 +1716,11 @@ def _verify_hashstore_references(self, pid, cid): raise ValueError(exception_string) # Then the pid pid_found = False - try: - with open(cid_ref_abs_path, "r", encoding="utf8") as cid_ref_file: - for _, line in enumerate(cid_ref_file, start=1): - value = line.strip() - if value == pid: - pid_found = True - cid_ref_file.close() - finally: + with open(cid_ref_abs_path, "r", encoding="utf8") as cid_ref_file: + for _, line in enumerate(cid_ref_file, start=1): + value = line.strip() + if value == pid: + pid_found = True cid_ref_file.close() if not pid_found: diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index aec5ca81..e0685982 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -243,7 +243,7 @@ def test_verify_object_exception_incorrect_checksum_algo(pids, store): store.verify_object(object_metadata, checksum, "md2", expected_file_size) -def test_write_cid_refs_file(pids, store): +def test_write_cid_refs_file(store): """Test that write_cid_reference writes a reference file.""" tmp_root_path = 
store.get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, "test_pid") From 3fff293a5f7ad31d9fe53935a5bcf36f3068baff Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 5 Jan 2024 10:23:22 -0800 Subject: [PATCH 095/420] Clean up logging and exception statements --- src/hashstore/filehashstore.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index cfb08d3d..bd4d6717 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -571,7 +571,7 @@ def tag_object(self, pid, cid): ) logging.error(exception_string) raise FileExistsError(exception_string) - + elif os.path.exists(cid_ref_abs_path): # Create the pid refs file pid_tmp_file_path = self._write_pid_refs_file(tmp_root_path, cid) @@ -586,7 +586,7 @@ def tag_object(self, pid, cid): pid, ) return True - + else: # All ref files begin as tmp files and get moved sequentially at once # Get tmp files with the expected cid and pid refs content @@ -1206,6 +1206,7 @@ def _write_cid_refs_file(self, path, pid): :param str pid: Authority-based or persistent identifier of the object. :return: cid_tmp_file_path - Path to the cid tmp file + :rtype: string """ logging.debug( "FileHashStore - write_cid_refs_file: Writing pid (%s) into file: %s", @@ -1226,8 +1227,8 @@ def _write_cid_refs_file(self, path, pid): except Exception as err: exception_string = ( - f"FileHashStore - write_cid_refs_file: failed to write pid ({pid})" - + f" into path: {path}. Unexpected {err=}, {type(err)=}" + "FileHashStore - write_cid_refs_file: failed to write cid refs file for pid:" + + f" {pid} into path: {path}. 
Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) raise err @@ -1319,7 +1320,7 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): except Exception as err: exception_string = ( - "FileHashStore - _delete_cid_refs_pid: failed to update reference for cid:" + "FileHashStore - _delete_cid_refs_pid: failed to remove pid from cid refs file:" + f" {cid_ref_abs_path} for pid: {pid}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) @@ -1359,18 +1360,21 @@ def _delete_cid_refs_file(self, cid_ref_abs_path): except Exception as err: exception_string = ( - "FileHashStore - _delete_cid_refs_file: failed to delete reference file:" + "FileHashStore - _delete_cid_refs_file: failed to delete cid refs file:" + f" {cid_ref_abs_path}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) raise err def _write_pid_refs_file(self, path, cid): - """Write the PID reference file in the supplied path for the given CID (content + """Generate a tmp pid refs file into the given path for the given CID (content identifier). A reference file for a PID contains the CID that it references. :param str path: Path of the file to be written into. :param str cid: Content identifier. + + :return: pid_tmp_file_path + :rtype: string """ logging.debug( "FileHashStore - _write_pid_refs_file: Writing cid (%s) into file: %s", @@ -1392,7 +1396,7 @@ def _write_pid_refs_file(self, path, cid): except Exception as err: exception_string = ( f"FileHashStore - _write_pid_refs_file: failed to write cid ({cid})" - + f" in : {path}. Unexpected {err=}, {type(err)=}" + + f" into pid refs file: {path}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) raise err @@ -1420,7 +1424,7 @@ def _delete_pid_refs_file(self, pid_ref_abs_path): except Exception as err: exception_string = ( - "FileHashStore - _delete_pid_refs_file: failed to delete reference file:" + "FileHashStore - _delete_pid_refs_file: failed to delete pid refs file:" + f" {pid_ref_abs_path}. 
Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) From e26f49de68bbf89230c007e4228d5769e1007763 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 5 Jan 2024 11:45:03 -0800 Subject: [PATCH 096/420] Remove redundant file lock --- src/hashstore/filehashstore.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index bd4d6717..4d498cf9 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1266,7 +1266,6 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): cid_ref_file.close() return - fcntl.flock(cid_ref_file, fcntl.LOCK_EX) cid_ref_file.write(pid + "\n") cid_ref_file.close() # The context manager will take care of releasing the lock From 26b59c68ebee1083d8d5b31860bca9a4818afa40 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 5 Jan 2024 12:05:02 -0800 Subject: [PATCH 097/420] Temporarily disable file locks --- src/hashstore/filehashstore.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 4d498cf9..7b9701f6 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1217,7 +1217,7 @@ def _write_cid_refs_file(self, path, pid): cid_tmp_file = self._mktmpfile(path) cid_tmp_file_path = cid_tmp_file.name with open(cid_tmp_file_path, "w", encoding="utf8") as tmp_cid_ref_file: - fcntl.flock(tmp_cid_ref_file, fcntl.LOCK_EX) + # fcntl.flock(tmp_cid_ref_file, fcntl.LOCK_EX) tmp_cid_ref_file.write(pid + "\n") # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below @@ -1254,7 +1254,7 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): try: with open(cid_ref_abs_path, "a+", encoding="utf8") as cid_ref_file: - fcntl.flock(cid_ref_file, fcntl.LOCK_EX) + # fcntl.flock(cid_ref_file, fcntl.LOCK_EX) for _, line in enumerate(cid_ref_file, start=1): value = line.strip() if pid == value: @@ 
-1294,7 +1294,7 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): ) try: with open(cid_ref_abs_path, "r+", encoding="utf8") as cid_ref_file: - fcntl.flock(cid_ref_file, fcntl.LOCK_EX) + # fcntl.flock(cid_ref_file, fcntl.LOCK_EX) # Read the ref file to see if the pid is already referencing the cid cid_ref_file_content = cid_ref_file.read() @@ -1384,7 +1384,7 @@ def _write_pid_refs_file(self, path, cid): pid_tmp_file = self._mktmpfile(path) pid_tmp_file_path = pid_tmp_file.name with open(pid_tmp_file_path, "w", encoding="utf8") as pid_ref_file: - fcntl.flock(pid_ref_file, fcntl.LOCK_EX) + # fcntl.flock(pid_ref_file, fcntl.LOCK_EX) pid_ref_file.write(cid) # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below From 192ba0e782d0747f4111c2dd296dfceb094f46d4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 5 Jan 2024 12:13:36 -0800 Subject: [PATCH 098/420] Refactor '_mktmpfile' to create the given path if it doesn't exist yet --- src/hashstore/filehashstore.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 7b9701f6..c1c7e8cc 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -559,8 +559,6 @@ def tag_object(self, pid, cid): cid_ref_abs_path = self.get_refs_abs_path("cid", cid) # Ensure refs tmp folder exists tmp_root_path = self.get_store_path("refs") / "tmp" - if os.path.exists(tmp_root_path) is False: - self.create_path(tmp_root_path) # Proceed to tagging process if os.path.exists(pid_ref_abs_path): @@ -1097,11 +1095,7 @@ def _write_to_tmp_file_and_get_hex_digests( algorithm_list_to_calculate = self._refine_algorithm_list( additional_algorithm, checksum_algorithm ) - - tmp_root_path = self.get_store_path("objects") / "tmp" - # Physically create directory if it doesn't exist - if os.path.exists(tmp_root_path) is False: - self.create_path(tmp_root_path) + 
tmp_root_path = self.get_store_path("metadata") / "tmp" tmp = self._mktmpfile(tmp_root_path) logging.debug( @@ -1179,6 +1173,10 @@ def _mktmpfile(self, path): :return: file object - object with a file-like interface. """ + # Physically create directory if it doesn't exist + if os.path.exists(path) is False: + self.create_path(path) + tmp = NamedTemporaryFile(dir=path, delete=False) # Delete tmp file if python interpreter crashes or thread is interrupted @@ -1496,10 +1494,6 @@ def _mktmpmetadata(self, stream): """ # Create temporary file in .../{store_path}/tmp tmp_root_path = self.get_store_path("metadata") / "tmp" - # Physically create directory if it doesn't exist - if os.path.exists(tmp_root_path) is False: - self.create_path(tmp_root_path) - tmp = self._mktmpfile(tmp_root_path) # tmp is a file-like object that is already opened for writing by default From f0994465e01dda28da649007fe52c4686318eb71 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 5 Jan 2024 13:09:35 -0800 Subject: [PATCH 099/420] Refactor '_verify_hashstore_references' to improve logging statements and update pytests --- src/hashstore/filehashstore.py | 19 ++++++++++++------- tests/test_filehashstore_references.py | 10 +++++----- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index c1c7e8cc..8aed6229 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -577,7 +577,7 @@ def tag_object(self, pid, cid): shutil.move(pid_tmp_file_path, pid_ref_abs_path) # Update cid ref files as it already exists self._update_cid_refs(cid_ref_abs_path, pid) - self._verify_hashstore_references(pid, cid) + self._verify_hashstore_references(pid, cid, "update") logging.info( "FileHashStore - tag_object: Successfully updated cid: %s with pid: %s", cid, @@ -598,7 +598,7 @@ def tag_object(self, pid, cid): shutil.move(cid_tmp_file_path, cid_ref_abs_path) # Ensure that the reference files have been written as 
expected # If there is an issue, client or user will have to manually review - self._verify_hashstore_references(pid, cid) + self._verify_hashstore_references(pid, cid, "create") logging.info( "FileHashStore - tag_object: Successfully tagged cid: %s with pid %s", @@ -1677,27 +1677,30 @@ def _validate_arg_format_id(self, format_id, method): checked_format_id = format_id return checked_format_id - def _verify_hashstore_references(self, pid, cid): + def _verify_hashstore_references(self, pid, cid, verify_type): """Verifies that the supplied pid and pid reference file and content have been written successfully. :param str pid: Authority-based or persistent identifier. :param str cid: Content identifier. + :param str verify_type: "update" or "create" """ # Check that reference files were created pid_ref_abs_path = self.get_refs_abs_path("pid", pid) cid_ref_abs_path = self.get_refs_abs_path("cid", cid) if not os.path.exists(pid_ref_abs_path): exception_string = ( - "FileHashStore - _verify_hashstore_references: Pid refs file missing: %s", - pid_ref_abs_path, + "FileHashStore - _verify_hashstore_references: Pid refs file missing: " + + pid_ref_abs_path + + f" . Verify type {verify_type}" ) logging.error(exception_string) raise FileNotFoundError(exception_string) if not os.path.exists(cid_ref_abs_path): exception_string = ( - "FileHashStore - _verify_hashstore_references: Cid refs file missing: %s", - cid_ref_abs_path, + "FileHashStore - _verify_hashstore_references: Cid refs file missing: " + + cid_ref_abs_path + + f" . Verify type {verify_type}" ) logging.error(exception_string) raise FileNotFoundError(exception_string) @@ -1708,6 +1711,7 @@ def _verify_hashstore_references(self, pid, cid): exception_string = ( "FileHashStore - _verify_hashstore_references: Pid refs file exists" + f" ({pid_ref_abs_path}) but cid ({cid}) does not match." 
+ + f"Verify type {verify_type}" ) logging.error(exception_string) raise ValueError(exception_string) @@ -1724,6 +1728,7 @@ def _verify_hashstore_references(self, pid, cid): exception_string = ( "FileHashStore - _verify_hashstore_references: Cid refs file exists" + f" ({cid_ref_abs_path}) but pid ({pid}) not found." + + f"Verify type {verify_type}" ) logging.error(exception_string) raise ValueError(exception_string) diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index e0685982..7d426d79 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -419,7 +419,7 @@ def test_verify_hashstore_references_pid_refs_file_missing(pids, store): for pid in pids.keys(): cid = pids[pid]["sha256"] with pytest.raises(FileNotFoundError): - store._verify_hashstore_references(pid, cid) + store._verify_hashstore_references(pid, cid, "create") def test_verify_hashstore_references_pid_refs_incorrect_cid(pids, store): @@ -440,7 +440,7 @@ def test_verify_hashstore_references_pid_refs_incorrect_cid(pids, store): shutil.move(tmp_pid_refs_file, pid_ref_abs_path) with pytest.raises(ValueError): - store._verify_hashstore_references(pid, cid) + store._verify_hashstore_references(pid, cid, "create") def test_verify_hashstore_references_cid_refs_file_missing(pids, store): @@ -454,7 +454,7 @@ def test_verify_hashstore_references_cid_refs_file_missing(pids, store): shutil.move(tmp_pid_refs_file, pid_ref_abs_path) with pytest.raises(FileNotFoundError): - store._verify_hashstore_references(pid, cid) + store._verify_hashstore_references(pid, cid, "create") def test_verify_hashstore_references_cid_refs_file_missing_pid(pids, store): @@ -476,7 +476,7 @@ def test_verify_hashstore_references_cid_refs_file_missing_pid(pids, store): shutil.move(tmp_pid_refs_file, pid_ref_abs_path) with pytest.raises(ValueError): - store._verify_hashstore_references(pid, cid) + store._verify_hashstore_references(pid, cid, "create") def 
test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pid( @@ -505,4 +505,4 @@ def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pi cid_reference_list.append(f"dou.test.{i}") with pytest.raises(ValueError): - store._verify_hashstore_references(pid, cid) + store._verify_hashstore_references(pid, cid, "create") From 9023aeee9e74f365511ead6f5b7fe45fe4251836 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 8 Jan 2024 09:36:50 -0800 Subject: [PATCH 100/420] Refactor pid and cid refs write methods to call .flush() to immediately write to file --- src/hashstore/filehashstore.py | 43 ++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 8aed6229..72517088 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1212,16 +1212,17 @@ def _write_cid_refs_file(self, path, pid): path, ) try: - cid_tmp_file = self._mktmpfile(path) - cid_tmp_file_path = cid_tmp_file.name - with open(cid_tmp_file_path, "w", encoding="utf8") as tmp_cid_ref_file: + with self._mktmpfile(path) as cid_tmp_file: # fcntl.flock(tmp_cid_ref_file, fcntl.LOCK_EX) - tmp_cid_ref_file.write(pid + "\n") + cid_tmp_file_path = cid_tmp_file.name + with open(cid_tmp_file_path, "w", encoding="utf8") as tmp_cid_ref_file: + tmp_cid_ref_file.write(pid + "\n") # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) - tmp_cid_ref_file.close() - return cid_tmp_file_path + # Ensure that file is immediately written to and not held in memory + tmp_cid_ref_file.flush() + return cid_tmp_file_path except Exception as err: exception_string = ( @@ -1253,7 +1254,7 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): try: with open(cid_ref_abs_path, "a+", encoding="utf8") as cid_ref_file: # fcntl.flock(cid_ref_file, fcntl.LOCK_EX) - for _, line 
in enumerate(cid_ref_file, start=1): + for line in cid_ref_file: value = line.strip() if pid == value: warning_msg = ( @@ -1265,10 +1266,11 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): return cid_ref_file.write(pid + "\n") - cid_ref_file.close() # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) + # Ensure that file is immediately written to and not held in memory + cid_ref_file.flush() return except Exception as err: @@ -1297,7 +1299,6 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): cid_ref_file_content = cid_ref_file.read() if pid not in cid_ref_file_content: - cid_ref_file.close() err_msg = ( f"FileHashStore - _delete_cid_refs_pid: pid ({pid}) does not exist in" + f" cid reference file: {cid_ref_abs_path} " @@ -1312,7 +1313,8 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) - cid_ref_file.close() + # Ensure that file is immediately written to and not held in memory + cid_ref_file.flush() return except Exception as err: @@ -1379,16 +1381,17 @@ def _write_pid_refs_file(self, path, cid): path, ) try: - pid_tmp_file = self._mktmpfile(path) - pid_tmp_file_path = pid_tmp_file.name - with open(pid_tmp_file_path, "w", encoding="utf8") as pid_ref_file: - # fcntl.flock(pid_ref_file, fcntl.LOCK_EX) - pid_ref_file.write(cid) - # The context manager will take care of releasing the lock - # But the code to explicitly release the lock if desired is below - # fcntl.flock(f, fcntl.LOCK_UN) - pid_ref_file.close() - return pid_tmp_file_path + with self._mktmpfile(path) as pid_tmp_file: + pid_tmp_file_path = pid_tmp_file.name + with open(pid_tmp_file_path, "w", encoding="utf8") as pid_ref_file: + # fcntl.flock(pid_ref_file, fcntl.LOCK_EX) + pid_ref_file.write(cid) + # The context manager will 
take care of releasing the lock + # But the code to explicitly release the lock if desired is below + # fcntl.flock(f, fcntl.LOCK_UN) + # Ensure that file is immediately written to and not held in memory + pid_ref_file.flush() + return pid_tmp_file_path except Exception as err: exception_string = ( From 5af7b082b6e2dfdb7cef791317e3df67f55065fc Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 8 Jan 2024 09:48:07 -0800 Subject: [PATCH 101/420] Fix incorrect indentation in '_write_cid_refs_file' method --- src/hashstore/filehashstore.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 72517088..69cb58cc 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1217,11 +1217,11 @@ def _write_cid_refs_file(self, path, pid): cid_tmp_file_path = cid_tmp_file.name with open(cid_tmp_file_path, "w", encoding="utf8") as tmp_cid_ref_file: tmp_cid_ref_file.write(pid + "\n") - # The context manager will take care of releasing the lock - # But the code to explicitly release the lock if desired is below - # fcntl.flock(f, fcntl.LOCK_UN) - # Ensure that file is immediately written to and not held in memory - tmp_cid_ref_file.flush() + # The context manager will take care of releasing the lock + # But the code to explicitly release the lock if desired is below + # fcntl.flock(f, fcntl.LOCK_UN) + # Ensure that file is immediately written to and not held in memory + tmp_cid_ref_file.flush() return cid_tmp_file_path except Exception as err: From f5e7db5c3183f98cbf62d180bdcc762439a8e66e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 10 Jan 2024 09:25:00 -0800 Subject: [PATCH 102/420] Remove '.flush' calls and move return calls in refs rw methods within the with block --- src/hashstore/filehashstore.py | 35 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/src/hashstore/filehashstore.py 
b/src/hashstore/filehashstore.py index 69cb58cc..b9a11fc0 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1213,16 +1213,12 @@ def _write_cid_refs_file(self, path, pid): ) try: with self._mktmpfile(path) as cid_tmp_file: - # fcntl.flock(tmp_cid_ref_file, fcntl.LOCK_EX) cid_tmp_file_path = cid_tmp_file.name with open(cid_tmp_file_path, "w", encoding="utf8") as tmp_cid_ref_file: tmp_cid_ref_file.write(pid + "\n") - # The context manager will take care of releasing the lock - # But the code to explicitly release the lock if desired is below - # fcntl.flock(f, fcntl.LOCK_UN) # Ensure that file is immediately written to and not held in memory - tmp_cid_ref_file.flush() - return cid_tmp_file_path + # tmp_cid_ref_file.flush() + return cid_tmp_file_path except Exception as err: exception_string = ( @@ -1253,7 +1249,7 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): try: with open(cid_ref_abs_path, "a+", encoding="utf8") as cid_ref_file: - # fcntl.flock(cid_ref_file, fcntl.LOCK_EX) + # Confirm that pid is not currently already tagged for line in cid_ref_file: value = line.strip() if pid == value: @@ -1262,16 +1258,15 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): + f" cid reference file: {cid_ref_abs_path} " ) logging.warning(warning_msg) - cid_ref_file.close() return - + # Lock file for the shortest amount of time possible + file_descriptor = cid_ref_file.fileno() + fcntl.flock(file_descriptor, fcntl.LOCK_EX) cid_ref_file.write(pid + "\n") # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) - # Ensure that file is immediately written to and not held in memory - cid_ref_file.flush() - return + return except Exception as err: exception_string = ( @@ -1305,6 +1300,9 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): ) raise ValueError(err_msg) else: + # Lock file for the shortest amount of time possible + 
file_descriptor = cid_ref_file.fileno() + fcntl.flock(file_descriptor, fcntl.LOCK_EX) # Move the file pointer to the beginning for writing cid_ref_file.seek(0) cid_ref_file.write(cid_ref_file_content.replace(pid + "\n", "")) @@ -1313,10 +1311,7 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) - # Ensure that file is immediately written to and not held in memory - cid_ref_file.flush() - return - + return except Exception as err: exception_string = ( "FileHashStore - _delete_cid_refs_pid: failed to remove pid from cid refs file:" @@ -1384,14 +1379,8 @@ def _write_pid_refs_file(self, path, cid): with self._mktmpfile(path) as pid_tmp_file: pid_tmp_file_path = pid_tmp_file.name with open(pid_tmp_file_path, "w", encoding="utf8") as pid_ref_file: - # fcntl.flock(pid_ref_file, fcntl.LOCK_EX) pid_ref_file.write(cid) - # The context manager will take care of releasing the lock - # But the code to explicitly release the lock if desired is below - # fcntl.flock(f, fcntl.LOCK_UN) - # Ensure that file is immediately written to and not held in memory - pid_ref_file.flush() - return pid_tmp_file_path + return pid_tmp_file_path except Exception as err: exception_string = ( From b174416bd55c645185ee5df8b36efb579b30ff1d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 10 Jan 2024 10:54:21 -0800 Subject: [PATCH 103/420] Fix bug RE: wrong entity type path for 'write_to_tmp....digests' method --- src/hashstore/filehashstore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index b9a11fc0..9b9ef537 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1095,7 +1095,7 @@ def _write_to_tmp_file_and_get_hex_digests( algorithm_list_to_calculate = self._refine_algorithm_list( additional_algorithm, checksum_algorithm ) 
- tmp_root_path = self.get_store_path("metadata") / "tmp" + tmp_root_path = self.get_store_path("objects") / "tmp" tmp = self._mktmpfile(tmp_root_path) logging.debug( From 60228ff52c2be54fad6139783042da6ea1d88b37 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 10 Jan 2024 11:05:39 -0800 Subject: [PATCH 104/420] Call .lower() on checksum variable in '_validate_arg_object' to ensure hashes are compared consistently --- src/hashstore/filehashstore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 9b9ef537..4c73582b 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1624,7 +1624,7 @@ def _validate_arg_object( raise KeyError(exception_string) else: hex_digest_stored = hex_digests[checksum_algorithm] - if hex_digest_stored != checksum: + if hex_digest_stored != checksum.lower(): exception_string = ( "FileHashStore - _validate_arg_object: Hex digest and checksum" + f" do not match - file not stored for pid: {pid}. 
Algorithm:" From f8be689e4b1f7777f85a6a21c42e40bb6e96afe5 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 10 Jan 2024 11:11:28 -0800 Subject: [PATCH 105/420] Remove redundant return statements in refs related methods --- src/hashstore/filehashstore.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 4c73582b..8ed98f3b 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1258,6 +1258,7 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): + f" cid reference file: {cid_ref_abs_path} " ) logging.warning(warning_msg) + # Exit try statement, we do not want to write the pid return # Lock file for the shortest amount of time possible file_descriptor = cid_ref_file.fileno() @@ -1266,8 +1267,6 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) - return - except Exception as err: exception_string = ( "FileHashStore - update_cid_refs: failed to update reference for cid:" @@ -1311,7 +1310,6 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) - return except Exception as err: exception_string = ( "FileHashStore - _delete_cid_refs_pid: failed to remove pid from cid refs file:" @@ -1409,7 +1407,6 @@ def _delete_pid_refs_file(self, pid_ref_abs_path): raise FileNotFoundError(err_msg) else: os.remove(pid_ref_abs_path) - return except Exception as err: exception_string = ( From 0aaf0faac0b16005231e30f5292979c78da85c68 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 10 Jan 2024 11:18:46 -0800 Subject: [PATCH 106/420] Fix bug in '_verify_hashstore_references' where for loop created a false negative when checking for a pid in a cid refs file --- 
src/hashstore/filehashstore.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 8ed98f3b..d882a25b 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1711,13 +1711,12 @@ def _verify_hashstore_references(self, pid, cid, verify_type): value = line.strip() if value == pid: pid_found = True - cid_ref_file.close() - + break if not pid_found: exception_string = ( "FileHashStore - _verify_hashstore_references: Cid refs file exists" + f" ({cid_ref_abs_path}) but pid ({pid}) not found." - + f"Verify type {verify_type}" + + f" Verify type {verify_type}" ) logging.error(exception_string) raise ValueError(exception_string) From 775e653e8c4d99c948208e979ea4163a5583a826 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 10 Jan 2024 16:59:54 -0800 Subject: [PATCH 107/420] Refactor 'delete_object' and '_delete_cid_refs_pid' to improve clarity and process --- src/hashstore/filehashstore.py | 59 +++++++++++++++++----------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index d882a25b..f5cb0632 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -625,7 +625,7 @@ def find_object(self, pid): if not os.path.exists(pid_ref_abs_path): err_msg = ( f"FileHashStore - find_object: pid ({pid}) reference file not found: " - + pid_ref_abs_path, + + pid_ref_abs_path ) raise FileNotFoundError(err_msg) else: @@ -756,25 +756,30 @@ def delete_object(self, pid): ) self.reference_locked_cids.append(cid) try: - # Remove pid from cid reference file cid_ref_abs_path = self.get_refs_abs_path("cid", cid) + pid_ref_abs_path = self.get_refs_abs_path("pid", pid) + # Remove pid from cid reference file self._delete_cid_refs_pid(cid_ref_abs_path, pid) + self._delete_pid_refs_file(pid_ref_abs_path) # Delete cid reference file, if the file is not empty, it will not be 
deleted. cid_refs_deleted = self._delete_cid_refs_file(cid_ref_abs_path) - # Delete pid reference file - pid_ref_abs_path = self.get_refs_abs_path("pid", pid) - self._delete_pid_refs_file(pid_ref_abs_path) - # Finally, delete the object if cid_refs_deleted: + # If the cid reference file has been deleted, delete the actual object entity = "objects" self.delete(entity, cid) - - info_string = ( - "FileHashStore - delete_object: Successfully deleted references and/or" - + f" objects associated with pid: {pid}" - ) - logging.info(info_string) + info_string = ( + "FileHashStore - delete_object: Successfully deleted references and" + + f" object associated with pid: {pid}" + ) + logging.info(info_string) + else: + info_string = ( + "FileHashStore - delete_object: Successfully deleted pid refs file but" + + f" not object with cid ({cid}), cid refs file still has references." + ) + logging.info(info_string) return True + finally: # Release cid with self.reference_lock: @@ -1287,26 +1292,22 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): cid_ref_abs_path, ) try: - with open(cid_ref_abs_path, "r+", encoding="utf8") as cid_ref_file: - # fcntl.flock(cid_ref_file, fcntl.LOCK_EX) - # Read the ref file to see if the pid is already referencing the cid + with open(cid_ref_abs_path, "r", encoding="utf8") as cid_ref_file: cid_ref_file_content = cid_ref_file.read() - if pid not in cid_ref_file_content: - err_msg = ( - f"FileHashStore - _delete_cid_refs_pid: pid ({pid}) does not exist in" - + f" cid reference file: {cid_ref_abs_path} " - ) - raise ValueError(err_msg) - else: - # Lock file for the shortest amount of time possible + if pid not in cid_ref_file_content: + err_msg = ( + f"FileHashStore - _delete_cid_refs_pid: pid ({pid}) does not exist in" + + f" cid reference file: {cid_ref_abs_path} " + ) + raise ValueError(err_msg) + else: + updated_content = cid_ref_file_content.replace(pid + "\n", "") + with open(cid_ref_abs_path, "w", encoding="utf8")as cid_ref_file: 
file_descriptor = cid_ref_file.fileno() + # Lock file for the shortest amount of time possible fcntl.flock(file_descriptor, fcntl.LOCK_EX) - # Move the file pointer to the beginning for writing - cid_ref_file.seek(0) - cid_ref_file.write(cid_ref_file_content.replace(pid + "\n", "")) - # Ensure file ends where the new content ends - cid_ref_file.truncate() + cid_ref_file.write(updated_content) # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) @@ -1341,7 +1342,7 @@ def _delete_cid_refs_file(self, cid_ref_abs_path): raise FileNotFoundError(err_msg) if os.path.getsize(cid_ref_abs_path) != 0: err_msg = ( - "FileHashStore - _delete_cid_refs_file: Failed to delete cid reference file." + "FileHashStore - _delete_cid_refs_file: Did not delete cid reference file." + f" File is not empty: {cid_ref_abs_path} " ) logging.warning(err_msg) From 7fd7a316f9c9a27423d74321977042fbb68cb53a Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 11 Jan 2024 12:47:35 -0800 Subject: [PATCH 108/420] Refactor '_delete_cid_refs_pid' and update pytests --- src/hashstore/filehashstore.py | 35 ++++++++++++-------------- tests/test_filehashstore_references.py | 11 -------- 2 files changed, 16 insertions(+), 30 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index f5cb0632..b2f6f0ee 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1292,25 +1292,22 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): cid_ref_abs_path, ) try: - with open(cid_ref_abs_path, "r", encoding="utf8") as cid_ref_file: - cid_ref_file_content = cid_ref_file.read() - - if pid not in cid_ref_file_content: - err_msg = ( - f"FileHashStore - _delete_cid_refs_pid: pid ({pid}) does not exist in" - + f" cid reference file: {cid_ref_abs_path} " - ) - raise ValueError(err_msg) - else: - updated_content = cid_ref_file_content.replace(pid + "\n", "") - 
with open(cid_ref_abs_path, "w", encoding="utf8")as cid_ref_file: - file_descriptor = cid_ref_file.fileno() - # Lock file for the shortest amount of time possible - fcntl.flock(file_descriptor, fcntl.LOCK_EX) - cid_ref_file.write(updated_content) - # The context manager will take care of releasing the lock - # But the code to explicitly release the lock if desired is below - # fcntl.flock(f, fcntl.LOCK_UN) + with open(cid_ref_abs_path, "r+", encoding="utf8") as cid_ref_file: + # Lock file immediately, this process needs to complete + # before any others read/modify the content of cid_ref_file + file_descriptor = cid_ref_file.fileno() + fcntl.flock(file_descriptor, fcntl.LOCK_EX) + new_pid_lines = [ + cid_pid_line + for cid_pid_line in cid_ref_file.readlines() + if cid_pid_line.strip() != pid + ] + cid_ref_file.seek(0) + cid_ref_file.truncate() + cid_ref_file.writelines(new_pid_lines) + # The context manager will take care of releasing the lock + # But the code to explicitly release the lock if desired is below + # fcntl.flock(f, fcntl.LOCK_UN) except Exception as err: exception_string = ( "FileHashStore - _delete_cid_refs_pid: failed to remove pid from cid refs file:" diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 7d426d79..d1cebd8a 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -332,17 +332,6 @@ def test_delete_cid_refs_pid(pids, store): assert value == pid_other -def test_delete_cid_refs_pid_pid_not_found(pids, store): - """Test that delete_cid_refs_pid raises exception when pid not found.""" - for pid in pids.keys(): - tmp_root_path = store.get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) - pid_other = "dou.test.1" - store._update_cid_refs(tmp_cid_refs_file, pid_other) - with pytest.raises(ValueError): - store._delete_cid_refs_pid(tmp_cid_refs_file, "dou.not.found.1") - - def 
test_delete_cid_refs_pid_file(pids, store): """Test that delete_cid_refs_file deletes a reference file.""" for pid in pids.keys(): From 19799cba8b9c7e27100d39d6afcfdb4cd30630cf Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 11 Jan 2024 12:48:48 -0800 Subject: [PATCH 109/420] Add debug logging statement to '_delete_cid_refs_file' --- src/hashstore/filehashstore.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index b2f6f0ee..42e3d285 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1346,6 +1346,11 @@ def _delete_cid_refs_file(self, cid_ref_abs_path): return False else: os.remove(cid_ref_abs_path) + debug_msg = ( + "FileHashStore - _delete_cid_refs_file: Deleted cid reference file." + + cid_ref_abs_path + ) + logging.debug(debug_msg) return True except Exception as err: From 3915b12e472b8d256cff5ef432ea56a419c5e53e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 15 Jan 2024 15:49:36 -0800 Subject: [PATCH 110/420] Clean up code --- src/hashstore/filehashstore.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 42e3d285..99c13668 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -557,7 +557,6 @@ def tag_object(self, pid, cid): try: pid_ref_abs_path = self.get_refs_abs_path("pid", pid) cid_ref_abs_path = self.get_refs_abs_path("cid", cid) - # Ensure refs tmp folder exists tmp_root_path = self.get_store_path("refs") / "tmp" # Proceed to tagging process @@ -569,7 +568,6 @@ def tag_object(self, pid, cid): ) logging.error(exception_string) raise FileExistsError(exception_string) - elif os.path.exists(cid_ref_abs_path): # Create the pid refs file pid_tmp_file_path = self._write_pid_refs_file(tmp_root_path, cid) @@ -584,7 +582,6 @@ def tag_object(self, pid, cid): pid, ) return True - else: # All ref files begin as tmp files and get 
moved sequentially at once # Get tmp files with the expected cid and pid refs content @@ -1332,12 +1329,12 @@ def _delete_cid_refs_file(self, cid_ref_abs_path): try: if not os.path.exists(cid_ref_abs_path): err_msg = ( - "FileHashStore - _delete_cid_refs_file: Cid reference file not found: %s", - cid_ref_abs_path, + "FileHashStore - _delete_cid_refs_file: Cid reference file not found: " + + cid_ref_abs_path ) logging.error(err_msg) raise FileNotFoundError(err_msg) - if os.path.getsize(cid_ref_abs_path) != 0: + elif os.path.getsize(cid_ref_abs_path) != 0: err_msg = ( "FileHashStore - _delete_cid_refs_file: Did not delete cid reference file." + f" File is not empty: {cid_ref_abs_path} " From 26a4daa0c0b2261dd703e683c1de73f3e3645edf Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 15 Jan 2024 15:50:08 -0800 Subject: [PATCH 111/420] Add new pytests to check storing and deleting objects with threads --- tests/test_filehashstore_interface.py | 76 +++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 6f418412..2b3640de 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -392,6 +392,8 @@ def test_store_object_duplicate_references_content(pids, store): for _, line in enumerate(f, start=1): value = line.strip() assert value == pid or value == pid_two or value == pid_three + assert len(os.listdir(store.root + "/refs/pid")) == 3 + assert len(os.listdir(store.root + "/refs/cid")) == 1 def test_store_object_duplicate_raises_error_with_bad_validation_data(pids, store): @@ -1054,3 +1056,77 @@ def test_get_hex_digest_algorithm_none(store): algorithm = None with pytest.raises(ValueError): store.get_hex_digest(pid, algorithm) + + +def test_store_and_delete_objects_100_pids_1_cid(store): + """Test that deleting an object that is tagged with 100 pids successfully + deletes all related files""" + test_dir = "tests/testdata/" + path = 
test_dir + "jtao.1700.1" + # Store + upper_limit = 101 + for i in range(1, upper_limit): + pid_modified = f"dou.test.{str(i)}" + store.store_object(pid_modified, path) + assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/pid")]) == 100 + assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/cid")]) == 1 + assert store.count("objects") == 1 + # Delete + for i in range(1, upper_limit): + pid_modified = f"dou.test.{str(i)}" + store.delete_object(pid_modified) + assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/pid")]) == 0 + assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/cid")]) == 0 + assert store.count("objects") == 0 + + +def test_store_and_delete_object_300_pids_1_cid_threads(store): + """Test store object thread lock.""" + + def store_object_wrapper(pid_var): + try: + test_dir = "tests/testdata/" + path = test_dir + "jtao.1700.1" + upper_limit = 101 + for i in range(1, upper_limit): + pid_modified = f"dou.test.{pid_var}.{str(i)}" + store.store_object(pid_modified, path) + # pylint: disable=W0718 + except Exception as e: + print(e) + + # Store + thread1 = Thread(target=store_object_wrapper, args=("matt",)) + thread2 = Thread(target=store_object_wrapper, args=("matthew",)) + thread3 = Thread(target=store_object_wrapper, args=("matthias",)) + thread1.start() + thread2.start() + thread3.start() + thread1.join() + thread2.join() + thread3.join() + + def delete_object_wrapper(pid_var): + try: + upper_limit = 101 + for i in range(1, upper_limit): + pid_modified = f"dou.test.{pid_var}.{str(i)}" + store.delete_object(pid_modified) + # pylint: disable=W0718 + except Exception as e: + print(e) + + # Delete + thread4 = Thread(target=delete_object_wrapper, args=("matt",)) + thread5 = Thread(target=delete_object_wrapper, args=("matthew",)) + thread6 = Thread(target=delete_object_wrapper, args=("matthias",)) + thread4.start() + thread5.start() + thread6.start() + thread4.join() + thread5.join() + 
thread6.join() + + assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/pid")]) == 0 + assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/cid")]) == 0 + assert store.count("objects") == 0 From 7539441a5b62372845b71bd7b4eecb0165b13d18 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 16 Jan 2024 09:30:24 -0800 Subject: [PATCH 112/420] Fix bug in '_delete_cid_refs_pid' where calling 'truncate()' before 'writelines()' caused unexpected behaviour in multiprocessing --- src/hashstore/filehashstore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 99c13668..1011df7f 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1300,8 +1300,8 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): if cid_pid_line.strip() != pid ] cid_ref_file.seek(0) - cid_ref_file.truncate() cid_ref_file.writelines(new_pid_lines) + cid_ref_file.truncate() # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) From da3ed216fa1423d229b625d6c42471eb8a9784ce Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 16 Jan 2024 11:46:29 -0800 Subject: [PATCH 113/420] Update 'hashstoreclient' to accept a '-gbskip' arg to represent file sizes to skip if desired when testing in knbvm --- src/hashstore/hashstoreclient.py | 72 ++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 22 deletions(-) diff --git a/src/hashstore/hashstoreclient.py b/src/hashstore/hashstoreclient.py index 2665c574..11ad5774 100644 --- a/src/hashstore/hashstoreclient.py +++ b/src/hashstore/hashstoreclient.py @@ -104,6 +104,11 @@ def __init__(self): action="store_true", help="Delete objects in a HashStore", ) + self.parser.add_argument( + "-gbskip", + dest="gb_file_size_to_skip", + help="Number of objects to convert", + ) # Individual API call related optional arguments 
self.parser.add_argument( @@ -257,17 +262,20 @@ def __init__(self, properties, testflag=None): # Methods relating to testing HashStore with knbvm (test.arcticdata.io) - def store_to_hashstore_from_list(self, origin_dir, obj_type, num): + def store_to_hashstore_from_list(self, origin_dir, obj_type, num, skip_obj_size): """Store objects in a given directory into HashStore. :param str origin_dir: Directory to convert. :param str obj_type: Type of objects ('object' or 'metadata'). :param int num: Number of files to store. + :param int skip_obj_size: Size of obj in GB to skip (ex. 4 = 4GB) """ info_msg = f"HashStoreClient - Begin storing {obj_type} objects." logging.info(info_msg) # Object and Metadata list - metacat_obj_list = self.metacatdb.get_object_metadata_list(origin_dir, num) + metacat_obj_list = self.metacatdb.get_object_metadata_list( + origin_dir, num, skip_obj_size + ) logging.info(info_msg) # Get list of objects to store from metacat db @@ -332,19 +340,24 @@ def try_store_metadata(self, obj_tuple): except Exception as so_exception: print(so_exception) - def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): + def retrieve_and_validate_from_hashstore( + self, origin_dir, obj_type, num, skip_obj_size + ): """Retrieve objects or metadata from a Hashstore and validate the content. :param str origin_dir: Directory to convert. :param str obj_type: Type of objects ('object' or 'metadata'). :param int num: Number of files to store. + :param int skip_obj_size: Size of obj in GB to skip (ex. 4 = 4GB) """ info_msg = ( f"HashStore Client - Begin retrieving and validating {obj_type} objects." 
) logging.info(info_msg) # Object and Metadata list - metacat_obj_list = self.metacatdb.get_object_metadata_list(origin_dir, num) + metacat_obj_list = self.metacatdb.get_object_metadata_list( + origin_dir, num, skip_obj_size + ) # Get list of objects to store from metacat db logging.info("HashStore Client - Refining object list for %s", obj_type) @@ -428,17 +441,20 @@ def validate_metadata(self, obj_tuple): return - def delete_objects_from_list(self, origin_dir, obj_type, num): + def delete_objects_from_list(self, origin_dir, obj_type, num, skip_obj_size): """Deletes objects in a given directory into HashStore. :param str origin_dir: Directory to convert. :param str obj_type: Type of objects ('object' or 'metadata'). :param int num: Number of files to store. + :param int skip_obj_size: Size of obj in GB to skip (ex. 4 = 4GB) """ info_msg = f"HashStore Client - Begin deleting {obj_type} objects." logging.info(info_msg) # Object and Metadata list - metacat_obj_list = self.metacatdb.get_object_metadata_list(origin_dir, num) + metacat_obj_list = self.metacatdb.get_object_metadata_list( + origin_dir, num, skip_obj_size + ) # Get list of objects to store from metacat db if obj_type == self.OBJ_TYPE: @@ -537,11 +553,12 @@ def __init__(self, hashstore_path, hashstore): checked_property = yaml_data[key] self.db_yaml_dict[key] = checked_property - def get_object_metadata_list(self, origin_directory, num): + def get_object_metadata_list(self, origin_directory, num, skip_obj_size=None): """Query the Metacat database for the full object and metadata list, ordered by GUID. :param str origin_directory: 'var/metacat/data' or 'var/metacat/documents'. :param int num: Number of rows to retrieve from the Metacat database. + :param int skip_obj_size: Size of obj in GB to skip (ex. 
4 = 4GB), defaults to 'None' """ # Create a connection to the database db_user = self.db_yaml_dict["db_user"] @@ -568,7 +585,7 @@ def get_object_metadata_list(self, origin_directory, num): limit_query = f" LIMIT {num}" query = f"""SELECT identifier.guid, identifier.docid, identifier.rev, systemmetadata.object_format, systemmetadata.checksum, - systemmetadata.checksum_algorithm FROM identifier INNER JOIN systemmetadata + systemmetadata.checksum_algorithm, systemmetadata.size FROM identifier INNER JOIN systemmetadata ON identifier.guid = systemmetadata.guid ORDER BY identifier.guid{limit_query};""" cursor.execute(query) @@ -578,21 +595,28 @@ def get_object_metadata_list(self, origin_directory, num): # Create full object list to store into HashStore print("Creating list of objects and metadata from metacat db") object_metadata_list = [] + gb_files_to_skip = skip_obj_size * (1024**3) for row in rows: - # Get pid, filepath and formatId - pid_guid = row[0] - metadatapath_docid_rev = origin_directory + "/" + row[1] + "." + str(row[2]) - metadata_namespace = row[3] - row_checksum = row[4] - row_checksum_algorithm = row[5] - tuple_item = ( - pid_guid, - metadatapath_docid_rev, - metadata_namespace, - row_checksum, - row_checksum_algorithm, - ) - object_metadata_list.append(tuple_item) + size = row[6] + if gb_files_to_skip is not None and size > gb_files_to_skip: + pass + else: + # Get pid, filepath and formatId + pid_guid = row[0] + metadatapath_docid_rev = ( + origin_directory + "/" + row[1] + "." 
+ str(row[2]) + ) + metadata_namespace = row[3] + row_checksum = row[4] + row_checksum_algorithm = row[5] + tuple_item = ( + pid_guid, + metadatapath_docid_rev, + metadata_namespace, + row_checksum, + row_checksum_algorithm, + ) + object_metadata_list.append(tuple_item) # Close the cursor and connection when done cursor.close() @@ -749,6 +773,7 @@ def main(): number_of_objects_to_convert = getattr(args, "num_obj_to_convert") # Determine if we are working with objects or metadata directory_type = getattr(args, "source_directory_type") + size_of_obj_to_skip = getattr(args, "gb_file_size_to_skip") accepted_directory_types = ["object", "metadata"] if directory_type not in accepted_directory_types: raise ValueError( @@ -760,18 +785,21 @@ def main(): directory_to_convert, directory_type, number_of_objects_to_convert, + size_of_obj_to_skip, ) if getattr(args, "retrieve_and_validate"): hashstore_c.retrieve_and_validate_from_hashstore( directory_to_convert, directory_type, number_of_objects_to_convert, + size_of_obj_to_skip, ) if getattr(args, "delete_from_hashstore"): hashstore_c.delete_objects_from_list( directory_to_convert, directory_type, number_of_objects_to_convert, + size_of_obj_to_skip, ) else: raise FileNotFoundError( From d389ed4f5986a04c267e30deb26cf2bef0977069 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 16 Jan 2024 13:02:29 -0800 Subject: [PATCH 114/420] Fix incorrect usage of 'pass' and replace with 'continue' in knbvm loop to refine object list --- src/hashstore/hashstoreclient.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hashstore/hashstoreclient.py b/src/hashstore/hashstoreclient.py index 11ad5774..41e3a2c6 100644 --- a/src/hashstore/hashstoreclient.py +++ b/src/hashstore/hashstoreclient.py @@ -599,7 +599,7 @@ def get_object_metadata_list(self, origin_directory, num, skip_obj_size=None): for row in rows: size = row[6] if gb_files_to_skip is not None and size > gb_files_to_skip: - pass + continue else: # Get pid, filepath 
and formatId pid_guid = row[0] From 11ad3c909f67abddb4778f350e009d7419e4a610 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 16 Jan 2024 13:28:56 -0800 Subject: [PATCH 115/420] Refactor '_delete_cid_refs_file' to improve clarity, revise logging statements and update pytests --- src/hashstore/filehashstore.py | 16 ++++++++-------- tests/test_filehashstore_references.py | 6 +++--- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 1011df7f..4679275f 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1328,18 +1328,18 @@ def _delete_cid_refs_file(self, cid_ref_abs_path): try: if not os.path.exists(cid_ref_abs_path): - err_msg = ( - "FileHashStore - _delete_cid_refs_file: Cid reference file not found: " - + cid_ref_abs_path + warn_msg = ( + "FileHashStore - _delete_cid_refs_file: Did not delete cid refs file: " + + f" File not found: {cid_ref_abs_path}" ) - logging.error(err_msg) - raise FileNotFoundError(err_msg) + logging.warning(warn_msg) + return False elif os.path.getsize(cid_ref_abs_path) != 0: - err_msg = ( + warn_msg = ( "FileHashStore - _delete_cid_refs_file: Did not delete cid reference file." 
- + f" File is not empty: {cid_ref_abs_path} " + + f" File is not empty: {cid_ref_abs_path}" ) - logging.warning(err_msg) + logging.warning(warn_msg) return False else: os.remove(cid_ref_abs_path) diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index d1cebd8a..d75ebe81 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -359,8 +359,8 @@ def test_delete_cid_refs_file_file_not_found(pids, store): for pid in pids.keys(): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - with pytest.raises(FileNotFoundError): - store._delete_cid_refs_file(cid_ref_abs_path) + is_cid_refs_file_deleted = store._delete_cid_refs_file(cid_ref_abs_path) + assert not is_cid_refs_file_deleted def test_write_pid_refs_file(pids, store): @@ -400,7 +400,7 @@ def test_delete_pid_refs_file_file_not_found(pids, store): for pid in pids.keys(): pid_ref_abs_path = store.get_refs_abs_path("pid", pid) with pytest.raises(FileNotFoundError): - store._delete_cid_refs_file(pid_ref_abs_path) + store._delete_pid_refs_file(pid_ref_abs_path) def test_verify_hashstore_references_pid_refs_file_missing(pids, store): From 8c8c12997c11f8e32ee33a03105baf9ca81618a9 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 16 Jan 2024 15:25:03 -0800 Subject: [PATCH 116/420] Fix bug when not specifying object sizes to skip with tests in knbvm --- src/hashstore/hashstoreclient.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/hashstore/hashstoreclient.py b/src/hashstore/hashstoreclient.py index 41e3a2c6..69f639c1 100644 --- a/src/hashstore/hashstoreclient.py +++ b/src/hashstore/hashstoreclient.py @@ -595,7 +595,10 @@ def get_object_metadata_list(self, origin_directory, num, skip_obj_size=None): # Create full object list to store into HashStore print("Creating list of objects and metadata from metacat db") object_metadata_list = [] - gb_files_to_skip = skip_obj_size * (1024**3) + 
gb_files_to_skip = None + if skip_obj_size is not None: + gb_files_to_skip = skip_obj_size * (1024**3) + for row in rows: size = row[6] if gb_files_to_skip is not None and size > gb_files_to_skip: From b79f1a8ee01a8a1e2206ff91990b55f2fe41027b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 17 Jan 2024 09:55:37 -0800 Subject: [PATCH 117/420] Add 'find_object' option to hashstoreclient and new pytest --- src/hashstore/hashstoreclient.py | 13 +++++++++++++ tests/test_hashstore_client.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/src/hashstore/hashstoreclient.py b/src/hashstore/hashstoreclient.py index 69f639c1..903476b7 100644 --- a/src/hashstore/hashstoreclient.py +++ b/src/hashstore/hashstoreclient.py @@ -154,6 +154,12 @@ def __init__(self): action="store_true", help="Flag to get the hex digest of a data object in HashStore", ) + self.parser.add_argument( + "-findobject", + dest="client_findobject", + action="store_true", + help="Flag to determine if an object is stored in HashStore", + ) self.parser.add_argument( "-storeobject", dest="client_storeobject", @@ -819,6 +825,13 @@ def main(): print(f"algorithm: {algorithm}") print(f"Checksum/Hex Digest: {digest}") + elif getattr(args, "client_findobject"): + if pid is None: + raise ValueError("'-pid' option is required") + # Find the content identifier of the object + cid = hashstore_c.hashstore.find_object(pid) + print(f"Content identifier: {cid}") + elif getattr(args, "client_storeobject"): if pid is None: raise ValueError("'-pid' option is required") diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 8f176452..6eaf16d7 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -77,6 +77,37 @@ def test_get_checksum(capsys, store, pids): assert capsystext == expected_output +def test_find_object(capsys, store, pids): + """Test find_object returns a content identifier if it exists.""" + client_directory = os.getcwd() + 
"/src/hashstore" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + cid = object_metadata.id + + client_module_path = f"{client_directory}/client.py" + test_store = store.root + find_object_opt = "-findobject" + client_pid_arg = f"-pid={pid}" + chs_args = [ + client_module_path, + test_store, + find_object_opt, + client_pid_arg, + ] + + # Add file path of HashStore to sys so modules can be discovered + sys.path.append(client_directory) + # Manually change sys args to simulate command line arguments + sys.argv = chs_args + hashstoreclient.main() + + capsystext = capsys.readouterr().out + expected_output = f"Content identifier: {cid}\n" + assert capsystext == expected_output + + def test_store_object(store, pids): """Test storing objects to HashStore through client.""" client_directory = os.getcwd() + "/src/hashstore" From 9d48e37622169f913f19624c71850a2b85e9b9b4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 17 Jan 2024 10:06:51 -0800 Subject: [PATCH 118/420] Revise exception string typo --- src/hashstore/filehashstore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 4679275f..8fd535cb 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1632,7 +1632,7 @@ def _validate_arg_object( # Delete the tmp file self.delete(entity, tmp_file_name) exception_string_for_pid = ( - exception_string + f"Tmp file ({tmp_file_name}) deleted." + exception_string + f". Tmp file ({tmp_file_name}) deleted." 
) logging.error(exception_string_for_pid) raise ValueError(exception_string_for_pid) From d5a4bb8fe9000b8b913c92550e357fc03bf480f5 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 17 Jan 2024 10:10:44 -0800 Subject: [PATCH 119/420] Update README.md --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 10f4d56a..d7f749c5 100644 --- a/README.md +++ b/README.md @@ -184,6 +184,16 @@ take a longer time to run (relating to the storage of large files) - to execute ## HashStore Client +Client API Options: +- `-getchecksum` (get_hex_digest) +- `-findobject` +- `-storeobject` +- `-storemetadata` +- `-retrieveobject` +- `-retrievemetadata` +- `-deleteobject` +- `-deletemetadata` + How to use HashStore client (command line app) ```sh # Step 1: Create a HashStore @@ -192,6 +202,9 @@ $ python './src/hashstore/hashstoreclient.py' /path/to/store/ -chs -dp=3 -wp=2 - # Get the checksum of a data object $ python './src/hashstore/hashstoreclient.py' /path/to/store/ -getchecksum -pid=content_identifier -algo=SHA-256 +# Find an object (returns the content identifier) +$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -findobject -pid=content_identifier + # Store a data object $ python './src/hashstore/hashstoreclient.py' /path/to/store/ -storeobject -pid=content_identifier -path=/path/to/object From 175c4b5d3bd22e2604fe52e1e97c088c8fb290c8 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 17 Jan 2024 11:05:15 -0800 Subject: [PATCH 120/420] Update version number from 1.0.0 to 1.1.0 to reflect major changes --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 1c9f80d9..c3526c4a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "hashstore" -version = "1.0.0" +version = "1.1.0" description = "HashStore, a hash-based object store for data packages." 
authors = ["Matt Jones ", "Dou Mok "] readme = "README.md" From be089daae18e32252f6f80c04676247dcb0f234e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 18 Jan 2024 14:00:36 -0800 Subject: [PATCH 121/420] Refactor 'delete_object' and remove redundant method '_delete_cid_refs_file' and tests --- src/hashstore/filehashstore.py | 53 ++------------------------ tests/test_filehashstore_references.py | 34 ++++++++--------- 2 files changed, 20 insertions(+), 67 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 8fd535cb..5343ff96 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -758,9 +758,9 @@ def delete_object(self, pid): # Remove pid from cid reference file self._delete_cid_refs_pid(cid_ref_abs_path, pid) self._delete_pid_refs_file(pid_ref_abs_path) - # Delete cid reference file, if the file is not empty, it will not be deleted. - cid_refs_deleted = self._delete_cid_refs_file(cid_ref_abs_path) - if cid_refs_deleted: + # Delete cid reference file and object + if os.path.getsize(cid_ref_abs_path) == 0: + os.remove(cid_ref_abs_path) # If the cid reference file has been deleted, delete the actual object entity = "objects" self.delete(entity, cid) @@ -772,7 +772,7 @@ def delete_object(self, pid): else: info_string = ( "FileHashStore - delete_object: Successfully deleted pid refs file but" - + f" not object with cid ({cid}), cid refs file still has references." + + f" not object with cid ({cid}), cid refs file not empty." ) logging.info(info_string) return True @@ -1313,51 +1313,6 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): logging.error(exception_string) raise err - def _delete_cid_refs_file(self, cid_ref_abs_path): - """Delete a CID reference file. There must be no references remaining. - - :param str cid_ref_abs_path: Absolute path to the CID reference file. - - :return: True if deleted, False if not. 
- :rtype: bool - """ - logging.debug( - "FileHashStore - _delete_cid_refs_file: Deleting reference file: %s", - cid_ref_abs_path, - ) - - try: - if not os.path.exists(cid_ref_abs_path): - warn_msg = ( - "FileHashStore - _delete_cid_refs_file: Did not delete cid refs file: " - + f" File not found: {cid_ref_abs_path}" - ) - logging.warning(warn_msg) - return False - elif os.path.getsize(cid_ref_abs_path) != 0: - warn_msg = ( - "FileHashStore - _delete_cid_refs_file: Did not delete cid reference file." - + f" File is not empty: {cid_ref_abs_path}" - ) - logging.warning(warn_msg) - return False - else: - os.remove(cid_ref_abs_path) - debug_msg = ( - "FileHashStore - _delete_cid_refs_file: Deleted cid reference file." - + cid_ref_abs_path - ) - logging.debug(debug_msg) - return True - - except Exception as err: - exception_string = ( - "FileHashStore - _delete_cid_refs_file: failed to delete cid refs file:" - + f" {cid_ref_abs_path}. Unexpected {err=}, {type(err)=}" - ) - logging.error(exception_string) - raise err - def _write_pid_refs_file(self, path, cid): """Generate a tmp pid refs file into the given path for the given CID (content identifier). A reference file for a PID contains the CID that it references. 
diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index d75ebe81..11f402ee 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -333,34 +333,32 @@ def test_delete_cid_refs_pid(pids, store): def test_delete_cid_refs_pid_file(pids, store): - """Test that delete_cid_refs_file deletes a reference file.""" + """Test that delete_cid_refs_pid leaves a file empty when removing the last pid.""" for pid in pids.keys(): tmp_root_path = store.get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) # First remove the pid store._delete_cid_refs_pid(tmp_cid_refs_file, pid) - cid_refs_deleted = store._delete_cid_refs_file(tmp_cid_refs_file) - assert cid_refs_deleted - assert not os.path.exists(tmp_cid_refs_file) + assert os.path.getsize(tmp_cid_refs_file) == 0 -def test_delete_cid_refs_file_file_not_empty(pids, store): - """Test that delete_cid_refs_file raises an exception when refs file is not empty.""" - for pid in pids.keys(): - tmp_root_path = store.get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) - is_cid_refs_file_deleted = store._delete_cid_refs_file(tmp_cid_refs_file) - assert not is_cid_refs_file_deleted +# def test_delete_cid_refs_file_file_not_empty(pids, store): +# """Test that delete_cid_refs_file raises an exception when refs file is not empty.""" +# for pid in pids.keys(): +# tmp_root_path = store.get_store_path("refs") / "tmp" +# tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) +# is_cid_refs_file_deleted = store._delete_cid_refs_file(tmp_cid_refs_file) +# assert not is_cid_refs_file_deleted -def test_delete_cid_refs_file_file_not_found(pids, store): - """Test that delete_cid_refs_file raises an exception when refs file not found.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - 
is_cid_refs_file_deleted = store._delete_cid_refs_file(cid_ref_abs_path) - assert not is_cid_refs_file_deleted +# def test_delete_cid_refs_file_file_not_found(pids, store): +# """Test that delete_cid_refs_file raises an exception when refs file not found.""" +# for pid in pids.keys(): +# cid = pids[pid]["sha256"] +# cid_ref_abs_path = store.get_refs_abs_path("cid", cid) +# is_cid_refs_file_deleted = store._delete_cid_refs_file(cid_ref_abs_path) +# assert not is_cid_refs_file_deleted def test_write_pid_refs_file(pids, store): From 509997188e2ea0aab07eee757b806499d73e01d5 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 18 Jan 2024 14:15:31 -0800 Subject: [PATCH 122/420] Refactor 'find_object' to not only get the cid from the pid refs file, but to check that the cid refs file also exists --- src/hashstore/filehashstore.py | 11 ++++++- tests/test_filehashstore_references.py | 42 +++++++------------------- 2 files changed, 21 insertions(+), 32 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 5343ff96..0473d2f3 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -631,7 +631,16 @@ def find_object(self, pid): pid_refs_cid = pid_ref_file.read() pid_ref_file.close() - return pid_refs_cid + cid_ref_abs_path = self.get_refs_abs_path("cid", pid_refs_cid) + if not os.path.exists(cid_ref_abs_path): + err_msg = ( + f"FileHashStore - find_object: pid refs file exists with cid: {pid_refs_cid}" + + f", but cid refs file not found: {cid_ref_abs_path}" + ) + logging.error(err_msg) + raise FileNotFoundError(err_msg) + else: + return pid_refs_cid def store_metadata(self, pid, metadata, format_id=None): logging.debug( diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 11f402ee..a9067d83 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -343,24 +343,6 @@ def test_delete_cid_refs_pid_file(pids, store): assert 
os.path.getsize(tmp_cid_refs_file) == 0 -# def test_delete_cid_refs_file_file_not_empty(pids, store): -# """Test that delete_cid_refs_file raises an exception when refs file is not empty.""" -# for pid in pids.keys(): -# tmp_root_path = store.get_store_path("refs") / "tmp" -# tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) -# is_cid_refs_file_deleted = store._delete_cid_refs_file(tmp_cid_refs_file) -# assert not is_cid_refs_file_deleted - - -# def test_delete_cid_refs_file_file_not_found(pids, store): -# """Test that delete_cid_refs_file raises an exception when refs file not found.""" -# for pid in pids.keys(): -# cid = pids[pid]["sha256"] -# cid_ref_abs_path = store.get_refs_abs_path("cid", cid) -# is_cid_refs_file_deleted = store._delete_cid_refs_file(cid_ref_abs_path) -# assert not is_cid_refs_file_deleted - - def test_write_pid_refs_file(pids, store): """Test that write_pid_refs_file writes a reference file.""" for pid in pids.keys(): @@ -411,22 +393,20 @@ def test_verify_hashstore_references_pid_refs_file_missing(pids, store): def test_verify_hashstore_references_pid_refs_incorrect_cid(pids, store): """Test _verify_hashstore_references throws exception when pid refs file cid is incorrect.""" + test_dir = "tests/testdata/" for pid in pids.keys(): - cid = pids[pid]["sha256"] - # Write the cid refs file and move it where it needs to be - tmp_root_path = store.get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - shutil.move(tmp_cid_refs_file, cid_ref_abs_path) - # Write the pid refs file and move it where it needs to be with a bad cid + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + cid = object_metadata.id + + # Place the wrong cid into the pid refs file that has already been created pid_ref_abs_path = store.get_refs_abs_path("pid", pid) - 
store.create_path(os.path.dirname(pid_ref_abs_path)) - tmp_root_path = store.get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, "bad_cid") - shutil.move(tmp_pid_refs_file, pid_ref_abs_path) + with open(pid_ref_abs_path, "w", encoding="utf8") as pid_ref_file: + pid_ref_file.seek(0) + pid_ref_file.write("intentionally.wrong.pid") + pid_ref_file.truncate() - with pytest.raises(ValueError): + with pytest.raises(FileNotFoundError): store._verify_hashstore_references(pid, cid, "create") From 5b12f008bba826e7eda3eba6ce91d6db2a7682a5 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 18 Jan 2024 14:31:18 -0800 Subject: [PATCH 123/420] Refactor 'get_real_path' to account for cid & pid refs files, and 'delete_object' & '_delete_cid_refs_file' to call '.delete()' --- src/hashstore/filehashstore.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 0473d2f3..f95afe88 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -767,12 +767,10 @@ def delete_object(self, pid): # Remove pid from cid reference file self._delete_cid_refs_pid(cid_ref_abs_path, pid) self._delete_pid_refs_file(pid_ref_abs_path) - # Delete cid reference file and object + # Delete cid reference file and object only if the cid refs file is empty if os.path.getsize(cid_ref_abs_path) == 0: - os.remove(cid_ref_abs_path) - # If the cid reference file has been deleted, delete the actual object - entity = "objects" - self.delete(entity, cid) + self.delete("cid", cid_ref_abs_path) + self.delete("objects", cid) info_string = ( "FileHashStore - delete_object: Successfully deleted references and" + f" object associated with pid: {pid}" @@ -1370,7 +1368,7 @@ def _delete_pid_refs_file(self, pid_ref_abs_path): ) raise FileNotFoundError(err_msg) else: - os.remove(pid_ref_abs_path) + self.delete("pid", pid_ref_abs_path) except Exception as err: 
exception_string = ( @@ -1928,6 +1926,10 @@ def get_real_path(self, entity, file): rel_root = self.objects elif entity == "metadata": rel_root = self.metadata + elif entity == "cid": + rel_root = self.refs + "/cid" + elif entity == "pid": + rel_root = self.refs + "/pid" else: raise ValueError( f"entity: {entity} does not exist. Do you mean 'objects' or 'metadata'?" From 657f8a9df5d3073efd2166f9b9ee7e1fd5652df1 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 18 Jan 2024 15:05:57 -0800 Subject: [PATCH 124/420] Delete redundant '.close()' statements --- src/hashstore/filehashstore.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index f95afe88..3cd4ec7f 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -149,7 +149,6 @@ def load_properties(self): self.hashstore_configuration_yaml, "r", encoding="utf-8" ) as hs_yaml_file: yaml_data = yaml.safe_load(hs_yaml_file) - hs_yaml_file.close() # Get hashstore properties hashstore_yaml_dict = {} @@ -227,7 +226,6 @@ def write_properties(self, properties): self.hashstore_configuration_yaml, "w", encoding="utf-8" ) as hs_yaml_file: hs_yaml_file.write(hashstore_configuration_yaml) - hs_yaml_file.close() logging.debug( "FileHashStore - write_properties: Configuration file written to: %s", @@ -397,7 +395,6 @@ def lookup_algo(algo_to_translate): self.hashstore_configuration_yaml, "r", encoding="utf-8" ) as hs_yaml_file: yaml_data = yaml.safe_load(hs_yaml_file) - hs_yaml_file.close() # Set default store algorithm self.algorithm = lookup_algo(yaml_data["store_algorithm"]) @@ -629,7 +626,6 @@ def find_object(self, pid): # Read the file to get the cid from the pid reference with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: pid_refs_cid = pid_ref_file.read() - pid_ref_file.close() cid_ref_abs_path = self.get_refs_abs_path("cid", pid_refs_cid) if not os.path.exists(cid_ref_abs_path): @@ -1125,7 
+1121,7 @@ def _write_to_tmp_file_and_get_hex_digests( tmp_file.write(self._to_bytes(data)) for hash_algorithm in hash_algorithms: hash_algorithm.update(self._to_bytes(data)) - tmp_file.close() + logging.debug( "FileHashStore - _write_to_tmp_file_and_get_hex_digests: Object stream" + " successfully written to tmp file: %s", @@ -1657,7 +1653,8 @@ def _verify_hashstore_references(self, pid, cid, verify_type): raise FileNotFoundError(exception_string) # Check the content of the reference files # Start with the cid - retrieved_cid = self.find_object(pid) + with open(pid_ref_abs_path, "r", encoding="utf8") as f: + retrieved_cid = f.read() if retrieved_cid != cid: exception_string = ( "FileHashStore - _verify_hashstore_references: Pid refs file exists" From 044cb62f8d9e06d7de1cb37b754bb0c593ec716d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 18 Jan 2024 15:17:57 -0800 Subject: [PATCH 125/420] Rename 'build_abs_path' method to 'build_path' --- src/hashstore/filehashstore.py | 48 +++++++++++++------------- tests/test_filehashstore.py | 4 +-- tests/test_filehashstore_interface.py | 35 ++++++++++++++----- tests/test_filehashstore_references.py | 25 ++++++++------ 4 files changed, 67 insertions(+), 45 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 3cd4ec7f..8ebc6848 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -988,7 +988,7 @@ def _move_and_get_checksums( # Objects are stored with their content identifier based on the store algorithm entity = "objects" object_cid = hex_digests.get(self.algorithm) - abs_file_path = self.build_abs_path(entity, object_cid, extension) + abs_file_path = self.build_path(entity, object_cid, extension) # Only move file if it doesn't exist. We do not check before we create the tmp # file and calculate the hex digests because the given checksum could be incorrect. 
@@ -1901,6 +1901,27 @@ def create_path(self, path): except FileExistsError: assert os.path.isdir(path), f"expected {path} to be a directory" + def build_path(self, entity, hash_id, extension=""): + """Build the absolute file path for a given hash ID with an optional file extension. + + :param str entity: Desired entity type (ex. "objects", "metadata"). + :param str hash_id: A hash ID to build a file path for. + :param str extension: An optional file extension to append to the file path. + + :return: An absolute file path for the specified hash ID. + :rtype: str + """ + paths = self.shard(hash_id) + root_dir = self.get_store_path(entity) + + if extension and not extension.startswith(os.extsep): + extension = os.extsep + extension + elif not extension: + extension = "" + + absolute_path = os.path.join(root_dir, *paths) + extension + return absolute_path + def get_real_path(self, entity, file): """Attempt to determine the real path of a file ID or path through successive checking of candidate paths. If the real path is stored with @@ -1936,34 +1957,13 @@ def get_real_path(self, entity, file): return relpath # Check for sharded path. - abspath = self.build_abs_path(entity, file) + abspath = self.build_path(entity, file) if os.path.isfile(abspath): return abspath # Could not determine a match. return None - def build_abs_path(self, entity, hash_id, extension=""): - """Build the absolute file path for a given hash ID with an optional file extension. - - :param str entity: Desired entity type (ex. "objects", "metadata"). - :param str hash_id: A hash ID to build a file path for. - :param str extension: An optional file extension to append to the file path. - - :return: An absolute file path for the specified hash ID. 
- :rtype: str - """ - paths = self.shard(hash_id) - root_dir = self.get_store_path(entity) - - if extension and not extension.startswith(os.extsep): - extension = os.extsep + extension - elif not extension: - extension = "" - - absolute_path = os.path.join(root_dir, *paths) + extension - return absolute_path - def get_refs_abs_path(self, ref_type, hash_id): """Get the absolute path to the reference file for the given ref_type. @@ -1981,7 +1981,7 @@ def get_refs_abs_path(self, ref_type, hash_id): entity = "refs" if ref_type == "pid": hash_id = self.computehash(hash_id, self.algorithm) - ref_file_abs_path = self.build_abs_path(entity, hash_id).replace( + ref_file_abs_path = self.build_path(entity, hash_id).replace( "/refs/", f"/refs/{ref_type}/" ) return ref_file_abs_path diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index fe309245..a34e3e1d 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1032,7 +1032,7 @@ def test_get_real_path_with_bad_entity(store, pids): store.get_real_path(entity, object_metadata.id) -def test_build_abs_path(store, pids): +def test_build_path(store, pids): """Test build_abs_path builds the absolute file path.""" test_dir = "tests/testdata/" entity = "objects" @@ -1040,7 +1040,7 @@ def test_build_abs_path(store, pids): path = test_dir + pid.replace("/", "_") _ = store.store_and_validate_data(pid, path) # pylint: disable=W0212 - abs_path = store.build_abs_path(entity, pids[pid][store.algorithm]) + abs_path = store.build_path(entity, pids[pid][store.algorithm]) assert os.path.exists(abs_path) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 2b3640de..8fb2a195 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -161,7 +161,7 @@ def test_store_object_additional_algorithm_invalid(store): def test_store_object_additional_algorithm_hyphen_uppercase(pids, store): - """Test store object formats algorithm in 
uppercase.""" + """Test store object formats a given algorithm that's in uppercase.""" test_dir = "tests/testdata/" entity = "objects" pid = "jtao.1700.1" @@ -581,7 +581,7 @@ def test_store_object_sparse_large_file(store): def test_find_object(pids, store): - """Test find object returns the correct content identifier (cid).""" + """Test find_object returns the correct content identifier (cid).""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -590,6 +590,25 @@ def test_find_object(pids, store): assert cid == object_metadata.hex_digests.get("sha256") +def test_find_object_pid_refs_cid_not_found(pids, store): + """Test find_object throws exception when pid refs file is found with a cid + but the cid does not exist.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + _object_metadata = store.store_object(pid, path) + + # Place the wrong cid into the pid refs file that has already been created + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) + with open(pid_ref_abs_path, "w", encoding="utf8") as pid_ref_file: + pid_ref_file.seek(0) + pid_ref_file.write("intentionally.wrong.pid") + pid_ref_file.truncate() + + with pytest.raises(FileNotFoundError): + store.find_object(pid) + + def test_find_object_pid_object_does_not_exist(store): """Test find object throws exception when object doesn't exist.""" with pytest.raises(FileNotFoundError): @@ -858,7 +877,7 @@ def test_retrieve_metadata_format_id_empty_spaces(store): store.retrieve_metadata(pid, format_id) -def test_delete_objects(pids, store): +def test_delete_object(pids, store): """Test delete_object successfully deletes objects from /objects.""" test_dir = "tests/testdata/" entity = "objects" @@ -873,8 +892,8 @@ def test_delete_objects(pids, store): assert store.count(entity) == 0 -def test_delete_objects_pid_refs_file(pids, store): - """Test delete_object deletes the pid refs file containing the cid.""" +def 
test_delete_object_pid_refs_file(pids, store): + """Test delete_object deletes the associated pid refs file for the object.""" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): @@ -888,8 +907,8 @@ def test_delete_objects_pid_refs_file(pids, store): assert not os.path.exists(pid_refs_file_path) -def test_delete_objects_cid_refs_file(pids, store): - """Test delete_object deletes the cid refs file containing the cid.""" +def test_delete_object_cid_refs_file(pids, store): + """Test delete_object deletes the associated cid refs file for the object.""" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): @@ -904,7 +923,7 @@ def test_delete_objects_cid_refs_file(pids, store): assert not os.path.exists(cid_refs_file_path) -def test_delete_objects_cid_refs_file_with_pid_refs_remaining(pids, store): +def test_delete_object_cid_refs_file_with_pid_refs_remaining(pids, store): """Test delete_object does not delete the cid refs file that still contains ref.""" test_dir = "tests/testdata/" for pid in pids.keys(): diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index a9067d83..1b7969d2 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -340,6 +340,7 @@ def test_delete_cid_refs_pid_file(pids, store): # First remove the pid store._delete_cid_refs_pid(tmp_cid_refs_file, pid) + assert os.path.exists(tmp_cid_refs_file) assert os.path.getsize(tmp_cid_refs_file) == 0 @@ -393,20 +394,22 @@ def test_verify_hashstore_references_pid_refs_file_missing(pids, store): def test_verify_hashstore_references_pid_refs_incorrect_cid(pids, store): """Test _verify_hashstore_references throws exception when pid refs file cid is incorrect.""" - test_dir = "tests/testdata/" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - cid = 
object_metadata.id - - # Place the wrong cid into the pid refs file that has already been created + cid = pids[pid]["sha256"] + # Write the cid refs file and move it where it needs to be + tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + shutil.move(tmp_cid_refs_file, cid_ref_abs_path) + # Write the pid refs file and move it where it needs to be with a bad cid pid_ref_abs_path = store.get_refs_abs_path("pid", pid) - with open(pid_ref_abs_path, "w", encoding="utf8") as pid_ref_file: - pid_ref_file.seek(0) - pid_ref_file.write("intentionally.wrong.pid") - pid_ref_file.truncate() + store.create_path(os.path.dirname(pid_ref_abs_path)) + tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, "bad_cid") + shutil.move(tmp_pid_refs_file, pid_ref_abs_path) - with pytest.raises(FileNotFoundError): + with pytest.raises(ValueError): store._verify_hashstore_references(pid, cid, "create") From 495b8ee0ada1803eea2a5729c0eeaabfc68417cc Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 18 Jan 2024 15:20:53 -0800 Subject: [PATCH 126/420] Rename 'get_real_path' to 'get_abs_path' and update tests --- src/hashstore/filehashstore.py | 10 +++++----- tests/test_filehashstore.py | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 8ebc6848..3a6122dc 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1789,7 +1789,7 @@ def exists(self, entity, file): :return: True if the file exists. 
:rtype: bool """ - file_exists = bool(self.get_real_path(entity, file)) + file_exists = bool(self.get_abs_path(entity, file)) return file_exists def shard(self, digest): @@ -1829,7 +1829,7 @@ def open(self, entity, file, mode="rb"): :return: An `io` stream dependent on the `mode`. :rtype: io.BufferedReader """ - realpath = self.get_real_path(entity, file) + realpath = self.get_abs_path(entity, file) if realpath is None: raise IOError(f"Could not locate file: {file}") @@ -1845,7 +1845,7 @@ def delete(self, entity, file): :param str entity: Desired entity type (ex. "objects", "metadata"). :param str file: Address ID or path of file. """ - realpath = self.get_real_path(entity, file) + realpath = self.get_abs_path(entity, file) if realpath is None: return None @@ -1922,8 +1922,8 @@ def build_path(self, entity, hash_id, extension=""): absolute_path = os.path.join(root_dir, *paths) + extension return absolute_path - def get_real_path(self, entity, file): - """Attempt to determine the real path of a file ID or path through + def get_abs_path(self, entity, file): + """Attempt to determine the absolute path of a file ID or path through successive checking of candidate paths. If the real path is stored with an extension, the path is considered a match if the basename matches the expected file path of the ID. 
diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index a34e3e1d..566687da 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -980,7 +980,7 @@ def test_get_real_path_file_does_not_exist(store): """Test get_real_path returns None when object does not exist.""" entity = "objects" test_path = "tests/testdata/helloworld.txt" - real_path_exists = store.get_real_path(entity, test_path) + real_path_exists = store.get_abs_path(entity, test_path) assert real_path_exists is None @@ -991,7 +991,7 @@ def test_get_real_path_with_object_id(store, pids): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) - obj_abs_path = store.get_real_path(entity, object_metadata.id) + obj_abs_path = store.get_abs_path(entity, object_metadata.id) assert os.path.exists(obj_abs_path) @@ -1004,7 +1004,7 @@ def test_get_real_path_with_object_id_sharded(pids, store): object_metadata = store.store_and_validate_data(pid, path) object_metadata_shard = store.shard(object_metadata.id) object_metadata_shard_path = "/".join(object_metadata_shard) - obj_abs_path = store.get_real_path(entity, object_metadata_shard_path) + obj_abs_path = store.get_abs_path(entity, object_metadata_shard_path) assert os.path.exists(obj_abs_path) @@ -1017,7 +1017,7 @@ def test_get_real_path_with_metadata_id(store, pids): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename metadata_cid = store.store_metadata(pid, syspath, format_id) - metadata_abs_path = store.get_real_path(entity, metadata_cid) + metadata_abs_path = store.get_abs_path(entity, metadata_cid) assert os.path.exists(metadata_abs_path) @@ -1029,7 +1029,7 @@ def test_get_real_path_with_bad_entity(store, pids): path = test_dir + pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) with pytest.raises(ValueError): - store.get_real_path(entity, object_metadata.id) + store.get_abs_path(entity, 
object_metadata.id) def test_build_path(store, pids): From e88f741a5f4e85949fde5473a682dd1d812e518b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 18 Jan 2024 15:42:05 -0800 Subject: [PATCH 127/420] Organize and rename methods to improve clarity --- src/hashstore/filehashstore.py | 282 ++++++++++++++++----------------- tests/test_filehashstore.py | 29 ++-- 2 files changed, 156 insertions(+), 155 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 3a6122dc..5a9f64eb 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -419,7 +419,7 @@ def store_object( checksum_algorithm=None, expected_object_size=None, ): - if pid is None and self._validate_arg_data(data): + if pid is None and self._check_arg_data(data): # If no pid is supplied, store the object only without tagging logging.debug("FileHashStore - store_object: Request to store data only.") object_metadata = self.store_data_only(data) @@ -433,13 +433,13 @@ def store_object( "FileHashStore - store_object: Request to store object for pid: %s", pid ) # Validate input parameters - self._validate_string(pid, "pid", "store_object") - self._validate_arg_data(data) - self._is_int_and_non_negative(expected_object_size) + self._check_string(pid, "pid", "store_object") + self._check_arg_data(data) + self._check_integer(expected_object_size) ( additional_algorithm_checked, checksum_algorithm_checked, - ) = self._validate_arg_algorithms_and_checksum( + ) = self._check_arg_algorithms_and_checksum( additional_algorithm, checksum, checksum_algorithm ) @@ -496,9 +496,9 @@ def store_object( def verify_object( self, object_metadata, checksum, checksum_algorithm, expected_file_size ): - self._validate_string(checksum, "checksum", "verify_object") - self._validate_string(checksum_algorithm, "checksum_algorithm", "verify_object") - self._is_int_and_non_negative(expected_file_size) + self._check_string(checksum, "checksum", "verify_object") + 
self._check_string(checksum_algorithm, "checksum_algorithm", "verify_object") + self._check_integer(expected_file_size) if object_metadata is None or not isinstance(object_metadata, ObjectMetadata): exception_string = ( "FileHashStore - verify_object: 'object_metadata' cannot be None." @@ -514,7 +514,7 @@ def verify_object( object_metadata_hex_digests = object_metadata.hex_digests object_metadata_file_size = object_metadata.obj_size checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) - self._validate_arg_object( + self._verify_object_information( pid=None, checksum=checksum, checksum_algorithm=checksum_algorithm_checked, @@ -535,8 +535,8 @@ def tag_object(self, pid, cid): cid, pid, ) - self._validate_string(pid, "pid", "tag_object") - self._validate_string(cid, "cid", "tag_object") + self._check_string(pid, "pid", "tag_object") + self._check_string(cid, "cid", "tag_object") # Wait for the cid to release if it's being tagged while cid in self.reference_locked_cids: logging.debug( @@ -613,7 +613,7 @@ def find_object(self, pid): logging.debug( "FileHashStore - find_object: Request to find object for for pid: %s", pid ) - self._validate_string(pid, "pid", "find_object") + self._check_string(pid, "pid", "find_object") pid_ref_abs_path = self.get_refs_abs_path("pid", pid) if not os.path.exists(pid_ref_abs_path): @@ -643,9 +643,9 @@ def store_metadata(self, pid, metadata, format_id=None): "FileHashStore - store_metadata: Request to store metadata for pid: %s", pid ) # Validate input parameters - self._validate_string(pid, "pid", "store_metadata") - checked_format_id = self._validate_arg_format_id(format_id, "store_metadata") - self._validate_arg_data(metadata) + self._check_string(pid, "pid", "store_metadata") + checked_format_id = self._check_arg_format_id(format_id, "store_metadata") + self._check_arg_data(metadata) # Wait for the pid to release if it's in use while pid in self.metadata_locked_pids: @@ -689,7 +689,7 @@ def retrieve_object(self, pid): 
"FileHashStore - retrieve_object: Request to retrieve object for pid: %s", pid, ) - self._validate_string(pid, "pid", "retrieve_object") + self._check_string(pid, "pid", "retrieve_object") object_cid = self.find_object(pid) entity = "objects" @@ -717,8 +717,8 @@ def retrieve_metadata(self, pid, format_id=None): "FileHashStore - retrieve_metadata: Request to retrieve metadata for pid: %s", pid, ) - self._validate_string(pid, "pid", "retrieve_metadata") - checked_format_id = self._validate_arg_format_id(format_id, "retrieve_metadata") + self._check_string(pid, "pid", "retrieve_metadata") + checked_format_id = self._check_arg_format_id(format_id, "retrieve_metadata") entity = "metadata" metadata_cid = self.computehash(pid + checked_format_id) @@ -741,7 +741,7 @@ def delete_object(self, pid): logging.debug( "FileHashStore - delete_object: Request to delete object for pid: %s", pid ) - self._validate_string(pid, "pid", "delete_object") + self._check_string(pid, "pid", "delete_object") cid = self.find_object(pid) while cid in self.reference_locked_cids: @@ -794,8 +794,8 @@ def delete_metadata(self, pid, format_id=None): "FileHashStore - delete_metadata: Request to delete metadata for pid: %s", pid, ) - self._validate_string(pid, "pid", "delete_metadata") - checked_format_id = self._validate_arg_format_id(format_id, "delete_metadata") + self._check_string(pid, "pid", "delete_metadata") + checked_format_id = self._check_arg_format_id(format_id, "delete_metadata") entity = "metadata" metadata_cid = self.computehash(pid + checked_format_id) @@ -812,8 +812,8 @@ def get_hex_digest(self, pid, algorithm): "FileHashStore - get_hex_digest: Request to get hex digest for object with pid: %s", pid, ) - self._validate_string(pid, "pid", "get_hex_digest") - self._validate_string(algorithm, "algorithm", "get_hex_digest") + self._check_string(pid, "pid", "get_hex_digest") + self._check_string(algorithm, "algorithm", "get_hex_digest") entity = "objects" algorithm = 
self.clean_algorithm(algorithm) @@ -994,7 +994,7 @@ def _move_and_get_checksums( # file and calculate the hex digests because the given checksum could be incorrect. if not os.path.isfile(abs_file_path): # Files are stored once and only once - self._validate_arg_object( + self._verify_object_information( pid, checksum, checksum_algorithm, @@ -1056,7 +1056,7 @@ def _move_and_get_checksums( else: # If the file exists, determine if the object is what the client states it to be try: - self._validate_arg_object( + self._verify_object_information( pid, checksum, checksum_algorithm, @@ -1118,9 +1118,9 @@ def _write_to_tmp_file_and_get_hex_digests( # tmp is a file-like object that is already opened for writing by default with tmp as tmp_file: for data in stream: - tmp_file.write(self._to_bytes(data)) + tmp_file.write(self._cast_to_bytes(data)) for hash_algorithm in hash_algorithms: - hash_algorithm.update(self._to_bytes(data)) + hash_algorithm.update(self._cast_to_bytes(data)) logging.debug( "FileHashStore - _write_to_tmp_file_and_get_hex_digests: Object stream" @@ -1450,7 +1450,7 @@ def _mktmpmetadata(self, stream): ) with tmp as tmp_file: for data in stream: - tmp_file.write(self._to_bytes(data)) + tmp_file.write(self._cast_to_bytes(data)) logging.debug( "FileHashStore - _mktmpmetadata: Successfully written to tmp metadata file: %s", @@ -1460,75 +1460,7 @@ def _mktmpmetadata(self, stream): # FileHashStore Utility & Supporting Methods - def _validate_arg_data(self, data): - """Checks a data argument to ensure that it is either a string, path, or stream - object. - - :param data: Object to validate (string, path, or stream). - :type data: str, os.PathLike, io.BufferedReader - - :return: True if valid. - :rtype: bool - """ - if ( - not isinstance(data, str) - and not isinstance(data, Path) - and not isinstance(data, io.BufferedIOBase) - ): - exception_string = ( - "FileHashStore - _validate_arg_data: Data must be a path, string or buffered" - + f" stream type. 
Data type supplied: {type(data)}" - ) - logging.error(exception_string) - raise TypeError(exception_string) - if isinstance(data, str): - if data.replace(" ", "") == "": - exception_string = ( - "FileHashStore - _validate_arg_data: Data string cannot be empty." - ) - logging.error(exception_string) - raise TypeError(exception_string) - return True - - def _validate_arg_algorithms_and_checksum( - self, additional_algorithm, checksum, checksum_algorithm - ): - """Determines whether the caller has supplied the necessary arguments to validate - an object with a checksum value. - - :param additional_algorithm: Value of the additional algorithm to calculate. - :type additional_algorithm: str or None - :param checksum: Value of the checksum. - :type checksum: str or None - :param checksum_algorithm: Algorithm of the checksum. - :type checksum_algorithm: str or None - - :return: Hashlib-compatible string or 'None' for additional_algorithm and - checksum_algorithm. - :rtype: str - """ - additional_algorithm_checked = None - if additional_algorithm != self.algorithm and additional_algorithm is not None: - # Set additional_algorithm - additional_algorithm_checked = self.clean_algorithm(additional_algorithm) - checksum_algorithm_checked = None - if checksum is not None: - self._validate_string( - checksum_algorithm, - "checksum_algorithm", - "_validate_arg_algorithms_and_checksum (store_object)", - ) - if checksum_algorithm is not None: - self._validate_string( - checksum, - "checksum", - "_validate_arg_algorithms_and_checksum (store_object)", - ) - # Set checksum_algorithm - checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) - return additional_algorithm_checked, checksum_algorithm_checked - - def _validate_arg_object( + def _verify_object_information( self, pid, checksum, @@ -1602,28 +1534,6 @@ def _validate_arg_object( logging.error(exception_string) raise ValueError(exception_string) - def _validate_arg_format_id(self, format_id, method): - """Determines 
the metadata namespace (format_id) to use for storing, - retrieving, and deleting metadata. - - :param str format_id: Metadata namespace to review. - :param str method: Calling method for logging purposes. - - :return: Valid metadata namespace. - :rtype: str - """ - checked_format_id = None - if format_id is not None and format_id.replace(" ", "") == "": - exception_string = f"FileHashStore - {method}: Format_id cannot be empty." - logging.error(exception_string) - raise ValueError(exception_string) - elif format_id is None: - # Use default value set by hashstore config - checked_format_id = self.sysmeta_ns - else: - checked_format_id = format_id - return checked_format_id - def _verify_hashstore_references(self, pid, cid, verify_type): """Verifies that the supplied pid and pid reference file and content have been written successfully. @@ -1680,6 +1590,96 @@ def _verify_hashstore_references(self, pid, cid, verify_type): logging.error(exception_string) raise ValueError(exception_string) + def _check_arg_data(self, data): + """Checks a data argument to ensure that it is either a string, path, or stream + object. + + :param data: Object to validate (string, path, or stream). + :type data: str, os.PathLike, io.BufferedReader + + :return: True if valid. + :rtype: bool + """ + if ( + not isinstance(data, str) + and not isinstance(data, Path) + and not isinstance(data, io.BufferedIOBase) + ): + exception_string = ( + "FileHashStore - _validate_arg_data: Data must be a path, string or buffered" + + f" stream type. Data type supplied: {type(data)}" + ) + logging.error(exception_string) + raise TypeError(exception_string) + if isinstance(data, str): + if data.replace(" ", "") == "": + exception_string = ( + "FileHashStore - _validate_arg_data: Data string cannot be empty." 
+ ) + logging.error(exception_string) + raise TypeError(exception_string) + return True + + def _check_arg_algorithms_and_checksum( + self, additional_algorithm, checksum, checksum_algorithm + ): + """Determines whether the caller has supplied the necessary arguments to validate + an object with a checksum value. + + :param additional_algorithm: Value of the additional algorithm to calculate. + :type additional_algorithm: str or None + :param checksum: Value of the checksum. + :type checksum: str or None + :param checksum_algorithm: Algorithm of the checksum. + :type checksum_algorithm: str or None + + :return: Hashlib-compatible string or 'None' for additional_algorithm and + checksum_algorithm. + :rtype: str + """ + additional_algorithm_checked = None + if additional_algorithm != self.algorithm and additional_algorithm is not None: + # Set additional_algorithm + additional_algorithm_checked = self.clean_algorithm(additional_algorithm) + checksum_algorithm_checked = None + if checksum is not None: + self._check_string( + checksum_algorithm, + "checksum_algorithm", + "_check_arg_algorithms_and_checksum (store_object)", + ) + if checksum_algorithm is not None: + self._check_string( + checksum, + "checksum", + "_check_arg_algorithms_and_checksum (store_object)", + ) + # Set checksum_algorithm + checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) + return additional_algorithm_checked, checksum_algorithm_checked + + def _check_arg_format_id(self, format_id, method): + """Determines the metadata namespace (format_id) to use for storing, + retrieving, and deleting metadata. + + :param str format_id: Metadata namespace to review. + :param str method: Calling method for logging purposes. + + :return: Valid metadata namespace. + :rtype: str + """ + checked_format_id = None + if format_id is not None and format_id.replace(" ", "") == "": + exception_string = f"FileHashStore - {method}: Format_id cannot be empty." 
+ logging.error(exception_string) + raise ValueError(exception_string) + elif format_id is None: + # Use default value set by hashstore config + checked_format_id = self.sysmeta_ns + else: + checked_format_id = format_id + return checked_format_id + def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): """Create the final list of hash algorithms to calculate. @@ -1760,26 +1760,10 @@ def computehash(self, stream, algorithm=None): check_algorithm = self.clean_algorithm(algorithm) hashobj = hashlib.new(check_algorithm) for data in stream: - hashobj.update(self._to_bytes(data)) + hashobj.update(self._cast_to_bytes(data)) hex_digest = hashobj.hexdigest() return hex_digest - def get_store_path(self, entity): - """Return a path object of the root directory of the store. - - :param str entity: Desired entity type: "objects" or "metadata" - """ - if entity == "objects": - return Path(self.objects) - elif entity == "metadata": - return Path(self.metadata) - elif entity == "refs": - return Path(self.refs) - else: - raise ValueError( - f"entity: {entity} does not exist. Do you mean 'objects', 'metadata' or 'refs'?" - ) - def exists(self, entity, file): """Check whether a given file id or path exists on disk. @@ -1986,6 +1970,22 @@ def get_refs_abs_path(self, ref_type, hash_id): ) return ref_file_abs_path + def get_store_path(self, entity): + """Return a path object of the root directory of the store. + + :param str entity: Desired entity type: "objects" or "metadata" + """ + if entity == "objects": + return Path(self.objects) + elif entity == "metadata": + return Path(self.metadata) + elif entity == "refs": + return Path(self.refs) + else: + raise ValueError( + f"entity: {entity} does not exist. Do you mean 'objects', 'metadata' or 'refs'?" + ) + def count(self, entity): """Return the count of the number of files in the `root` directory. 
@@ -2017,7 +2017,7 @@ def count(self, entity): # Other Static Methods @staticmethod - def _is_int_and_non_negative(file_size): + def _check_integer(file_size): """Check whether a given argument is an integer and greater than 0; throw an exception if not. @@ -2026,20 +2026,20 @@ def _is_int_and_non_negative(file_size): if file_size is not None: if not isinstance(file_size, int): exception_string = ( - "FileHashStore - _is_int_and_non_negative: size given must be an integer." + "FileHashStore - _check_integer: size given must be an integer." + f" File size: {file_size}. Arg Type: {type(file_size)}." ) logging.error(exception_string) raise TypeError(exception_string) if file_size < 1: exception_string = ( - "FileHashStore - _is_int_and_non_negative: size given must be > 0" + "FileHashStore - _check_integer: size given must be > 0" ) logging.error(exception_string) raise ValueError(exception_string) @staticmethod - def _validate_string(string, arg, method): + def _check_string(string, arg, method): """Check whether a string is None or empty; throw an exception if so. :param str string: Value to check. @@ -2055,7 +2055,7 @@ def _validate_string(string, arg, method): raise ValueError(exception_string) @staticmethod - def _to_bytes(text): + def _cast_to_bytes(text): """Convert text to a sequence of bytes using utf-8 encoding. :param str text: String to convert. 
diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 566687da..d36cde9a 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -666,8 +666,8 @@ def test_mktmpmetadata(pids, store): # Tests for FileHashStore Utility & Supporting Methods -def test_validate_arg_object(pids, store): - """Test _validate_arg_object succeeds given good arguments.""" +def test_verify_object_information(pids, store): + """Test _verify_object_information succeeds given good arguments.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -677,7 +677,7 @@ def test_validate_arg_object(pids, store): checksum_algorithm = store.algorithm expected_file_size = object_metadata.obj_size # pylint: disable=W0212 - store._validate_arg_object( + store._verify_object_information( None, checksum, checksum_algorithm, @@ -689,8 +689,8 @@ def test_validate_arg_object(pids, store): ) -def test_validate_arg_object_incorrect_size(pids, store): - """Test _validate_arg_object throws exception when size is incorrect.""" +def test_verify_object_information_incorrect_size(pids, store): + """Test _verify_object_information throws exception when size is incorrect.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -700,7 +700,7 @@ def test_validate_arg_object_incorrect_size(pids, store): checksum_algorithm = store.algorithm with pytest.raises(ValueError): # pylint: disable=W0212 - store._validate_arg_object( + store._verify_object_information( None, checksum, checksum_algorithm, @@ -712,8 +712,8 @@ def test_validate_arg_object_incorrect_size(pids, store): ) -def test_validate_arg_object_incorrect_size_with_pid(pids, store): - """Test _validate_arg_object deletes the expected tmp file if obj size does +def test_verify_object_information_incorrect_size_with_pid(pids, store): + """Test _verify_object_information deletes the expected tmp file if obj size does not match and raises an 
exception.""" test_dir = "tests/testdata/" for pid in pids.keys(): @@ -729,7 +729,7 @@ def test_validate_arg_object_incorrect_size_with_pid(pids, store): tmp_file = store._mktmpfile(objects_tmp_folder) assert os.path.isfile(tmp_file.name) with pytest.raises(ValueError): - store._validate_arg_object( + store._verify_object_information( "Test_Pid", checksum, checksum_algorithm, @@ -742,8 +742,9 @@ def test_validate_arg_object_incorrect_size_with_pid(pids, store): assert not os.path.isfile(tmp_file.name) -def test_validate_arg_object_missing_key_in_hex_digests(pids, store): - """Test _validate_arg_object throws exception when algorithm is not found in hex digests.""" +def test_verify_object_information_missing_key_in_hex_digests(pids, store): + """Test _verify_object_information throws exception when algorithm is not found + in hex digests.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -753,7 +754,7 @@ def test_validate_arg_object_missing_key_in_hex_digests(pids, store): expected_file_size = object_metadata.obj_size with pytest.raises(KeyError): # pylint: disable=W0212 - store._validate_arg_object( + store._verify_object_information( None, checksum, checksum_algorithm, @@ -1054,9 +1055,9 @@ def test_count(pids, store): assert store.count(entity) == 3 -def test_to_bytes(store): +def test_cast_to_bytes(store): """Test _to_bytes returns bytes.""" string = "teststring" # pylint: disable=W0212 - string_bytes = store._to_bytes(string) + string_bytes = store._cast_to_bytes(string) assert isinstance(string_bytes, bytes) From f9b414c54c7752047361e31571b474d45623f2b4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 18 Jan 2024 16:55:17 -0800 Subject: [PATCH 128/420] Rename 'get_abs_path' method to 'resolve_path' to improve clarity --- src/hashstore/filehashstore.py | 8 ++++---- tests/test_filehashstore.py | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/hashstore/filehashstore.py 
b/src/hashstore/filehashstore.py index 5a9f64eb..10836565 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1773,7 +1773,7 @@ def exists(self, entity, file): :return: True if the file exists. :rtype: bool """ - file_exists = bool(self.get_abs_path(entity, file)) + file_exists = bool(self.resolve_path(entity, file)) return file_exists def shard(self, digest): @@ -1813,7 +1813,7 @@ def open(self, entity, file, mode="rb"): :return: An `io` stream dependent on the `mode`. :rtype: io.BufferedReader """ - realpath = self.get_abs_path(entity, file) + realpath = self.resolve_path(entity, file) if realpath is None: raise IOError(f"Could not locate file: {file}") @@ -1829,7 +1829,7 @@ def delete(self, entity, file): :param str entity: Desired entity type (ex. "objects", "metadata"). :param str file: Address ID or path of file. """ - realpath = self.get_abs_path(entity, file) + realpath = self.resolve_path(entity, file) if realpath is None: return None @@ -1906,7 +1906,7 @@ def build_path(self, entity, hash_id, extension=""): absolute_path = os.path.join(root_dir, *paths) + extension return absolute_path - def get_abs_path(self, entity, file): + def resolve_path(self, entity, file): """Attempt to determine the absolute path of a file ID or path through successive checking of candidate paths. 
If the real path is stored with an extension, the path is considered a match if the basename matches diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index d36cde9a..4688ba87 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -981,7 +981,7 @@ def test_get_real_path_file_does_not_exist(store): """Test get_real_path returns None when object does not exist.""" entity = "objects" test_path = "tests/testdata/helloworld.txt" - real_path_exists = store.get_abs_path(entity, test_path) + real_path_exists = store.resolve_path(entity, test_path) assert real_path_exists is None @@ -992,7 +992,7 @@ def test_get_real_path_with_object_id(store, pids): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) - obj_abs_path = store.get_abs_path(entity, object_metadata.id) + obj_abs_path = store.resolve_path(entity, object_metadata.id) assert os.path.exists(obj_abs_path) @@ -1005,7 +1005,7 @@ def test_get_real_path_with_object_id_sharded(pids, store): object_metadata = store.store_and_validate_data(pid, path) object_metadata_shard = store.shard(object_metadata.id) object_metadata_shard_path = "/".join(object_metadata_shard) - obj_abs_path = store.get_abs_path(entity, object_metadata_shard_path) + obj_abs_path = store.resolve_path(entity, object_metadata_shard_path) assert os.path.exists(obj_abs_path) @@ -1018,7 +1018,7 @@ def test_get_real_path_with_metadata_id(store, pids): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename metadata_cid = store.store_metadata(pid, syspath, format_id) - metadata_abs_path = store.get_abs_path(entity, metadata_cid) + metadata_abs_path = store.resolve_path(entity, metadata_cid) assert os.path.exists(metadata_abs_path) @@ -1030,7 +1030,7 @@ def test_get_real_path_with_bad_entity(store, pids): path = test_dir + pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) with 
pytest.raises(ValueError): - store.get_abs_path(entity, object_metadata.id) + store.resolve_path(entity, object_metadata.id) def test_build_path(store, pids): From d434c3f697ed1b5588c46efcecf92e109d8167b3 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 18 Jan 2024 16:59:02 -0800 Subject: [PATCH 129/420] Rename 'get_refs_abs_path' to 'get_refs_path' --- src/hashstore/filehashstore.py | 28 +++++++--------- tests/test_filehashstore_interface.py | 14 ++++---- tests/test_filehashstore_references.py | 44 +++++++++++++------------- 3 files changed, 41 insertions(+), 45 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 10836565..31466e2e 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -552,8 +552,8 @@ def tag_object(self, pid, cid): ) self.reference_locked_cids.append(cid) try: - pid_ref_abs_path = self.get_refs_abs_path("pid", pid) - cid_ref_abs_path = self.get_refs_abs_path("cid", cid) + pid_ref_abs_path = self.get_refs_path("pid", pid) + cid_ref_abs_path = self.get_refs_path("cid", cid) tmp_root_path = self.get_store_path("refs") / "tmp" # Proceed to tagging process @@ -615,7 +615,7 @@ def find_object(self, pid): ) self._check_string(pid, "pid", "find_object") - pid_ref_abs_path = self.get_refs_abs_path("pid", pid) + pid_ref_abs_path = self.get_refs_path("pid", pid) if not os.path.exists(pid_ref_abs_path): err_msg = ( f"FileHashStore - find_object: pid ({pid}) reference file not found: " @@ -627,7 +627,7 @@ def find_object(self, pid): with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: pid_refs_cid = pid_ref_file.read() - cid_ref_abs_path = self.get_refs_abs_path("cid", pid_refs_cid) + cid_ref_abs_path = self.get_refs_path("cid", pid_refs_cid) if not os.path.exists(cid_ref_abs_path): err_msg = ( f"FileHashStore - find_object: pid refs file exists with cid: {pid_refs_cid}" @@ -758,8 +758,8 @@ def delete_object(self, pid): ) self.reference_locked_cids.append(cid) try: - 
cid_ref_abs_path = self.get_refs_abs_path("cid", cid) - pid_ref_abs_path = self.get_refs_abs_path("pid", pid) + cid_ref_abs_path = self.get_refs_path("cid", cid) + pid_ref_abs_path = self.get_refs_path("pid", pid) # Remove pid from cid reference file self._delete_cid_refs_pid(cid_ref_abs_path, pid) self._delete_pid_refs_file(pid_ref_abs_path) @@ -1529,7 +1529,7 @@ def _verify_object_information( else: # Delete the object cid = hex_digests[self.algorithm] - cid_abs_path = self.get_refs_abs_path("cid", cid) + cid_abs_path = self.get_refs_path("cid", cid) self.delete(entity, cid_abs_path) logging.error(exception_string) raise ValueError(exception_string) @@ -1543,8 +1543,8 @@ def _verify_hashstore_references(self, pid, cid, verify_type): :param str verify_type: "update" or "create" """ # Check that reference files were created - pid_ref_abs_path = self.get_refs_abs_path("pid", pid) - cid_ref_abs_path = self.get_refs_abs_path("cid", cid) + pid_ref_abs_path = self.get_refs_path("pid", pid) + cid_ref_abs_path = self.get_refs_path("cid", cid) if not os.path.exists(pid_ref_abs_path): exception_string = ( "FileHashStore - _verify_hashstore_references: Pid refs file missing: " @@ -1912,7 +1912,7 @@ def resolve_path(self, entity, file): an extension, the path is considered a match if the basename matches the expected file path of the ID. - :param str entity: Desired entity type (ex. "objects", "metadata"). + :param str entity: Desired entity type ("objects" or "metadata"). :param str file: Name of the file. :return: Whether the file is found or not. @@ -1928,10 +1928,6 @@ def resolve_path(self, entity, file): rel_root = self.objects elif entity == "metadata": rel_root = self.metadata - elif entity == "cid": - rel_root = self.refs + "/cid" - elif entity == "pid": - rel_root = self.refs + "/pid" else: raise ValueError( f"entity: {entity} does not exist. Do you mean 'objects' or 'metadata'?" 
@@ -1948,8 +1944,8 @@ def resolve_path(self, entity, file): # Could not determine a match. return None - def get_refs_abs_path(self, ref_type, hash_id): - """Get the absolute path to the reference file for the given ref_type. + def get_refs_path(self, ref_type, hash_id): + """Compute the absolute path to the reference file for the given ref_type. If a 'pid' is provided, this method will calculate the pid's hash based on the store algorithm and return the expected address of the pid reference file. If a 'cid' is diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 8fb2a195..9ae5e54b 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -365,7 +365,7 @@ def test_store_object_duplicate_references_files(pids, store): # Confirm that there are 1 cid reference files assert store.count("cid") == 1 # Confirm the content of the cid refence files - cid_ref_abs_path = store.get_refs_abs_path("cid", pids[pid][store.algorithm]) + cid_ref_abs_path = store.get_refs_path("cid", pids[pid][store.algorithm]) with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): value = line.strip() @@ -387,7 +387,7 @@ def test_store_object_duplicate_references_content(pids, store): pid_three = "dou.test.2" store.store_object(pid_three, path) # Confirm the content of the cid refence files - cid_ref_abs_path = store.get_refs_abs_path("cid", pids[pid][store.algorithm]) + cid_ref_abs_path = store.get_refs_path("cid", pids[pid][store.algorithm]) with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): value = line.strip() @@ -599,7 +599,7 @@ def test_find_object_pid_refs_cid_not_found(pids, store): _object_metadata = store.store_object(pid, path) # Place the wrong cid into the pid refs file that has already been created - pid_ref_abs_path = store.get_refs_abs_path("pid", pid) + pid_ref_abs_path = store.get_refs_path("pid", pid) with 
open(pid_ref_abs_path, "w", encoding="utf8") as pid_ref_file: pid_ref_file.seek(0) pid_ref_file.write("intentionally.wrong.pid") @@ -903,7 +903,7 @@ def test_delete_object_pid_refs_file(pids, store): _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) - pid_refs_file_path = store.get_refs_abs_path("pid", pid) + pid_refs_file_path = store.get_refs_path("pid", pid) assert not os.path.exists(pid_refs_file_path) @@ -919,7 +919,7 @@ def test_delete_object_cid_refs_file(pids, store): _metadata_cid = store.store_metadata(pid, syspath, format_id) cid = object_metadata.id store.delete_object(pid) - cid_refs_file_path = store.get_refs_abs_path("cid", cid) + cid_refs_file_path = store.get_refs_path("cid", cid) assert not os.path.exists(cid_refs_file_path) @@ -930,11 +930,11 @@ def test_delete_object_cid_refs_file_with_pid_refs_remaining(pids, store): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(pid, path) cid = object_metadata.id - cid_refs_abs_path = store.get_refs_abs_path("cid", cid) + cid_refs_abs_path = store.get_refs_path("cid", cid) # pylint: disable=W0212 store._update_cid_refs(cid_refs_abs_path, "dou.test.1") store.delete_object(pid) - cid_refs_file_path = store.get_refs_abs_path("cid", cid) + cid_refs_file_path = store.get_refs_path("cid", cid) assert os.path.exists(cid_refs_file_path) diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 1b7969d2..f69fb950 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -23,7 +23,7 @@ def test_tag_object_pid_refs_file(pids, store): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) store.tag_object(pid, object_metadata.id) - pid_refs_file_path = store.get_refs_abs_path("pid", pid) + pid_refs_file_path = store.get_refs_path("pid", pid) assert os.path.exists(pid_refs_file_path) @@ -35,9 
+35,9 @@ def test_tag_object_pid_refs_file_exists(pids, store): object_metadata = store.store_object(None, path) cid = object_metadata.id store.tag_object(pid, cid) - pid_refs_file_path = store.get_refs_abs_path("pid", pid) + pid_refs_file_path = store.get_refs_path("pid", pid) assert os.path.exists(pid_refs_file_path) - cid_refs_file_path = store.get_refs_abs_path("cid", cid) + cid_refs_file_path = store.get_refs_path("cid", cid) assert os.path.exists(cid_refs_file_path) with pytest.raises(FileExistsError): store.tag_object(pid, cid) @@ -50,7 +50,7 @@ def test_tag_object_pid_refs_file_content(pids, store): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) store.tag_object(pid, object_metadata.id) - pid_refs_file_path = store.get_refs_abs_path("pid", pid) + pid_refs_file_path = store.get_refs_path("pid", pid) with open(pid_refs_file_path, "r", encoding="utf8") as f: pid_refs_cid = f.read() assert pid_refs_cid == object_metadata.id @@ -64,7 +64,7 @@ def test_tag_object_cid_refs_file(pids, store): object_metadata = store.store_object(None, path) cid = object_metadata.id store.tag_object(pid, object_metadata.id) - cid_refs_file_path = store.get_refs_abs_path("cid", cid) + cid_refs_file_path = store.get_refs_path("cid", cid) assert os.path.exists(cid_refs_file_path) @@ -75,7 +75,7 @@ def test_tag_object_cid_refs_file_content(pids, store): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) store.tag_object(pid, object_metadata.id) - cid_refs_file_path = store.get_refs_abs_path("cid", object_metadata.id) + cid_refs_file_path = store.get_refs_path("cid", object_metadata.id) with open(cid_refs_file_path, "r", encoding="utf8") as f: pid_refs_cid = f.read().strip() assert pid_refs_cid == pid @@ -93,7 +93,7 @@ def test_tag_object_cid_refs_file_exists(pids, store): with pytest.raises(FileExistsError): store.tag_object(pid, another_cid) - second_cid_hash = store.get_refs_abs_path("cid", another_cid) + 
second_cid_hash = store.get_refs_path("cid", another_cid) assert not os.path.exists(second_cid_hash) @@ -112,7 +112,7 @@ def test_tag_object_cid_refs_update_cid_refs_updated(store): store.tag_object(additional_pid, cid) # Read cid file to confirm cid refs file contains the additional pid - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + cid_ref_abs_path = store.get_refs_path("cid", cid) with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): value = line.strip() @@ -134,7 +134,7 @@ def test_tag_object_cid_refs_update_pid_refs_created(store): additional_pid = "dou.test.1" store.tag_object(additional_pid, cid) - pid_refs_file_path = store.get_refs_abs_path("pid", additional_pid) + pid_refs_file_path = store.get_refs_path("pid", additional_pid) assert os.path.exists(pid_refs_file_path) @@ -149,11 +149,11 @@ def test_tag_object_cid_refs_update_pid_found_but_file_missing(store): cid = object_metadata.id # Manually update the cid refs, pid refs file missing at this point additional_pid = "dou.test.1" - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + cid_ref_abs_path = store.get_refs_path("cid", cid) store._update_cid_refs(cid_ref_abs_path, additional_pid) # Confirm the pid refs file is missing - pid_refs_file_path = store.get_refs_abs_path("pid", additional_pid) + pid_refs_file_path = store.get_refs_path("pid", additional_pid) assert not os.path.exists(pid_refs_file_path) # Call tag_object, this should create the missing pid refs file @@ -206,7 +206,7 @@ def test_verify_object_exception_incorrect_size(pids, store): cid = object_metadata.id cid = object_metadata.hex_digests[store.algorithm] - cid_abs_path = store.get_refs_abs_path("cid", cid) + cid_abs_path = store.get_refs_path("cid", cid) assert not os.path.exists(cid_abs_path) @@ -227,7 +227,7 @@ def test_verify_object_exception_incorrect_checksum(pids, store): cid = object_metadata.id cid = object_metadata.hex_digests[store.algorithm] - cid_abs_path = 
store.get_refs_abs_path("cid", cid) + cid_abs_path = store.get_refs_path("cid", cid) assert not os.path.exists(cid_abs_path) @@ -310,7 +310,7 @@ def test_update_cid_refs_content_cid_refs_does_not_exist(pids, store): """Test that update_cid_ref throws exception if cid refs file doesn't exist.""" for pid in pids.keys(): cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + cid_ref_abs_path = store.get_refs_path("cid", cid) with pytest.raises(FileNotFoundError): store._update_cid_refs(cid_ref_abs_path, pid) @@ -379,7 +379,7 @@ def test_delete_pid_refs_file(pids, store): def test_delete_pid_refs_file_file_not_found(pids, store): """Test that delete_pid_refs_file raises an exception when refs file not found.""" for pid in pids.keys(): - pid_ref_abs_path = store.get_refs_abs_path("pid", pid) + pid_ref_abs_path = store.get_refs_path("pid", pid) with pytest.raises(FileNotFoundError): store._delete_pid_refs_file(pid_ref_abs_path) @@ -399,11 +399,11 @@ def test_verify_hashstore_references_pid_refs_incorrect_cid(pids, store): # Write the cid refs file and move it where it needs to be tmp_root_path = store.get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + cid_ref_abs_path = store.get_refs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) # Write the pid refs file and move it where it needs to be with a bad cid - pid_ref_abs_path = store.get_refs_abs_path("pid", pid) + pid_ref_abs_path = store.get_refs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) tmp_root_path = store.get_store_path("refs") / "tmp" tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, "bad_cid") @@ -417,7 +417,7 @@ def test_verify_hashstore_references_cid_refs_file_missing(pids, store): """Test _verify_hashstore_references throws exception when cid refs file is missing.""" 
for pid in pids.keys(): cid = pids[pid]["sha256"] - pid_ref_abs_path = store.get_refs_abs_path("pid", pid) + pid_ref_abs_path = store.get_refs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) tmp_root_path = store.get_store_path("refs") / "tmp" tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, "bad_cid") @@ -435,11 +435,11 @@ def test_verify_hashstore_references_cid_refs_file_missing_pid(pids, store): # Get a tmp cid refs file and write the wrong pid into it tmp_root_path = store.get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, "bad pid") - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + cid_ref_abs_path = store.get_refs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) # Now write the pid refs file, both cid and pid refs must be present - pid_ref_abs_path = store.get_refs_abs_path("pid", pid) + pid_ref_abs_path = store.get_refs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) tmp_root_path = store.get_store_path("refs") / "tmp" tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) @@ -459,11 +459,11 @@ def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pi # Write the wrong pid into a cid refs file and move it where it needs to be tmp_root_path = store.get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, "bad pid") - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + cid_ref_abs_path = store.get_refs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) # Now write the pid refs with expected values - pid_ref_abs_path = store.get_refs_abs_path("pid", pid) + pid_ref_abs_path = store.get_refs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) tmp_root_path = store.get_store_path("refs") / "tmp" tmp_pid_refs_file = 
store._write_pid_refs_file(tmp_root_path, cid) From 4ad9926f391ca2cc3a21101485326a480194b585 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 19 Jan 2024 08:53:55 -0800 Subject: [PATCH 130/420] Absorb 'get_refs_path' functionality into 'resolve_path' method and update affected code and tests --- src/hashstore/filehashstore.py | 81 +++++++++++--------------- tests/test_filehashstore_interface.py | 15 ++--- tests/test_filehashstore_references.py | 44 +++++++------- 3 files changed, 63 insertions(+), 77 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 31466e2e..2420fbc1 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -552,8 +552,8 @@ def tag_object(self, pid, cid): ) self.reference_locked_cids.append(cid) try: - pid_ref_abs_path = self.get_refs_path("pid", pid) - cid_ref_abs_path = self.get_refs_path("cid", cid) + pid_ref_abs_path = self.resolve_path("pid", pid) + cid_ref_abs_path = self.resolve_path("cid", cid) tmp_root_path = self.get_store_path("refs") / "tmp" # Proceed to tagging process @@ -615,7 +615,7 @@ def find_object(self, pid): ) self._check_string(pid, "pid", "find_object") - pid_ref_abs_path = self.get_refs_path("pid", pid) + pid_ref_abs_path = self.resolve_path("pid", pid) if not os.path.exists(pid_ref_abs_path): err_msg = ( f"FileHashStore - find_object: pid ({pid}) reference file not found: " @@ -627,7 +627,7 @@ def find_object(self, pid): with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: pid_refs_cid = pid_ref_file.read() - cid_ref_abs_path = self.get_refs_path("cid", pid_refs_cid) + cid_ref_abs_path = self.resolve_path("cid", pid_refs_cid) if not os.path.exists(cid_ref_abs_path): err_msg = ( f"FileHashStore - find_object: pid refs file exists with cid: {pid_refs_cid}" @@ -758,8 +758,8 @@ def delete_object(self, pid): ) self.reference_locked_cids.append(cid) try: - cid_ref_abs_path = self.get_refs_path("cid", cid) - pid_ref_abs_path = 
self.get_refs_path("pid", pid) + cid_ref_abs_path = self.resolve_path("cid", cid) + pid_ref_abs_path = self.resolve_path("pid", pid) # Remove pid from cid reference file self._delete_cid_refs_pid(cid_ref_abs_path, pid) self._delete_pid_refs_file(pid_ref_abs_path) @@ -1529,7 +1529,7 @@ def _verify_object_information( else: # Delete the object cid = hex_digests[self.algorithm] - cid_abs_path = self.get_refs_path("cid", cid) + cid_abs_path = self.resolve_path("cid", cid) self.delete(entity, cid_abs_path) logging.error(exception_string) raise ValueError(exception_string) @@ -1543,8 +1543,8 @@ def _verify_hashstore_references(self, pid, cid, verify_type): :param str verify_type: "update" or "create" """ # Check that reference files were created - pid_ref_abs_path = self.get_refs_path("pid", pid) - cid_ref_abs_path = self.get_refs_path("cid", cid) + pid_ref_abs_path = self.resolve_path("pid", pid) + cid_ref_abs_path = self.resolve_path("cid", cid) if not os.path.exists(pid_ref_abs_path): exception_string = ( "FileHashStore - _verify_hashstore_references: Pid refs file missing: " @@ -1908,15 +1908,14 @@ def build_path(self, entity, hash_id, extension=""): def resolve_path(self, entity, file): """Attempt to determine the absolute path of a file ID or path through - successive checking of candidate paths. If the real path is stored with - an extension, the path is considered a match if the basename matches - the expected file path of the ID. + successive checking of candidate paths. - :param str entity: Desired entity type ("objects" or "metadata"). + :param str entity: Desired entity type ("objects", "metadata", "cid", "pid"), + where "cid" & "pid" represents resolving the path to the refs files. :param str file: Name of the file. - :return: Whether the file is found or not. - :rtype: bool + :return: Path to file + :rtype: str """ # Check for absolute path. 
if os.path.isfile(file): @@ -1926,50 +1925,32 @@ def resolve_path(self, entity, file): rel_root = "" if entity == "objects": rel_root = self.objects - elif entity == "metadata": + if entity == "metadata": rel_root = self.metadata - else: - raise ValueError( - f"entity: {entity} does not exist. Do you mean 'objects' or 'metadata'?" - ) relpath = os.path.join(rel_root, file) if os.path.isfile(relpath): return relpath # Check for sharded path. - abspath = self.build_path(entity, file) - if os.path.isfile(abspath): - return abspath - - # Could not determine a match. - return None - - def get_refs_path(self, ref_type, hash_id): - """Compute the absolute path to the reference file for the given ref_type. - - If a 'pid' is provided, this method will calculate the pid's hash based on the store - algorithm and return the expected address of the pid reference file. If a 'cid' is - provided, this method will return the expected address by sharding the cid based on - HashStore's configuration. - - :param str ref_type: 'pid' or 'cid' - :param str hash_id: Authority-based, persistent, or hash identifier - - :return: Path to the reference file for the given type and ID. - :rtype: str - """ - entity = "refs" - if ref_type == "pid": - hash_id = self.computehash(hash_id, self.algorithm) - ref_file_abs_path = self.build_path(entity, hash_id).replace( - "/refs/", f"/refs/{ref_type}/" - ) - return ref_file_abs_path + if entity == "cid": + ref_file_abs_path = self.build_path(entity, file) + return ref_file_abs_path + elif entity == "pid": + hash_id = self.computehash(file, self.algorithm) + ref_file_abs_path = self.build_path(entity, hash_id) + return ref_file_abs_path + else: + abspath = self.build_path(entity, file) + if os.path.isfile(abspath): + return abspath def get_store_path(self, entity): """Return a path object of the root directory of the store. 
:param str entity: Desired entity type: "objects" or "metadata" + + :return: Path to requested store entity type + :rtype: Path """ if entity == "objects": return Path(self.objects) @@ -1977,6 +1958,10 @@ def get_store_path(self, entity): return Path(self.metadata) elif entity == "refs": return Path(self.refs) + elif entity == "cid": + return Path(self.refs) / "cid" + elif entity == "pid": + return Path(self.refs) / "pid" else: raise ValueError( f"entity: {entity} does not exist. Do you mean 'objects', 'metadata' or 'refs'?" diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 9ae5e54b..f38f46d9 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -365,7 +365,7 @@ def test_store_object_duplicate_references_files(pids, store): # Confirm that there are 1 cid reference files assert store.count("cid") == 1 # Confirm the content of the cid refence files - cid_ref_abs_path = store.get_refs_path("cid", pids[pid][store.algorithm]) + cid_ref_abs_path = store.resolve_path("cid", pids[pid][store.algorithm]) with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): value = line.strip() @@ -387,11 +387,12 @@ def test_store_object_duplicate_references_content(pids, store): pid_three = "dou.test.2" store.store_object(pid_three, path) # Confirm the content of the cid refence files - cid_ref_abs_path = store.get_refs_path("cid", pids[pid][store.algorithm]) + cid_ref_abs_path = store.resolve_path("cid", pids[pid][store.algorithm]) with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): value = line.strip() assert value == pid or value == pid_two or value == pid_three + print(os.listdir(store.root + "/refs/pid/")) assert len(os.listdir(store.root + "/refs/pid")) == 3 assert len(os.listdir(store.root + "/refs/cid")) == 1 @@ -599,7 +600,7 @@ def test_find_object_pid_refs_cid_not_found(pids, store): _object_metadata = 
store.store_object(pid, path) # Place the wrong cid into the pid refs file that has already been created - pid_ref_abs_path = store.get_refs_path("pid", pid) + pid_ref_abs_path = store.resolve_path("pid", pid) with open(pid_ref_abs_path, "w", encoding="utf8") as pid_ref_file: pid_ref_file.seek(0) pid_ref_file.write("intentionally.wrong.pid") @@ -903,7 +904,7 @@ def test_delete_object_pid_refs_file(pids, store): _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) - pid_refs_file_path = store.get_refs_path("pid", pid) + pid_refs_file_path = store.resolve_path("pid", pid) assert not os.path.exists(pid_refs_file_path) @@ -919,7 +920,7 @@ def test_delete_object_cid_refs_file(pids, store): _metadata_cid = store.store_metadata(pid, syspath, format_id) cid = object_metadata.id store.delete_object(pid) - cid_refs_file_path = store.get_refs_path("cid", cid) + cid_refs_file_path = store.resolve_path("cid", cid) assert not os.path.exists(cid_refs_file_path) @@ -930,11 +931,11 @@ def test_delete_object_cid_refs_file_with_pid_refs_remaining(pids, store): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(pid, path) cid = object_metadata.id - cid_refs_abs_path = store.get_refs_path("cid", cid) + cid_refs_abs_path = store.resolve_path("cid", cid) # pylint: disable=W0212 store._update_cid_refs(cid_refs_abs_path, "dou.test.1") store.delete_object(pid) - cid_refs_file_path = store.get_refs_path("cid", cid) + cid_refs_file_path = store.resolve_path("cid", cid) assert os.path.exists(cid_refs_file_path) diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index f69fb950..ccd147fd 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -23,7 +23,7 @@ def test_tag_object_pid_refs_file(pids, store): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) 
store.tag_object(pid, object_metadata.id) - pid_refs_file_path = store.get_refs_path("pid", pid) + pid_refs_file_path = store.resolve_path("pid", pid) assert os.path.exists(pid_refs_file_path) @@ -35,9 +35,9 @@ def test_tag_object_pid_refs_file_exists(pids, store): object_metadata = store.store_object(None, path) cid = object_metadata.id store.tag_object(pid, cid) - pid_refs_file_path = store.get_refs_path("pid", pid) + pid_refs_file_path = store.resolve_path("pid", pid) assert os.path.exists(pid_refs_file_path) - cid_refs_file_path = store.get_refs_path("cid", cid) + cid_refs_file_path = store.resolve_path("cid", cid) assert os.path.exists(cid_refs_file_path) with pytest.raises(FileExistsError): store.tag_object(pid, cid) @@ -50,7 +50,7 @@ def test_tag_object_pid_refs_file_content(pids, store): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) store.tag_object(pid, object_metadata.id) - pid_refs_file_path = store.get_refs_path("pid", pid) + pid_refs_file_path = store.resolve_path("pid", pid) with open(pid_refs_file_path, "r", encoding="utf8") as f: pid_refs_cid = f.read() assert pid_refs_cid == object_metadata.id @@ -64,7 +64,7 @@ def test_tag_object_cid_refs_file(pids, store): object_metadata = store.store_object(None, path) cid = object_metadata.id store.tag_object(pid, object_metadata.id) - cid_refs_file_path = store.get_refs_path("cid", cid) + cid_refs_file_path = store.resolve_path("cid", cid) assert os.path.exists(cid_refs_file_path) @@ -75,7 +75,7 @@ def test_tag_object_cid_refs_file_content(pids, store): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) store.tag_object(pid, object_metadata.id) - cid_refs_file_path = store.get_refs_path("cid", object_metadata.id) + cid_refs_file_path = store.resolve_path("cid", object_metadata.id) with open(cid_refs_file_path, "r", encoding="utf8") as f: pid_refs_cid = f.read().strip() assert pid_refs_cid == pid @@ -93,7 +93,7 @@ def 
test_tag_object_cid_refs_file_exists(pids, store): with pytest.raises(FileExistsError): store.tag_object(pid, another_cid) - second_cid_hash = store.get_refs_path("cid", another_cid) + second_cid_hash = store.resolve_path("cid", another_cid) assert not os.path.exists(second_cid_hash) @@ -112,7 +112,7 @@ def test_tag_object_cid_refs_update_cid_refs_updated(store): store.tag_object(additional_pid, cid) # Read cid file to confirm cid refs file contains the additional pid - cid_ref_abs_path = store.get_refs_path("cid", cid) + cid_ref_abs_path = store.resolve_path("cid", cid) with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): value = line.strip() @@ -134,7 +134,7 @@ def test_tag_object_cid_refs_update_pid_refs_created(store): additional_pid = "dou.test.1" store.tag_object(additional_pid, cid) - pid_refs_file_path = store.get_refs_path("pid", additional_pid) + pid_refs_file_path = store.resolve_path("pid", additional_pid) assert os.path.exists(pid_refs_file_path) @@ -149,11 +149,11 @@ def test_tag_object_cid_refs_update_pid_found_but_file_missing(store): cid = object_metadata.id # Manually update the cid refs, pid refs file missing at this point additional_pid = "dou.test.1" - cid_ref_abs_path = store.get_refs_path("cid", cid) + cid_ref_abs_path = store.resolve_path("cid", cid) store._update_cid_refs(cid_ref_abs_path, additional_pid) # Confirm the pid refs file is missing - pid_refs_file_path = store.get_refs_path("pid", additional_pid) + pid_refs_file_path = store.resolve_path("pid", additional_pid) assert not os.path.exists(pid_refs_file_path) # Call tag_object, this should create the missing pid refs file @@ -206,7 +206,7 @@ def test_verify_object_exception_incorrect_size(pids, store): cid = object_metadata.id cid = object_metadata.hex_digests[store.algorithm] - cid_abs_path = store.get_refs_path("cid", cid) + cid_abs_path = store.resolve_path("cid", cid) assert not os.path.exists(cid_abs_path) @@ -227,7 +227,7 @@ def 
test_verify_object_exception_incorrect_checksum(pids, store): cid = object_metadata.id cid = object_metadata.hex_digests[store.algorithm] - cid_abs_path = store.get_refs_path("cid", cid) + cid_abs_path = store.resolve_path("cid", cid) assert not os.path.exists(cid_abs_path) @@ -310,7 +310,7 @@ def test_update_cid_refs_content_cid_refs_does_not_exist(pids, store): """Test that update_cid_ref throws exception if cid refs file doesn't exist.""" for pid in pids.keys(): cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_path("cid", cid) + cid_ref_abs_path = store.resolve_path("cid", cid) with pytest.raises(FileNotFoundError): store._update_cid_refs(cid_ref_abs_path, pid) @@ -379,7 +379,7 @@ def test_delete_pid_refs_file(pids, store): def test_delete_pid_refs_file_file_not_found(pids, store): """Test that delete_pid_refs_file raises an exception when refs file not found.""" for pid in pids.keys(): - pid_ref_abs_path = store.get_refs_path("pid", pid) + pid_ref_abs_path = store.resolve_path("pid", pid) with pytest.raises(FileNotFoundError): store._delete_pid_refs_file(pid_ref_abs_path) @@ -399,11 +399,11 @@ def test_verify_hashstore_references_pid_refs_incorrect_cid(pids, store): # Write the cid refs file and move it where it needs to be tmp_root_path = store.get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) - cid_ref_abs_path = store.get_refs_path("cid", cid) + cid_ref_abs_path = store.resolve_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) # Write the pid refs file and move it where it needs to be with a bad cid - pid_ref_abs_path = store.get_refs_path("pid", pid) + pid_ref_abs_path = store.resolve_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) tmp_root_path = store.get_store_path("refs") / "tmp" tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, "bad_cid") @@ -417,7 +417,7 @@ def 
test_verify_hashstore_references_cid_refs_file_missing(pids, store): """Test _verify_hashstore_references throws exception when cid refs file is missing.""" for pid in pids.keys(): cid = pids[pid]["sha256"] - pid_ref_abs_path = store.get_refs_path("pid", pid) + pid_ref_abs_path = store.resolve_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) tmp_root_path = store.get_store_path("refs") / "tmp" tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, "bad_cid") @@ -435,11 +435,11 @@ def test_verify_hashstore_references_cid_refs_file_missing_pid(pids, store): # Get a tmp cid refs file and write the wrong pid into it tmp_root_path = store.get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, "bad pid") - cid_ref_abs_path = store.get_refs_path("cid", cid) + cid_ref_abs_path = store.resolve_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) # Now write the pid refs file, both cid and pid refs must be present - pid_ref_abs_path = store.get_refs_path("pid", pid) + pid_ref_abs_path = store.resolve_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) tmp_root_path = store.get_store_path("refs") / "tmp" tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) @@ -459,11 +459,11 @@ def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pi # Write the wrong pid into a cid refs file and move it where it needs to be tmp_root_path = store.get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, "bad pid") - cid_ref_abs_path = store.get_refs_path("cid", cid) + cid_ref_abs_path = store.resolve_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) # Now write the pid refs with expected values - pid_ref_abs_path = store.get_refs_path("pid", pid) + pid_ref_abs_path = store.resolve_path("pid", pid) 
store.create_path(os.path.dirname(pid_ref_abs_path)) tmp_root_path = store.get_store_path("refs") / "tmp" tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) From 8899f3251ddc6dd4e35b25f5129633ac2cf14614 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 19 Jan 2024 12:08:50 -0800 Subject: [PATCH 131/420] Fix bug when skipping file sizes to test in knbvm by casting strings to integers for proper comparison --- src/hashstore/hashstoreclient.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hashstore/hashstoreclient.py b/src/hashstore/hashstoreclient.py index 903476b7..a1457e46 100644 --- a/src/hashstore/hashstoreclient.py +++ b/src/hashstore/hashstoreclient.py @@ -603,10 +603,10 @@ def get_object_metadata_list(self, origin_directory, num, skip_obj_size=None): object_metadata_list = [] gb_files_to_skip = None if skip_obj_size is not None: - gb_files_to_skip = skip_obj_size * (1024**3) + gb_files_to_skip = int(skip_obj_size) * (1024**3) for row in rows: - size = row[6] + size = int(row[6]) if gb_files_to_skip is not None and size > gb_files_to_skip: continue else: From a921a7dabf98a8e32ceb64ebd7f6a39a51e02328 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 23 Jan 2024 14:50:03 -0800 Subject: [PATCH 132/420] Tidy up logging statements in '_verify_object_information()' --- src/hashstore/filehashstore.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 2420fbc1..96efdf20 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1488,7 +1488,7 @@ def _verify_object_information( exception_string = ( "FileHashStore - _validate_arg_object: Object file size calculated: " + f" {tmp_file_size} does not match with expected size:" - + f"{file_size_to_validate}." + + f" {file_size_to_validate}." 
) if pid is not None: self.delete(entity, tmp_file_name) @@ -1522,7 +1522,7 @@ def _verify_object_information( # Delete the tmp file self.delete(entity, tmp_file_name) exception_string_for_pid = ( - exception_string + f". Tmp file ({tmp_file_name}) deleted." + exception_string + f" Tmp file ({tmp_file_name}) deleted." ) logging.error(exception_string_for_pid) raise ValueError(exception_string_for_pid) From d0cf9128e6ce57454e2c77f594c96228eb3b5da2 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 23 Jan 2024 14:56:21 -0800 Subject: [PATCH 133/420] Update 'ObjectMetadata' class attribute 'id' to 'cid' and revise all affected code, tests and documentation --- README.md | 18 +++++++------- src/hashstore/filehashstore.py | 8 +++--- src/hashstore/hashstore.py | 6 ++--- tests/test_filehashstore.py | 26 ++++++++++---------- tests/test_filehashstore_interface.py | 14 +++++------ tests/test_filehashstore_references.py | 34 +++++++++++++------------- tests/test_hashstore.py | 2 +- tests/test_hashstore_client.py | 2 +- 8 files changed, 55 insertions(+), 55 deletions(-) diff --git a/README.md b/README.md index d7f749c5..19056ee7 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ my_store = factory.get_hashstore(module_name, class_name, properties) pid = "j.tao.1700.1" object = "/path/to/your/object.data" object_metadata = my_store.store_object(pid, object) -object_cid = object_metadata.id +object_cid = object_metadata.cid # Store metadata (.../[hashstore_path]/metadata/) # By default, storing metadata will use the given properties namespace `format_id` @@ -200,28 +200,28 @@ How to use HashStore client (command line app) $ python './src/hashstore/hashstoreclient.py' /path/to/store/ -chs -dp=3 -wp=2 -ap=SHA-256 -nsp="http://www.ns.test/v1" # Get the checksum of a data object -$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -getchecksum -pid=content_identifier -algo=SHA-256 +$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -getchecksum 
-pid=persistent_identifier -algo=SHA-256 # Find an object (returns the content identifier) -$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -findobject -pid=content_identifier +$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -findobject -pid=persistent_identifier # Store a data object -$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -storeobject -pid=content_identifier -path=/path/to/object +$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -storeobject -pid=persistent_identifier -path=/path/to/object # Store a metadata object -$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -storemetadata -pid=content_identifier -path=/path/to/metadata/object -formatid=http://ns.dataone.org/service/types/v2.0 +$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -storemetadata -pid=persistent_identifier -path=/path/to/metadata/object -formatid=http://ns.dataone.org/service/types/v2.0 # Retrieve a data object -$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -retrieveobject -pid=content_identifier +$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -retrieveobject -pid=persistent_identifier # Retrieve a metadata object -$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -retrievemetadata -pid=content_identifier -formatid=http://ns.dataone.org/service/types/v2.0 +$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -retrievemetadata -pid=persistent_identifier -formatid=http://ns.dataone.org/service/types/v2.0 # Delete a data object -$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -deleteobject -pid=content_identifier +$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -deleteobject -pid=persistent_identifier # Delete a metadata file -$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -deletemetadata -pid=content_identifier -formatid=http://ns.dataone.org/service/types/v2.0 +$ python './src/hashstore/hashstoreclient.py' 
/path/to/store/ -deletemetadata -pid=persistent_identifier -formatid=http://ns.dataone.org/service/types/v2.0 ``` ## License diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 96efdf20..76e09967 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -425,7 +425,7 @@ def store_object( object_metadata = self.store_data_only(data) logging.info( "FileHashStore - store_object: Successfully stored object for cid: %s", - object_metadata.id, + object_metadata.cid, ) else: # Else the object will be stored and tagged @@ -470,7 +470,7 @@ def store_object( checksum_algorithm=checksum_algorithm_checked, file_size_to_validate=expected_object_size, ) - self.tag_object(pid, object_metadata.id) + self.tag_object(pid, object_metadata.cid) logging.info( "FileHashStore - store_object: Successfully stored object for pid: %s", pid, @@ -509,7 +509,7 @@ def verify_object( else: logging.info( "FileHashStore - verify_object: Called to verify object with id: %s", - object_metadata.id, + object_metadata.cid, ) object_metadata_hex_digests = object_metadata.hex_digests object_metadata_file_size = object_metadata.obj_size @@ -526,7 +526,7 @@ def verify_object( ) logging.info( "FileHashStore - verify_object: object has been validated for cid: %s", - object_metadata.id, + object_metadata.cid, ) def tag_object(self, pid, cid): diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 37e228d8..4611e700 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -254,7 +254,7 @@ def get_hashstore(module_name, class_name, properties=None): ) -class ObjectMetadata(namedtuple("ObjectMetadata", ["id", "obj_size", "hex_digests"])): +class ObjectMetadata(namedtuple("ObjectMetadata", ["cid", "obj_size", "hex_digests"])): """Represents metadata associated with an object. 
The `ObjectMetadata` class represents metadata associated with an object, @@ -268,5 +268,5 @@ class ObjectMetadata(namedtuple("ObjectMetadata", ["id", "obj_size", "hex_digest """ # Default value to prevent dangerous default value - def __new__(cls, ab_id, obj_size, hex_digests=None): - return super(ObjectMetadata, cls).__new__(cls, ab_id, obj_size, hex_digests) + def __new__(cls, cid, obj_size, hex_digests=None): + return super(ObjectMetadata, cls).__new__(cls, cid, obj_size, hex_digests) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 4688ba87..1d549a6e 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -224,7 +224,7 @@ def test_store_and_validate_data_files_path(pids, store): for pid in pids.keys(): path = Path(test_dir) / pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) - object_metadata_id = object_metadata.id + object_metadata_id = object_metadata.cid assert store.exists(entity, object_metadata_id) @@ -235,7 +235,7 @@ def test_store_and_validate_data_files_string(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) - object_metadata_id = object_metadata.id + object_metadata_id = object_metadata.cid assert store.exists(entity, object_metadata_id) @@ -248,7 +248,7 @@ def test_store_and_validate_data_files_stream(pids, store): input_stream = io.open(path, "rb") object_metadata = store.store_and_validate_data(pid, input_stream) input_stream.close() - object_metadata_id = object_metadata.id + object_metadata_id = object_metadata.cid assert store.exists(entity, object_metadata_id) assert store.count(entity) == 3 @@ -259,7 +259,7 @@ def test_store_and_validate_data_cid(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) - object_metadata_id = object_metadata.id + object_metadata_id = object_metadata.cid assert 
object_metadata_id == pids[pid][store.algorithm] @@ -335,7 +335,7 @@ def test_store_data_only_cid(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_data_only(path) - object_metadata_id = object_metadata.id + object_metadata_id = object_metadata.cid assert object_metadata_id == pids[pid][store.algorithm] @@ -821,7 +821,7 @@ def test_exists_with_object_metadata_id(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) - assert store.exists(entity, object_metadata.id) + assert store.exists(entity, object_metadata.cid) def test_exists_with_sharded_path(pids, store): @@ -831,7 +831,7 @@ def test_exists_with_sharded_path(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) - object_metadata_shard = store.shard(object_metadata.id) + object_metadata_shard = store.shard(object_metadata.cid) object_metadata_shard_path = "/".join(object_metadata_shard) assert store.exists(entity, object_metadata_shard_path) @@ -864,7 +864,7 @@ def test_open_objects(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) - object_metadata_id = object_metadata.id + object_metadata_id = object_metadata.cid io_buffer = store.open(entity, object_metadata_id) assert isinstance(io_buffer, io.BufferedReader) io_buffer.close() @@ -877,7 +877,7 @@ def test_delete_by_object_metadata_id(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) - object_metadata_id = object_metadata.id + object_metadata_id = object_metadata.cid store.delete(entity, object_metadata_id) assert store.count(entity) == 0 @@ -928,7 +928,7 @@ def test_remove_empty_does_not_remove_nonempty_folders(pids, store): for pid in pids.keys(): path = test_dir + 
pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) - object_metadata_shard = store.shard(object_metadata.id) + object_metadata_shard = store.shard(object_metadata.cid) object_metadata_shard_path = "/".join(object_metadata_shard) # Get parent directory of the relative path parent_dir = os.path.dirname(object_metadata_shard_path) @@ -992,7 +992,7 @@ def test_get_real_path_with_object_id(store, pids): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) - obj_abs_path = store.resolve_path(entity, object_metadata.id) + obj_abs_path = store.resolve_path(entity, object_metadata.cid) assert os.path.exists(obj_abs_path) @@ -1003,7 +1003,7 @@ def test_get_real_path_with_object_id_sharded(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) - object_metadata_shard = store.shard(object_metadata.id) + object_metadata_shard = store.shard(object_metadata.cid) object_metadata_shard_path = "/".join(object_metadata_shard) obj_abs_path = store.resolve_path(entity, object_metadata_shard_path) assert os.path.exists(obj_abs_path) @@ -1030,7 +1030,7 @@ def test_get_real_path_with_bad_entity(store, pids): path = test_dir + pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) with pytest.raises(ValueError): - store.resolve_path(entity, object_metadata.id) + store.resolve_path(entity, object_metadata.cid) def test_build_path(store, pids): diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index f38f46d9..5cd8b54e 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -26,7 +26,7 @@ def test_store_address_length(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(pid, path) - object_cid = object_metadata.id + object_cid = 
object_metadata.cid assert len(object_cid) == 64 @@ -37,7 +37,7 @@ def test_store_object(pids, store): for pid in pids.keys(): path = Path(test_dir + pid.replace("/", "_")) object_metadata = store.store_object(pid, path) - assert object_metadata.id == pids[pid][store.algorithm] + assert object_metadata.cid == pids[pid][store.algorithm] assert store.count(entity) == 3 @@ -82,7 +82,7 @@ def test_store_object_id(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(pid, path) - assert object_metadata.id == pids[pid][store.algorithm] + assert object_metadata.cid == pids[pid][store.algorithm] def test_store_object_obj_size(pids, store): @@ -558,7 +558,7 @@ def test_store_object_large_file(store): # Store object pid = "testfile_filehashstore" object_metadata = store.store_object(pid, file_path) - object_metadata_id = object_metadata.id + object_metadata_id = object_metadata.cid assert object_metadata_id == object_metadata.hex_digests.get("sha256") @@ -577,7 +577,7 @@ def test_store_object_sparse_large_file(store): # Store object pid = "testfile_filehashstore" object_metadata = store.store_object(pid, file_path) - object_metadata_id = object_metadata.id + object_metadata_id = object_metadata.cid assert object_metadata_id == object_metadata.hex_digests.get("sha256") @@ -918,7 +918,7 @@ def test_delete_object_cid_refs_file(pids, store): syspath = Path(test_dir) / filename object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) - cid = object_metadata.id + cid = object_metadata.cid store.delete_object(pid) cid_refs_file_path = store.resolve_path("cid", cid) assert not os.path.exists(cid_refs_file_path) @@ -930,7 +930,7 @@ def test_delete_object_cid_refs_file_with_pid_refs_remaining(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(pid, path) - cid = object_metadata.id + cid = object_metadata.cid 
cid_refs_abs_path = store.resolve_path("cid", cid) # pylint: disable=W0212 store._update_cid_refs(cid_refs_abs_path, "dou.test.1") diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index ccd147fd..c2fe81ee 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -12,7 +12,7 @@ def test_tag_object(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) - object_tagged = store.tag_object(pid, object_metadata.id) + object_tagged = store.tag_object(pid, object_metadata.cid) assert object_tagged @@ -22,7 +22,7 @@ def test_tag_object_pid_refs_file(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) - store.tag_object(pid, object_metadata.id) + store.tag_object(pid, object_metadata.cid) pid_refs_file_path = store.resolve_path("pid", pid) assert os.path.exists(pid_refs_file_path) @@ -33,7 +33,7 @@ def test_tag_object_pid_refs_file_exists(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) - cid = object_metadata.id + cid = object_metadata.cid store.tag_object(pid, cid) pid_refs_file_path = store.resolve_path("pid", pid) assert os.path.exists(pid_refs_file_path) @@ -49,11 +49,11 @@ def test_tag_object_pid_refs_file_content(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) - store.tag_object(pid, object_metadata.id) + store.tag_object(pid, object_metadata.cid) pid_refs_file_path = store.resolve_path("pid", pid) with open(pid_refs_file_path, "r", encoding="utf8") as f: pid_refs_cid = f.read() - assert pid_refs_cid == object_metadata.id + assert pid_refs_cid == object_metadata.cid def test_tag_object_cid_refs_file(pids, store): @@ -62,8 +62,8 @@ def test_tag_object_cid_refs_file(pids, store): for pid 
in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) - cid = object_metadata.id - store.tag_object(pid, object_metadata.id) + cid = object_metadata.cid + store.tag_object(pid, object_metadata.cid) cid_refs_file_path = store.resolve_path("cid", cid) assert os.path.exists(cid_refs_file_path) @@ -74,8 +74,8 @@ def test_tag_object_cid_refs_file_content(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) - store.tag_object(pid, object_metadata.id) - cid_refs_file_path = store.resolve_path("cid", object_metadata.id) + store.tag_object(pid, object_metadata.cid) + cid_refs_file_path = store.resolve_path("cid", object_metadata.cid) with open(cid_refs_file_path, "r", encoding="utf8") as f: pid_refs_cid = f.read().strip() assert pid_refs_cid == pid @@ -88,7 +88,7 @@ def test_tag_object_cid_refs_file_exists(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) - store.tag_object(pid, object_metadata.id) + store.tag_object(pid, object_metadata.cid) another_cid = "dou.test.1" with pytest.raises(FileExistsError): store.tag_object(pid, another_cid) @@ -104,7 +104,7 @@ def test_tag_object_cid_refs_update_cid_refs_updated(store): path = test_dir + pid.replace("/", "_") # Store data only object_metadata = store.store_object(None, path) - cid = object_metadata.id + cid = object_metadata.cid # Tag object store.tag_object(pid, cid) # Tag the cid with another pid @@ -127,7 +127,7 @@ def test_tag_object_cid_refs_update_pid_refs_created(store): path = test_dir + pid.replace("/", "_") # Store data only object_metadata = store.store_object(None, path) - cid = object_metadata.id + cid = object_metadata.cid # Tag object store.tag_object(pid, cid) # Tag the cid with another pid @@ -145,8 +145,8 @@ def test_tag_object_cid_refs_update_pid_found_but_file_missing(store): pid = "jtao.1700.1" path = 
test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) - store.tag_object(pid, object_metadata.id) - cid = object_metadata.id + store.tag_object(pid, object_metadata.cid) + cid = object_metadata.cid # Manually update the cid refs, pid refs file missing at this point additional_pid = "dou.test.1" cid_ref_abs_path = store.resolve_path("cid", cid) @@ -204,7 +204,7 @@ def test_verify_object_exception_incorrect_size(pids, store): with pytest.raises(ValueError): store.verify_object(object_metadata, checksum, checksum_algorithm, 1000) - cid = object_metadata.id + cid = object_metadata.cid cid = object_metadata.hex_digests[store.algorithm] cid_abs_path = store.resolve_path("cid", cid) assert not os.path.exists(cid_abs_path) @@ -216,7 +216,7 @@ def test_verify_object_exception_incorrect_checksum(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(data=path) - cid = object_metadata.id + cid = object_metadata.cid store.tag_object(pid, cid) checksum_algorithm = store.algorithm expected_file_size = object_metadata.obj_size @@ -225,7 +225,7 @@ def test_verify_object_exception_incorrect_checksum(pids, store): object_metadata, "abc123", checksum_algorithm, expected_file_size ) - cid = object_metadata.id + cid = object_metadata.cid cid = object_metadata.hex_digests[store.algorithm] cid_abs_path = store.resolve_path("cid", cid) assert not os.path.exists(cid_abs_path) diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index e161c967..a2d42398 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -87,7 +87,7 @@ def test_objectmetadata(): "sha512": "sha512value", } object_metadata = ObjectMetadata(ab_id, obj_size, hex_digest_dict) - assert object_metadata.id == ab_id + assert object_metadata.cid == ab_id assert object_metadata.obj_size == obj_size assert object_metadata.hex_digests.get("md5") == hex_digest_dict["md5"] assert object_metadata.hex_digests.get("sha1") == 
hex_digest_dict["sha1"] diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 6eaf16d7..3aee347a 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -84,7 +84,7 @@ def test_find_object(capsys, store, pids): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(pid, path) - cid = object_metadata.id + cid = object_metadata.cid client_module_path = f"{client_directory}/client.py" test_store = store.root From c1d54037ee6da3f0643ad969b8686bce5fab0893 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 23 Jan 2024 15:03:44 -0800 Subject: [PATCH 134/420] Add new pytest for 'delete_metadata' to confirm exception is not thrown when called to delete metadata that does not exist --- tests/test_filehashstore_interface.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 5cd8b54e..238d4e80 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -964,10 +964,20 @@ def test_delete_metadata(pids, store): syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) - store.delete_metadata(pid, format_id) + is_deleted = store.delete_metadata(pid, format_id) + assert is_deleted assert store.count(entity) == 0 +def test_delete_metadata_does_not_exist(pids, store): + """Test delete_metadata does not throw exception when called to delete + metadata that does not exist.""" + format_id = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + is_deleted = store.delete_metadata(pid, format_id) + assert is_deleted + + def test_delete_metadata_default_format_id(store, pids): """Test delete_metadata deletes successfully with default format_id.""" test_dir = "tests/testdata/" From ff8c03ff155ba11297e371751fc2a0de4576fa37 Mon Sep 17 
00:00:00 2001 From: Dou Mok Date: Tue, 23 Jan 2024 15:08:37 -0800 Subject: [PATCH 135/420] Revise logging statement for accuracy in 'store_data_only()' --- src/hashstore/filehashstore.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 76e09967..d6dbe0ca 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -903,7 +903,7 @@ def store_data_only(self, data): size, and hex digest dictionary. """ logging.debug( - "FileHashStore - store_object: Request to store data object only." + "FileHashStore - store_data_only: Request to store data object only." ) try: @@ -924,14 +924,14 @@ def store_data_only(self, data): # The permanent address of the data stored is based on the data's checksum cid = hex_digest_dict.get(self.algorithm) logging.debug( - "FileHashStore - store_object: Successfully stored object with cid: %s", + "FileHashStore - store_data_only: Successfully stored object with cid: %s", cid, ) return object_metadata # pylint: disable=W0718 except Exception as err: exception_string = ( - "FileHashStore - store_object (store_data_only): failed to store object." + "FileHashStore - store_data_only: failed to store object." 
+ f" Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) From e7b746c25a656b30fb5355fd146026b6c0965698 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 23 Jan 2024 16:21:09 -0800 Subject: [PATCH 136/420] Add new method '_is_pid_in_cid_refs_file' and refactor '_verify_hashstore_references', 'find_object' and 'tag_object' --- src/hashstore/filehashstore.py | 73 ++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 30 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index d6dbe0ca..2aac209d 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -571,7 +571,8 @@ def tag_object(self, pid, cid): self.create_path(os.path.dirname(pid_ref_abs_path)) shutil.move(pid_tmp_file_path, pid_ref_abs_path) # Update cid ref files as it already exists - self._update_cid_refs(cid_ref_abs_path, pid) + if not self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path): + self._update_cid_refs(cid_ref_abs_path, pid) self._verify_hashstore_references(pid, cid, "update") logging.info( "FileHashStore - tag_object: Successfully updated cid: %s with pid: %s", @@ -616,27 +617,39 @@ def find_object(self, pid): self._check_string(pid, "pid", "find_object") pid_ref_abs_path = self.resolve_path("pid", pid) - if not os.path.exists(pid_ref_abs_path): - err_msg = ( - f"FileHashStore - find_object: pid ({pid}) reference file not found: " - + pid_ref_abs_path - ) - raise FileNotFoundError(err_msg) - else: + if os.path.exists(pid_ref_abs_path): # Read the file to get the cid from the pid reference with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: pid_refs_cid = pid_ref_file.read() + # Confirm that the cid reference file exists cid_ref_abs_path = self.resolve_path("cid", pid_refs_cid) - if not os.path.exists(cid_ref_abs_path): + if os.path.exists(cid_ref_abs_path): + # Check that the pid is actually found in the cid reference file + if self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path): + 
return pid_refs_cid + else: + # If not, it is an orphan pid refs file + err_msg = ( + "FileHashStore - find_object: pid refs file exists with cid: " + + pid_refs_cid + + f", but is missing from cid refs file: {cid_ref_abs_path}" + ) + logging.error(err_msg) + raise ValueError(err_msg) + else: err_msg = ( f"FileHashStore - find_object: pid refs file exists with cid: {pid_refs_cid}" + f", but cid refs file not found: {cid_ref_abs_path}" ) logging.error(err_msg) raise FileNotFoundError(err_msg) - else: - return pid_refs_cid + else: + err_msg = ( + f"FileHashStore - find_object: pid refs file not found for pid ({pid}): " + + pid_ref_abs_path + ) + raise FileNotFoundError(err_msg) def store_metadata(self, pid, metadata, format_id=None): logging.debug( @@ -1233,6 +1246,23 @@ def _write_cid_refs_file(self, path, pid): logging.error(exception_string) raise err + def _is_pid_in_cid_refs_file(self, pid, cid_ref_abs_path): + """Check a cid reference file for a pid. + + :param str pid: Authority-based or persistent identifier of the object. + :param str cid_ref_abs_path: Path to the cid refs file + + :return: pid_found + :rtype: boolean + """ + with open(cid_ref_abs_path, "r", encoding="utf8") as cid_ref_file: + # Confirm that pid is not currently already tagged + for line in cid_ref_file: + value = line.strip() + if pid == value: + return True + return False + def _update_cid_refs(self, cid_ref_abs_path, pid): """Update an existing CID reference file with the given PID. 
@@ -1253,18 +1283,7 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): raise FileNotFoundError(exception_string) try: - with open(cid_ref_abs_path, "a+", encoding="utf8") as cid_ref_file: - # Confirm that pid is not currently already tagged - for line in cid_ref_file: - value = line.strip() - if pid == value: - warning_msg = ( - f"FileHashStore - update_cid_refs: pid ({pid}) already reference in" - + f" cid reference file: {cid_ref_abs_path} " - ) - logging.warning(warning_msg) - # Exit try statement, we do not want to write the pid - return + with open(cid_ref_abs_path, "a", encoding="utf8") as cid_ref_file: # Lock file for the shortest amount of time possible file_descriptor = cid_ref_file.fileno() fcntl.flock(file_descriptor, fcntl.LOCK_EX) @@ -1574,13 +1593,7 @@ def _verify_hashstore_references(self, pid, cid, verify_type): logging.error(exception_string) raise ValueError(exception_string) # Then the pid - pid_found = False - with open(cid_ref_abs_path, "r", encoding="utf8") as cid_ref_file: - for _, line in enumerate(cid_ref_file, start=1): - value = line.strip() - if value == pid: - pid_found = True - break + pid_found = self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path) if not pid_found: exception_string = ( "FileHashStore - _verify_hashstore_references: Cid refs file exists" From 8f6f879b1bc618343e6cc429ea3da06927cbb28b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 25 Jan 2024 09:16:28 -0800 Subject: [PATCH 137/420] Refactor 'verify_object' to return boolean values and update docstrings & pytests --- src/hashstore/filehashstore.py | 38 ++++++++++++++++---------- src/hashstore/hashstore.py | 8 +++--- tests/test_filehashstore_references.py | 16 +++++++---- 3 files changed, 38 insertions(+), 24 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 2aac209d..52a0286f 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -514,20 +514,30 @@ def verify_object( 
object_metadata_hex_digests = object_metadata.hex_digests object_metadata_file_size = object_metadata.obj_size checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) - self._verify_object_information( - pid=None, - checksum=checksum, - checksum_algorithm=checksum_algorithm_checked, - entity="objects", - hex_digests=object_metadata_hex_digests, - tmp_file_name=None, - tmp_file_size=object_metadata_file_size, - file_size_to_validate=expected_file_size, - ) - logging.info( - "FileHashStore - verify_object: object has been validated for cid: %s", - object_metadata.cid, - ) + + try: + self._verify_object_information( + pid=None, + checksum=checksum, + checksum_algorithm=checksum_algorithm_checked, + entity="objects", + hex_digests=object_metadata_hex_digests, + tmp_file_name=None, + tmp_file_size=object_metadata_file_size, + file_size_to_validate=expected_file_size, + ) + logging.info( + "FileHashStore - verify_object: object has been validated for cid: %s", + object_metadata.cid, + ) + return True + # pylint: disable=W0718 + except Exception as err: + exception_string = ( + f"FileHashStore - verify_object: object not valid: {err}." + ) + logging.info(exception_string) + return False def tag_object(self, pid, cid): logging.debug( diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 4611e700..c5019825 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -95,7 +95,7 @@ def verify_object( :param str checksum_algorithm: Algorithm of the checksum. :param int expected_file_size: Size of the temporary file. - :return: None + :return: bool - `True` if valid """ raise NotImplementedError() @@ -258,10 +258,10 @@ class ObjectMetadata(namedtuple("ObjectMetadata", ["cid", "obj_size", "hex_diges """Represents metadata associated with an object. 
The `ObjectMetadata` class represents metadata associated with an object, - including a unique identifier (`id`), the size of the object in bytes (`obj_size`), - and an optional list of hex digests (`hex_digests`) to validate objects. + including a content identifier (`cid`), the size of the object in bytes (`obj_size`), + and an optional list of hex digests (`hex_digests`) to assist with validating objects. - :param str id: A unique identifier for the object (Hash ID, hex digest). + :param str cid: A unique identifier for the object (Hash ID, hex digest). :param bytes obj_size: The size of the object in bytes. :param list hex_digests: A list of hex digests to validate objects (md5, sha1, sha256, sha384, sha512) (optional). diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index c2fe81ee..5ec56e59 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -201,8 +201,11 @@ def test_verify_object_exception_incorrect_size(pids, store): object_metadata = store.store_object(data=path) checksum = object_metadata.hex_digests.get(store.algorithm) checksum_algorithm = store.algorithm - with pytest.raises(ValueError): - store.verify_object(object_metadata, checksum, checksum_algorithm, 1000) + + is_valid = store.verify_object( + object_metadata, checksum, checksum_algorithm, 1000 + ) + assert not is_valid cid = object_metadata.cid cid = object_metadata.hex_digests[store.algorithm] @@ -220,10 +223,11 @@ def test_verify_object_exception_incorrect_checksum(pids, store): store.tag_object(pid, cid) checksum_algorithm = store.algorithm expected_file_size = object_metadata.obj_size - with pytest.raises(ValueError): - store.verify_object( - object_metadata, "abc123", checksum_algorithm, expected_file_size - ) + + is_valid = store.verify_object( + object_metadata, "abc123", checksum_algorithm, expected_file_size + ) + assert not is_valid cid = object_metadata.cid cid = 
object_metadata.hex_digests[store.algorithm] From 4830d0df1227643e1f3efe031a7eac7de5fe354d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 25 Jan 2024 09:35:31 -0800 Subject: [PATCH 138/420] Refactor 'delete_object' to handle exceptions raised from calling 'find_object' --- src/hashstore/filehashstore.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 52a0286f..597ebf05 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -765,8 +765,25 @@ def delete_object(self, pid): "FileHashStore - delete_object: Request to delete object for pid: %s", pid ) self._check_string(pid, "pid", "delete_object") - cid = self.find_object(pid) + try: + cid = self.find_object(pid) + except FileNotFoundError as fnfe: + if "pid refs file not found" in fnfe: + # Nothing to delete + return + if "cid refs file not found" in fnfe: + # Delete pid refs file + pid_ref_abs_path = self.resolve_path("pid", pid) + self.delete("pid", pid_ref_abs_path) + return + except ValueError as ve: + if "is missing from cid refs file" in ve: + # Delete pid refs file + pid_ref_abs_path = self.resolve_path("pid", pid) + self.delete("pid", pid_ref_abs_path) + return + # Proceed with next steps - cid has been retrieved without any errors while cid in self.reference_locked_cids: logging.debug( "FileHashStore - delete_object: (cid) %s is currently locked. 
Waiting", @@ -783,9 +800,10 @@ def delete_object(self, pid): try: cid_ref_abs_path = self.resolve_path("cid", cid) pid_ref_abs_path = self.resolve_path("pid", pid) + # First delete the pid refs file immediately + self._delete_pid_refs_file(pid_ref_abs_path) # Remove pid from cid reference file self._delete_cid_refs_pid(cid_ref_abs_path, pid) - self._delete_pid_refs_file(pid_ref_abs_path) # Delete cid reference file and object only if the cid refs file is empty if os.path.getsize(cid_ref_abs_path) == 0: self.delete("cid", cid_ref_abs_path) From f7cffb88ab6f60cc7adfc090f23e95c8438a1e0e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 25 Jan 2024 10:03:32 -0800 Subject: [PATCH 139/420] Revise 'find_object' to also check that cid retrieved exists before returning the cid found, and update 'delete_object' --- README.md | 8 ++++---- src/hashstore/filehashstore.py | 18 +++++++++++++++++- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 19056ee7..f7ad6fb4 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ Documentation is a work in progress, and can be found on the [Metacat repository ## HashStore Overview -HashStore is a content-addressable file management system that utilizes the content identifier of an object to address files. The system stores both objects, references (refs) and metadata in its respective directories and provides an API for interacting with the store. HashStore storage classes (like `FileHashStore`) must implement the HashStore interface to ensure the expected usage of HashStore. +HashStore is a content-addressable file management system that utilizes the content identifier of an object to address files. The system stores objects, references (refs) and metadata in its respective directories and provides an API for interacting with the store. HashStore storage classes (like `FileHashStore`) must implement the HashStore interface to ensure the expected usage of HashStore. 
###### Public API Methods - store_object @@ -89,7 +89,7 @@ metadata_cid = my_store.store_metadata(pid, metadata, format_id) In HashStore, objects are first saved as temporary files while their content identifiers are calculated. Once the default hash algorithm list and their hashes are generated, objects are stored in their permanent location using the store's algorithm's corresponding hash value, the store depth and the store width. Lastly, reference files are created for the object so that they can be found and retrieved given an identifier (ex. persistent identifier (pid)). Note: Objects are also stored once and only once. -By calling the various interface methods for `store_object`, the calling app/client can validate, store and tag an object simultaneously if the relevant data is available. In the absence of an identfiier (ex. persistent identifier (pid)), `store_object` can be called to solely store an object. The client is then expected to call `verify_object` when the relevant metadata is available to confirm that the object has been stored as expected. If the object is determined to be invalid (via `verify_object`), it will be deleted. Lastly, to finalize this process of storing an object (to make the object discoverable), the client calls `tag_object`. In summary, there are two expected paths to store an object: +By calling the various interface methods for `store_object`, the calling app/client can validate, store and tag an object simultaneously if the relevant data is available. In the absence of an identifier (ex. persistent identifier (pid)), `store_object` can be called to solely store an object. The client is then expected to call `verify_object` when the relevant metadata is available to confirm that the object has been stored as expected. If the object is determined to be invalid (via `verify_object`), it will be deleted. Lastly, to finalize this process of storing an object (to make the object discoverable), the client calls `tag_object`. 
In summary, there are two expected paths to store an object: ```py # All-in-one process which stores, validates and tags an object @@ -108,8 +108,8 @@ tag_object(pid, cid) - To retrieve an object, call the Public API method `retrieve_object` which opens a stream to the object if it exists. **How do I find an object or check that it exists if I have the pid?** -- To find the location of the object, call the Public API method `find_object` which will return the content identifier (cid) of the object. -- This cid can then be used to locate the object on disk by following HashStore's store configuration. +- To check if an object exists, call the Public API method `find_object` which will return the content identifier (cid) of the object if it exists. +- If desired, this cid can then be used to locate the object on disk by following HashStore's store configuration. **How do I delete an object if I have the pid?** - To delete an object, call the Public API method `delete_object` which will delete the object and its associated references and reference files where relevant. 
diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 597ebf05..65a0d0b9 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -637,7 +637,16 @@ def find_object(self, pid): if os.path.exists(cid_ref_abs_path): # Check that the pid is actually found in the cid reference file if self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path): - return pid_refs_cid + # Object must also exist in order to return the cid retrieved + if not self.exists("objects", pid_refs_cid): + err_msg = ( + f"FileHashStore - find_object: Refs file found for pid ({pid}) at" + + pid_ref_abs_path + + f", but object referenced does not exist, cid: {pid_refs_cid}" + ) + raise FileNotFoundError(err_msg) + else: + return pid_refs_cid else: # If not, it is an orphan pid refs file err_msg = ( @@ -776,6 +785,11 @@ def delete_object(self, pid): pid_ref_abs_path = self.resolve_path("pid", pid) self.delete("pid", pid_ref_abs_path) return + if "object referenced does not exist" in fnfe: + # Delete pid refs file + pid_ref_abs_path = self.resolve_path("pid", pid) + self.delete("pid", pid_ref_abs_path) + return except ValueError as ve: if "is missing from cid refs file" in ve: # Delete pid refs file @@ -1974,9 +1988,11 @@ def resolve_path(self, entity, file): # Check for sharded path. 
if entity == "cid": + # Note, we skip checking whether the file exists for refs ref_file_abs_path = self.build_path(entity, file) return ref_file_abs_path elif entity == "pid": + # Note, we skip checking whether the file exists for refs hash_id = self.computehash(file, self.algorithm) ref_file_abs_path = self.build_path(entity, hash_id) return ref_file_abs_path From e534465f0ff127c426fbf3d348dba082aaa88fd6 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 25 Jan 2024 10:16:37 -0800 Subject: [PATCH 140/420] Refactor '.replace' calls to use '.strip' instead where relevant --- src/hashstore/filehashstore.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 65a0d0b9..32cb2e75 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1667,7 +1667,7 @@ def _check_arg_data(self, data): logging.error(exception_string) raise TypeError(exception_string) if isinstance(data, str): - if data.replace(" ", "") == "": + if data.strip() == "": exception_string = ( "FileHashStore - _validate_arg_data: Data string cannot be empty." ) @@ -1724,7 +1724,7 @@ def _check_arg_format_id(self, format_id, method): :rtype: str """ checked_format_id = None - if format_id is not None and format_id.replace(" ", "") == "": + if format_id and not format_id.strip(): exception_string = f"FileHashStore - {method}: Format_id cannot be empty." 
logging.error(exception_string) raise ValueError(exception_string) From 2025fc9ef63fa32d28032a066e77e8a04b00fee6 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 25 Jan 2024 10:23:24 -0800 Subject: [PATCH 141/420] Update README.md --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index f7ad6fb4..b864c157 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ HashStore is a content-addressable file management system that utilizes the cont - delete_metadata - get_hex_digest -For details, please see the HashStore interface (HashStore.java) +For details, please see the HashStore interface (hashstore.py) ###### How do I create a HashStore? @@ -89,17 +89,17 @@ metadata_cid = my_store.store_metadata(pid, metadata, format_id) In HashStore, objects are first saved as temporary files while their content identifiers are calculated. Once the default hash algorithm list and their hashes are generated, objects are stored in their permanent location using the store's algorithm's corresponding hash value, the store depth and the store width. Lastly, reference files are created for the object so that they can be found and retrieved given an identifier (ex. persistent identifier (pid)). Note: Objects are also stored once and only once. -By calling the various interface methods for `store_object`, the calling app/client can validate, store and tag an object simultaneously if the relevant data is available. In the absence of an identifier (ex. persistent identifier (pid)), `store_object` can be called to solely store an object. The client is then expected to call `verify_object` when the relevant metadata is available to confirm that the object has been stored as expected. If the object is determined to be invalid (via `verify_object`), it will be deleted. Lastly, to finalize this process of storing an object (to make the object discoverable), the client calls `tag_object`. 
In summary, there are two expected paths to store an object: +By calling the various interface methods for `store_object`, the calling app/client can validate, store and tag an object simultaneously if the relevant data is available. In the absence of an identifier (ex. persistent identifier (pid)), `store_object` can be called to solely store an object. The client is then expected to call `verify_object` when the relevant metadata is available to confirm that the object has been stored as expected. If the object is determined to be invalid (via `verify_object`), the client is expected to delete the object directly. Lastly, to finalize this process of storing an object (to make the object discoverable), the client calls `tag_object`. In summary, there are two expected paths to store an object: ```py # All-in-one process which stores, validates and tags an object -objectMetadata objInfo = store_object(InputStream, pid, additionalAlgorithm, checksum, checksumAlgorithm, objSize) +objectMetadata objInfo = store_object(stream, pid, additional_algo, checksum, checksum_algo, objSize) # Manual Process # Store object -obj_metadata = store_object(InputStream) +obj_metadata = store_object(stream) # Validate object, throws exceptions if there is a mismatch and deletes the associated file -verify_object(objInfo, checksum, checksumAlgorithn, objSize) +verify_object(obj_metadata, checksum, checksum_algo, objSize) # Tag object, makes the object discoverable (find, retrieve, delete) tag_object(pid, cid) ``` @@ -152,7 +152,7 @@ These reference files are implemented in HashStore underneath the hood with no e ###### What does HashStore look like? -``` +```shell
# This uses a store depth of 3, with a width of 2 and "SHA-256" as its default store algorithm ## Notes: From 63ed8d44635f5cd00918484d5719805c9e4d407d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 25 Jan 2024 10:43:09 -0800 Subject: [PATCH 142/420] Add underscore to all non-Public API methods to reduce confusion and update pytests --- src/hashstore/filehashstore.py | 196 ++++++++++++------------- tests/test_filehashstore.py | 163 ++++++++++---------- tests/test_filehashstore_interface.py | 79 +++++----- tests/test_filehashstore_references.py | 92 ++++++------ tests/test_hashstore_client.py | 8 +- 5 files changed, 270 insertions(+), 268 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 32cb2e75..1a582f3f 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -97,7 +97,7 @@ def __init__(self, properties=None): "FileHashStore - HashStore does not exist & configuration file not found." + " Writing configuration file." ) - self.write_properties(properties) + self._write_properties(properties) # Default algorithm list for FileHashStore based on config file written self._set_default_algorithms() # Complete initialization/instantiation by setting and creating store directories @@ -105,13 +105,13 @@ def __init__(self, properties=None): self.metadata = self.root + "/metadata" self.refs = self.root + "/refs" if not os.path.exists(self.objects): - self.create_path(self.objects + "/tmp") + self._create_path(self.objects + "/tmp") if not os.path.exists(self.metadata): - self.create_path(self.metadata + "/tmp") + self._create_path(self.metadata + "/tmp") if not os.path.exists(self.refs): - self.create_path(self.refs + "/tmp") - self.create_path(self.refs + "/pid") - self.create_path(self.refs + "/cid") + self._create_path(self.refs + "/tmp") + self._create_path(self.refs + "/pid") + self._create_path(self.refs + "/cid") logging.debug( "FileHashStore - Initialization success. 
Store root: %s", self.root ) @@ -126,7 +126,7 @@ def __init__(self, properties=None): # Configuration and Related Methods - def load_properties(self): + def _load_properties(self): """Get and return the contents of the current HashStore configuration. :return: HashStore properties with the following keys (and values): @@ -160,7 +160,7 @@ def load_properties(self): ) return hashstore_yaml_dict - def write_properties(self, properties): + def _write_properties(self, properties): """Writes 'hashstore.yaml' to FileHashStore's root directory with the respective properties object supplied. @@ -212,7 +212,7 @@ def write_properties(self, properties): # If given store path doesn't exist yet, create it. if not os.path.exists(self.root): - self.create_path(self.root) + self._create_path(self.root) # .yaml file to write hashstore_configuration_yaml = self._build_hashstore_yaml_string( @@ -303,7 +303,7 @@ def _verify_hashstore_properties(self, properties, prop_store_path): self.hashstore_configuration_yaml, ) # If 'hashstore.yaml' is found, verify given properties before init - hashstore_yaml_dict = self.load_properties() + hashstore_yaml_dict = self._load_properties() for key in self.property_required_keys: # 'store_path' is required to init HashStore but not saved in `hashstore.yaml` if key != "store_path": @@ -422,7 +422,7 @@ def store_object( if pid is None and self._check_arg_data(data): # If no pid is supplied, store the object only without tagging logging.debug("FileHashStore - store_object: Request to store data only.") - object_metadata = self.store_data_only(data) + object_metadata = self._store_data_only(data) logging.info( "FileHashStore - store_object: Successfully stored object for cid: %s", object_metadata.cid, @@ -462,7 +462,7 @@ def store_object( "FileHashStore - store_object: Attempting to store object for pid: %s", pid, ) - object_metadata = self.store_and_validate_data( + object_metadata = self._store_and_validate_data( pid, data, 
additional_algorithm=additional_algorithm_checked, @@ -513,7 +513,7 @@ def verify_object( ) object_metadata_hex_digests = object_metadata.hex_digests object_metadata_file_size = object_metadata.obj_size - checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) + checksum_algorithm_checked = self._clean_algorithm(checksum_algorithm) try: self._verify_object_information( @@ -562,9 +562,9 @@ def tag_object(self, pid, cid): ) self.reference_locked_cids.append(cid) try: - pid_ref_abs_path = self.resolve_path("pid", pid) - cid_ref_abs_path = self.resolve_path("cid", cid) - tmp_root_path = self.get_store_path("refs") / "tmp" + pid_ref_abs_path = self._resolve_path("pid", pid) + cid_ref_abs_path = self._resolve_path("cid", cid) + tmp_root_path = self._get_store_path("refs") / "tmp" # Proceed to tagging process if os.path.exists(pid_ref_abs_path): @@ -578,7 +578,7 @@ def tag_object(self, pid, cid): elif os.path.exists(cid_ref_abs_path): # Create the pid refs file pid_tmp_file_path = self._write_pid_refs_file(tmp_root_path, cid) - self.create_path(os.path.dirname(pid_ref_abs_path)) + self._create_path(os.path.dirname(pid_ref_abs_path)) shutil.move(pid_tmp_file_path, pid_ref_abs_path) # Update cid ref files as it already exists if not self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path): @@ -596,8 +596,8 @@ def tag_object(self, pid, cid): pid_tmp_file_path = self._write_pid_refs_file(tmp_root_path, cid) cid_tmp_file_path = self._write_cid_refs_file(tmp_root_path, pid) # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' - self.create_path(os.path.dirname(pid_ref_abs_path)) - self.create_path(os.path.dirname(cid_ref_abs_path)) + self._create_path(os.path.dirname(pid_ref_abs_path)) + self._create_path(os.path.dirname(cid_ref_abs_path)) # Move both files shutil.move(pid_tmp_file_path, pid_ref_abs_path) shutil.move(cid_tmp_file_path, cid_ref_abs_path) @@ -626,19 +626,19 @@ def find_object(self, pid): ) self._check_string(pid, "pid", 
"find_object") - pid_ref_abs_path = self.resolve_path("pid", pid) + pid_ref_abs_path = self._resolve_path("pid", pid) if os.path.exists(pid_ref_abs_path): # Read the file to get the cid from the pid reference with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: pid_refs_cid = pid_ref_file.read() # Confirm that the cid reference file exists - cid_ref_abs_path = self.resolve_path("cid", pid_refs_cid) + cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) if os.path.exists(cid_ref_abs_path): # Check that the pid is actually found in the cid reference file if self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path): # Object must also exist in order to return the cid retrieved - if not self.exists("objects", pid_refs_cid): + if not self._exists("objects", pid_refs_cid): err_msg = ( f"FileHashStore - find_object: Refs file found for pid ({pid}) at" + pid_ref_abs_path @@ -700,7 +700,7 @@ def store_metadata(self, pid, metadata, format_id=None): "FileHashStore - store_metadata: Attempting to store metadata for pid: %s", pid, ) - metadata_cid = self.put_metadata(metadata, pid, checked_format_id) + metadata_cid = self._put_metadata(metadata, pid, checked_format_id) logging.info( "FileHashStore - store_metadata: Successfully stored metadata for pid: %s", @@ -731,7 +731,7 @@ def retrieve_object(self, pid): "FileHashStore - retrieve_object: Metadata exists for pid: %s, retrieving object.", pid, ) - obj_stream = self.open(entity, object_cid) + obj_stream = self._open(entity, object_cid) else: exception_string = ( f"FileHashStore - retrieve_object: No object found for pid: {pid}" @@ -753,10 +753,10 @@ def retrieve_metadata(self, pid, format_id=None): checked_format_id = self._check_arg_format_id(format_id, "retrieve_metadata") entity = "metadata" - metadata_cid = self.computehash(pid + checked_format_id) - metadata_exists = self.exists(entity, metadata_cid) + metadata_cid = self._computehash(pid + checked_format_id) + metadata_exists = self._exists(entity, 
metadata_cid) if metadata_exists: - metadata_stream = self.open(entity, metadata_cid) + metadata_stream = self._open(entity, metadata_cid) else: exception_string = ( f"FileHashStore - retrieve_metadata: No metadata found for pid: {pid}" @@ -782,19 +782,19 @@ def delete_object(self, pid): return if "cid refs file not found" in fnfe: # Delete pid refs file - pid_ref_abs_path = self.resolve_path("pid", pid) - self.delete("pid", pid_ref_abs_path) + pid_ref_abs_path = self._resolve_path("pid", pid) + self._delete("pid", pid_ref_abs_path) return if "object referenced does not exist" in fnfe: # Delete pid refs file - pid_ref_abs_path = self.resolve_path("pid", pid) - self.delete("pid", pid_ref_abs_path) + pid_ref_abs_path = self._resolve_path("pid", pid) + self._delete("pid", pid_ref_abs_path) return except ValueError as ve: if "is missing from cid refs file" in ve: # Delete pid refs file - pid_ref_abs_path = self.resolve_path("pid", pid) - self.delete("pid", pid_ref_abs_path) + pid_ref_abs_path = self._resolve_path("pid", pid) + self._delete("pid", pid_ref_abs_path) return # Proceed with next steps - cid has been retrieved without any errors @@ -812,16 +812,16 @@ def delete_object(self, pid): ) self.reference_locked_cids.append(cid) try: - cid_ref_abs_path = self.resolve_path("cid", cid) - pid_ref_abs_path = self.resolve_path("pid", pid) + cid_ref_abs_path = self._resolve_path("cid", cid) + pid_ref_abs_path = self._resolve_path("pid", pid) # First delete the pid refs file immediately self._delete_pid_refs_file(pid_ref_abs_path) # Remove pid from cid reference file self._delete_cid_refs_pid(cid_ref_abs_path, pid) # Delete cid reference file and object only if the cid refs file is empty if os.path.getsize(cid_ref_abs_path) == 0: - self.delete("cid", cid_ref_abs_path) - self.delete("objects", cid) + self._delete("cid", cid_ref_abs_path) + self._delete("objects", cid) info_string = ( "FileHashStore - delete_object: Successfully deleted references and" + f" object associated 
with pid: {pid}" @@ -853,8 +853,8 @@ def delete_metadata(self, pid, format_id=None): checked_format_id = self._check_arg_format_id(format_id, "delete_metadata") entity = "metadata" - metadata_cid = self.computehash(pid + checked_format_id) - self.delete(entity, metadata_cid) + metadata_cid = self._computehash(pid + checked_format_id) + self._delete(entity, metadata_cid) logging.info( "FileHashStore - delete_metadata: Successfully deleted metadata for pid: %s", @@ -871,16 +871,16 @@ def get_hex_digest(self, pid, algorithm): self._check_string(algorithm, "algorithm", "get_hex_digest") entity = "objects" - algorithm = self.clean_algorithm(algorithm) + algorithm = self._clean_algorithm(algorithm) object_cid = self.find_object(pid) - if not self.exists(entity, object_cid): + if not self._exists(entity, object_cid): exception_string = ( f"FileHashStore - get_hex_digest: No object found for pid: {pid}" ) logging.error(exception_string) raise ValueError(exception_string) - cid_stream = self.open(entity, object_cid) - hex_digest = self.computehash(cid_stream, algorithm=algorithm) + cid_stream = self._open(entity, object_cid) + hex_digest = self._computehash(cid_stream, algorithm=algorithm) info_string = ( f"FileHashStore - get_hex_digest: Successfully calculated hex digest for pid: {pid}." @@ -891,7 +891,7 @@ def get_hex_digest(self, pid, algorithm): # FileHashStore Core Methods - def store_and_validate_data( + def _store_and_validate_data( self, pid, file, @@ -944,7 +944,7 @@ def store_and_validate_data( ) return object_metadata - def store_data_only(self, data): + def _store_data_only(self, data): """Store an object to HashStore and return the ID and a hex digest dictionary of the default algorithms. This method does not validate the object and writes directly to `/objects` after the hex digests are calculated. @@ -958,7 +958,7 @@ def store_data_only(self, data): size, and hex digest dictionary. 
""" logging.debug( - "FileHashStore - store_data_only: Request to store data object only." + "FileHashStore - _store_data_only: Request to store data object only." ) try: @@ -979,14 +979,14 @@ def store_data_only(self, data): # The permanent address of the data stored is based on the data's checksum cid = hex_digest_dict.get(self.algorithm) logging.debug( - "FileHashStore - store_data_only: Successfully stored object with cid: %s", + "FileHashStore - _store_data_only: Successfully stored object with cid: %s", cid, ) return object_metadata # pylint: disable=W0718 except Exception as err: exception_string = ( - "FileHashStore - store_data_only: failed to store object." + "FileHashStore - _store_data_only: failed to store object." + f" Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) @@ -1043,7 +1043,7 @@ def _move_and_get_checksums( # Objects are stored with their content identifier based on the store algorithm entity = "objects" object_cid = hex_digests.get(self.algorithm) - abs_file_path = self.build_path(entity, object_cid, extension) + abs_file_path = self._build_path(entity, object_cid, extension) # Only move file if it doesn't exist. We do not check before we create the tmp # file and calculate the hex digests because the given checksum could be incorrect. 
@@ -1059,7 +1059,7 @@ def _move_and_get_checksums( tmp_file_size, file_size_to_validate, ) - self.create_path(os.path.dirname(abs_file_path)) + self._create_path(os.path.dirname(abs_file_path)) try: debug_msg = ( "FileHashStore - _move_and_get_checksums: Moving temp file to permanent" @@ -1096,12 +1096,12 @@ def _move_and_get_checksums( + f" found but with incomplete state, deleting file: {abs_file_path}", ) logging.debug(debug_msg) - self.delete(entity, abs_file_path) + self._delete(entity, abs_file_path) logging.debug( "FileHashStore - _move_and_get_checksums: Deleting temporary file: %s", tmp_file_name, ) - self.delete(entity, tmp_file_name) + self._delete(entity, tmp_file_name) err_msg = ( "Aborting store_object upload - an unexpected error has occurred when moving" + f" file to: {object_cid} - Error: {err}" @@ -1131,7 +1131,7 @@ def _move_and_get_checksums( raise FileExistsError from ge finally: # Delete the temporary file, it already exists so it is redundant - self.delete(entity, tmp_file_name) + self._delete(entity, tmp_file_name) return object_cid, tmp_file_size, hex_digests @@ -1155,7 +1155,7 @@ def _write_to_tmp_file_and_get_hex_digests( algorithm_list_to_calculate = self._refine_algorithm_list( additional_algorithm, checksum_algorithm ) - tmp_root_path = self.get_store_path("objects") / "tmp" + tmp_root_path = self._get_store_path("objects") / "tmp" tmp = self._mktmpfile(tmp_root_path) logging.debug( @@ -1235,7 +1235,7 @@ def _mktmpfile(self, path): """ # Physically create directory if it doesn't exist if os.path.exists(path) is False: - self.create_path(path) + self._create_path(path) tmp = NamedTemporaryFile(dir=path, delete=False) @@ -1425,7 +1425,7 @@ def _delete_pid_refs_file(self, pid_ref_abs_path): ) raise FileNotFoundError(err_msg) else: - self.delete("pid", pid_ref_abs_path) + self._delete("pid", pid_ref_abs_path) except Exception as err: exception_string = ( @@ -1435,7 +1435,7 @@ def _delete_pid_refs_file(self, pid_ref_abs_path): 
logging.error(exception_string) raise err - def put_metadata(self, metadata, pid, format_id): + def _put_metadata(self, metadata, pid, format_id): """Store contents of metadata to `[self.root]/metadata` using the hash of the given PID and format ID as the permanent address. @@ -1447,7 +1447,7 @@ def put_metadata(self, metadata, pid, format_id): :rtype: str """ logging.debug( - "FileHashStore - put_metadata: Request to put metadata for pid: %s", pid + "FileHashStore - _put_metadata: Request to put metadata for pid: %s", pid ) # Create metadata tmp file and write to it metadata_stream = Stream(metadata) @@ -1455,9 +1455,9 @@ def put_metadata(self, metadata, pid, format_id): metadata_tmp = self._mktmpmetadata(metadata_stream) # Get target and related paths (permanent location) - metadata_cid = self.computehash(pid + format_id) - rel_path = "/".join(self.shard(metadata_cid)) - full_path = self.get_store_path("metadata") / rel_path + metadata_cid = self._computehash(pid + format_id) + rel_path = "/".join(self._shard(metadata_cid)) + full_path = self._get_store_path("metadata") / rel_path # Move metadata to target path if os.path.exists(metadata_tmp): @@ -1467,26 +1467,26 @@ def put_metadata(self, metadata, pid, format_id): # Metadata will be replaced if it exists shutil.move(metadata_tmp, full_path) logging.debug( - "FileHashStore - put_metadata: Successfully put metadata for pid: %s", + "FileHashStore - _put_metadata: Successfully put metadata for pid: %s", pid, ) return metadata_cid except Exception as err: exception_string = ( - f"FileHashStore - put_metadata: Unexpected {err=}, {type(err)=}" + f"FileHashStore - _put_metadata: Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) if os.path.exists(metadata_tmp): # Remove tmp metadata, calling app must re-upload logging.debug( - "FileHashStore - put_metadata: Deleting metadata for pid: %s", + "FileHashStore - _put_metadata: Deleting metadata for pid: %s", pid, ) self.metadata.delete(metadata_tmp) raise 
else: exception_string = ( - f"FileHashStore - put_metadata: Attempt to move metadata for pid: {pid}" + f"FileHashStore - _put_metadata: Attempt to move metadata for pid: {pid}" + f", but metadata temp file not found: {metadata_tmp}" ) logging.error(exception_string) @@ -1501,7 +1501,7 @@ def _mktmpmetadata(self, stream): :rtype: str """ # Create temporary file in .../{store_path}/tmp - tmp_root_path = self.get_store_path("metadata") / "tmp" + tmp_root_path = self._get_store_path("metadata") / "tmp" tmp = self._mktmpfile(tmp_root_path) # tmp is a file-like object that is already opened for writing by default @@ -1552,7 +1552,7 @@ def _verify_object_information( + f" {file_size_to_validate}." ) if pid is not None: - self.delete(entity, tmp_file_name) + self._delete(entity, tmp_file_name) exception_string_for_pid = ( exception_string + f" Tmp file deleted and file not stored for pid: {pid}" @@ -1581,7 +1581,7 @@ def _verify_object_information( ) if pid is not None: # Delete the tmp file - self.delete(entity, tmp_file_name) + self._delete(entity, tmp_file_name) exception_string_for_pid = ( exception_string + f" Tmp file ({tmp_file_name}) deleted." 
) @@ -1590,8 +1590,8 @@ def _verify_object_information( else: # Delete the object cid = hex_digests[self.algorithm] - cid_abs_path = self.resolve_path("cid", cid) - self.delete(entity, cid_abs_path) + cid_abs_path = self._resolve_path("cid", cid) + self._delete(entity, cid_abs_path) logging.error(exception_string) raise ValueError(exception_string) @@ -1604,8 +1604,8 @@ def _verify_hashstore_references(self, pid, cid, verify_type): :param str verify_type: "update" or "create" """ # Check that reference files were created - pid_ref_abs_path = self.resolve_path("pid", pid) - cid_ref_abs_path = self.resolve_path("cid", cid) + pid_ref_abs_path = self._resolve_path("pid", pid) + cid_ref_abs_path = self._resolve_path("cid", cid) if not os.path.exists(pid_ref_abs_path): exception_string = ( "FileHashStore - _verify_hashstore_references: Pid refs file missing: " @@ -1695,7 +1695,7 @@ def _check_arg_algorithms_and_checksum( additional_algorithm_checked = None if additional_algorithm != self.algorithm and additional_algorithm is not None: # Set additional_algorithm - additional_algorithm_checked = self.clean_algorithm(additional_algorithm) + additional_algorithm_checked = self._clean_algorithm(additional_algorithm) checksum_algorithm_checked = None if checksum is not None: self._check_string( @@ -1710,7 +1710,7 @@ def _check_arg_algorithms_and_checksum( "_check_arg_algorithms_and_checksum (store_object)", ) # Set checksum_algorithm - checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) + checksum_algorithm_checked = self._clean_algorithm(checksum_algorithm) return additional_algorithm_checked, checksum_algorithm_checked def _check_arg_format_id(self, format_id, method): @@ -1746,7 +1746,7 @@ def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): """ algorithm_list_to_calculate = self.default_algo_list if checksum_algorithm is not None: - self.clean_algorithm(checksum_algorithm) + self._clean_algorithm(checksum_algorithm) if 
checksum_algorithm in self.other_algo_list: debug_additional_other_algo_str = ( f"FileHashStore - _refine_algorithm_list: checksum algo: {checksum_algorithm}" @@ -1755,7 +1755,7 @@ def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): logging.debug(debug_additional_other_algo_str) algorithm_list_to_calculate.append(checksum_algorithm) if additional_algorithm is not None: - self.clean_algorithm(additional_algorithm) + self._clean_algorithm(additional_algorithm) if additional_algorithm in self.other_algo_list: debug_additional_other_algo_str = ( f"FileHashStore - _refine_algorithm_list: addit algo: {additional_algorithm}" @@ -1768,7 +1768,7 @@ def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): algorithm_list_to_calculate = set(algorithm_list_to_calculate) return algorithm_list_to_calculate - def clean_algorithm(self, algorithm_string): + def _clean_algorithm(self, algorithm_string): """Format a string and ensure that it is supported and compatible with the Python `hashlib` library. @@ -1791,14 +1791,14 @@ def clean_algorithm(self, algorithm_string): and cleaned_string not in self.other_algo_list ): exception_string = ( - "FileHashStore: clean_algorithm: Algorithm not supported:" + "FileHashStore: _clean_algorithm: Algorithm not supported:" + cleaned_string ) logging.error(exception_string) raise ValueError(exception_string) return cleaned_string - def computehash(self, stream, algorithm=None): + def _computehash(self, stream, algorithm=None): """Compute the hash of a file-like object (or string) using the store algorithm by default or with an optional supported algorithm. 
@@ -1812,14 +1812,14 @@ def computehash(self, stream, algorithm=None): if algorithm is None: hashobj = hashlib.new(self.algorithm) else: - check_algorithm = self.clean_algorithm(algorithm) + check_algorithm = self._clean_algorithm(algorithm) hashobj = hashlib.new(check_algorithm) for data in stream: hashobj.update(self._cast_to_bytes(data)) hex_digest = hashobj.hexdigest() return hex_digest - def exists(self, entity, file): + def _exists(self, entity, file): """Check whether a given file id or path exists on disk. :param str entity: Desired entity type (e.g., "objects", "metadata"). @@ -1828,10 +1828,10 @@ def exists(self, entity, file): :return: True if the file exists. :rtype: bool """ - file_exists = bool(self.resolve_path(entity, file)) + file_exists = bool(self._resolve_path(entity, file)) return file_exists - def shard(self, digest): + def _shard(self, digest): """Generates a list given a digest of `self.depth` number of tokens with width `self.width` from the first part of the digest plus the remainder. @@ -1857,7 +1857,7 @@ def compact(items): return hierarchical_list - def open(self, entity, file, mode="rb"): + def _open(self, entity, file, mode="rb"): """Return open buffer object from given id or path. Caller is responsible for closing the stream. @@ -1868,7 +1868,7 @@ def open(self, entity, file, mode="rb"): :return: An `io` stream dependent on the `mode`. :rtype: io.BufferedReader """ - realpath = self.resolve_path(entity, file) + realpath = self._resolve_path(entity, file) if realpath is None: raise IOError(f"Could not locate file: {file}") @@ -1877,14 +1877,14 @@ def open(self, entity, file, mode="rb"): buffer = io.open(realpath, mode) return buffer - def delete(self, entity, file): + def _delete(self, entity, file): """Delete file using id or path. Remove any empty directories after deleting. No exception is raised if file doesn't exist. :param str entity: Desired entity type (ex. "objects", "metadata"). :param str file: Address ID or path of file. 
""" - realpath = self.resolve_path(entity, file) + realpath = self._resolve_path(entity, file) if realpath is None: return None @@ -1929,7 +1929,7 @@ def _has_subdir(self, path): is_subdir = subpath.startswith(root_path) return is_subdir - def create_path(self, path): + def _create_path(self, path): """Physically create the folder path (and all intermediate ones) on disk. :param str path: The path to create. @@ -1940,7 +1940,7 @@ def create_path(self, path): except FileExistsError: assert os.path.isdir(path), f"expected {path} to be a directory" - def build_path(self, entity, hash_id, extension=""): + def _build_path(self, entity, hash_id, extension=""): """Build the absolute file path for a given hash ID with an optional file extension. :param str entity: Desired entity type (ex. "objects", "metadata"). @@ -1950,8 +1950,8 @@ def build_path(self, entity, hash_id, extension=""): :return: An absolute file path for the specified hash ID. :rtype: str """ - paths = self.shard(hash_id) - root_dir = self.get_store_path(entity) + paths = self._shard(hash_id) + root_dir = self._get_store_path(entity) if extension and not extension.startswith(os.extsep): extension = os.extsep + extension @@ -1961,7 +1961,7 @@ def build_path(self, entity, hash_id, extension=""): absolute_path = os.path.join(root_dir, *paths) + extension return absolute_path - def resolve_path(self, entity, file): + def _resolve_path(self, entity, file): """Attempt to determine the absolute path of a file ID or path through successive checking of candidate paths. @@ -1989,19 +1989,19 @@ def resolve_path(self, entity, file): # Check for sharded path. 
if entity == "cid": # Note, we skip checking whether the file exists for refs - ref_file_abs_path = self.build_path(entity, file) + ref_file_abs_path = self._build_path(entity, file) return ref_file_abs_path elif entity == "pid": # Note, we skip checking whether the file exists for refs - hash_id = self.computehash(file, self.algorithm) - ref_file_abs_path = self.build_path(entity, hash_id) + hash_id = self._computehash(file, self.algorithm) + ref_file_abs_path = self._build_path(entity, hash_id) return ref_file_abs_path else: - abspath = self.build_path(entity, file) + abspath = self._build_path(entity, file) if os.path.isfile(abspath): return abspath - def get_store_path(self, entity): + def _get_store_path(self, entity): """Return a path object of the root directory of the store. :param str entity: Desired entity type: "objects" or "metadata" @@ -2024,7 +2024,7 @@ def get_store_path(self, entity): f"entity: {entity} does not exist. Do you mean 'objects', 'metadata' or 'refs'?" ) - def count(self, entity): + def _count(self, entity): """Return the count of the number of files in the `root` directory. :param str entity: Desired entity type (ex. "objects", "metadata"). 
diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 1d549a6e..2aafec81 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -5,8 +5,7 @@ import pytest from hashstore.filehashstore import FileHashStore - -# Tests for HashStore Configuration and Related Methods +# pylint: disable=W0212 def test_pids_length(pids): @@ -121,7 +120,7 @@ def test_init_with_existing_hashstore_missing_yaml(store, pids): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - store.store_and_validate_data(pid, path) + store._store_and_validate_data(pid, path) os.remove(store.hashstore_configuration_yaml) properties = { "store_path": store.root, @@ -135,8 +134,8 @@ def test_init_with_existing_hashstore_missing_yaml(store, pids): def test_load_properties(store): - """Verify dictionary returned from load_properties matches initialization.""" - hashstore_yaml_dict = store.load_properties() + """Verify dictionary returned from _load_properties matches initialization.""" + hashstore_yaml_dict = store._load_properties() assert hashstore_yaml_dict.get("store_depth") == 3 assert hashstore_yaml_dict.get("store_width") == 2 assert hashstore_yaml_dict.get("store_algorithm") == "SHA-256" @@ -150,7 +149,7 @@ def test_load_properties_hashstore_yaml_missing(store): """Confirm FileNotFoundError is raised when hashstore.yaml does not exist.""" os.remove(store.hashstore_configuration_yaml) with pytest.raises(FileNotFoundError): - store.load_properties() + store._load_properties() def test_validate_properties(store): @@ -207,7 +206,7 @@ def test_set_default_algorithms_missing_yaml(store, pids): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - store.store_and_validate_data(pid, path) + store._store_and_validate_data(pid, path) os.remove(store.hashstore_configuration_yaml) with pytest.raises(FileNotFoundError): # pylint: disable=W0212 @@ -218,67 +217,67 @@ def 
test_set_default_algorithms_missing_yaml(store, pids): def test_store_and_validate_data_files_path(pids, store): - """Test store_and_validate_data objects with path object for the path arg.""" + """Test _store_and_validate_data objects with path object for the path arg.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = Path(test_dir) / pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) + object_metadata = store._store_and_validate_data(pid, path) object_metadata_id = object_metadata.cid - assert store.exists(entity, object_metadata_id) + assert store._exists(entity, object_metadata_id) def test_store_and_validate_data_files_string(pids, store): - """Test store_and_validate_data objects with string for the path arg.""" + """Test _store_and_validate_data objects with string for the path arg.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) + object_metadata = store._store_and_validate_data(pid, path) object_metadata_id = object_metadata.cid - assert store.exists(entity, object_metadata_id) + assert store._exists(entity, object_metadata_id) def test_store_and_validate_data_files_stream(pids, store): - """Test store_and_validate_data objects with stream for the path arg.""" + """Test _store_and_validate_data objects with stream for the path arg.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - object_metadata = store.store_and_validate_data(pid, input_stream) + object_metadata = store._store_and_validate_data(pid, input_stream) input_stream.close() object_metadata_id = object_metadata.cid - assert store.exists(entity, object_metadata_id) - assert store.count(entity) == 3 + assert store._exists(entity, object_metadata_id) + assert store._count(entity) == 3 def 
test_store_and_validate_data_cid(pids, store): - """Check store_and_validate_data returns correct id.""" + """Check _store_and_validate_data returns correct id.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) + object_metadata = store._store_and_validate_data(pid, path) object_metadata_id = object_metadata.cid assert object_metadata_id == pids[pid][store.algorithm] def test_store_and_validate_data_file_size(pids, store): - """Check store_and_validate_data returns correct file size.""" + """Check _store_and_validate_data returns correct file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) + object_metadata = store._store_and_validate_data(pid, path) object_size = object_metadata.obj_size assert object_size == pids[pid]["file_size_bytes"] def test_store_and_validate_data_hex_digests(pids, store): - """Check store_and_validate_data successfully generates hex digests dictionary.""" + """Check _store_and_validate_data successfully generates hex digests dictionary.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) + object_metadata = store._store_and_validate_data(pid, path) object_metadata_hex_digests = object_metadata.hex_digests assert object_metadata_hex_digests.get("md5") == pids[pid]["md5"] assert object_metadata_hex_digests.get("sha1") == pids[pid]["sha1"] @@ -288,12 +287,12 @@ def test_store_and_validate_data_hex_digests(pids, store): def test_store_and_validate_data_additional_algorithm(pids, store): - """Check store_and_validate_data returns additional algorithm in hex digests.""" + """Check _store_and_validate_data returns additional algorithm in hex digests.""" test_dir = "tests/testdata/" for pid in pids.keys(): algo = "sha224" path 
= test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data( + object_metadata = store._store_and_validate_data( pid, path, additional_algorithm=algo ) hex_digests = object_metadata.hex_digests @@ -302,20 +301,20 @@ def test_store_and_validate_data_additional_algorithm(pids, store): def test_store_and_validate_data_with_correct_checksums(pids, store): - """Check store_and_validate_data with valid checksum and checksum algorithm supplied.""" + """Check _store_and_validate_data with valid checksum and checksum algorithm supplied.""" test_dir = "tests/testdata/" for pid in pids.keys(): algo = "sha224" algo_checksum = pids[pid][algo] path = test_dir + pid.replace("/", "_") - store.store_and_validate_data( + store._store_and_validate_data( pid, path, checksum=algo_checksum, checksum_algorithm=algo ) - assert store.count("objects") == 3 + assert store._count("objects") == 3 def test_store_and_validate_data_with_incorrect_checksum(pids, store): - """Check store_and_validate_data fails when a bad checksum supplied.""" + """Check _store_and_validate_data fails when a bad checksum supplied.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -323,38 +322,38 @@ def test_store_and_validate_data_with_incorrect_checksum(pids, store): algo_checksum = "badChecksumValue" path = test_dir + pid.replace("/", "_") with pytest.raises(ValueError): - store.store_and_validate_data( + store._store_and_validate_data( pid, path, checksum=algo_checksum, checksum_algorithm=algo ) - assert store.count(entity) == 0 + assert store._count(entity) == 0 def test_store_data_only_cid(pids, store): - """Check store_data_only returns correct id.""" + """Check _store_data_only returns correct id.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_data_only(path) + object_metadata = store._store_data_only(path) object_metadata_id = object_metadata.cid assert object_metadata_id == 
pids[pid][store.algorithm] def test_store_data_only_file_size(pids, store): - """Check store_data_only returns correct file size.""" + """Check _store_data_only returns correct file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_data_only(path) + object_metadata = store._store_data_only(path) object_size = object_metadata.obj_size assert object_size == pids[pid]["file_size_bytes"] def test_store_data_only_hex_digests(pids, store): - """Check store_data_only generates hex digests dictionary.""" + """Check _store_data_only generates hex digests dictionary.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_data_only(path) + object_metadata = store._store_data_only(path) object_metadata_hex_digests = object_metadata.hex_digests assert object_metadata_hex_digests.get("md5") == pids[pid]["md5"] assert object_metadata_hex_digests.get("sha1") == pids[pid]["sha1"] @@ -437,7 +436,7 @@ def test_move_and_get_checksums_duplicates_raises_error(pids, store): checksum_algorithm="sha256", ) input_stream.close() - assert store.count(entity) == 3 + assert store._count(entity) == 3 def test_move_and_get_checksums_incorrect_file_size(pids, store): @@ -607,23 +606,23 @@ def test_write_to_tmp_file_and_get_hex_digests_with_unsupported_algorithm(pids, def test_mktmpfile(store): """Test that _mktmpfile creates and returns a tmp file.""" path = store.root + "/doutest/tmp/" - store.create_path(path) + store._create_path(path) # pylint: disable=W0212 tmp = store._mktmpfile(path) assert os.path.exists(tmp.name) def test_put_metadata_with_path(pids, store): - """Test put_metadata with path object for the path arg.""" + """Test _put_metadata with path object for the path arg.""" entity = "metadata" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): filename = pid.replace("/", 
"_") + ".xml" syspath = Path(test_dir) / filename - metadata_cid = store.put_metadata(syspath, pid, format_id) - assert store.exists(entity, metadata_cid) - assert store.count(entity) == 3 + metadata_cid = store._put_metadata(syspath, pid, format_id) + assert store._exists(entity, metadata_cid) + assert store._count(entity) == 3 def test_put_metadata_with_string(pids, store): @@ -634,9 +633,9 @@ def test_put_metadata_with_string(pids, store): for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = str(Path(test_dir) / filename) - metadata_cid = store.put_metadata(syspath, pid, format_id) - assert store.exists(entity, metadata_cid) - assert store.count(entity) == 3 + metadata_cid = store._put_metadata(syspath, pid, format_id) + assert store._exists(entity, metadata_cid) + assert store._count(entity) == 3 def test_put_metadata_cid(pids, store): @@ -646,7 +645,7 @@ def test_put_metadata_cid(pids, store): for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - metadata_cid = store.put_metadata(syspath, pid, format_id) + metadata_cid = store._put_metadata(syspath, pid, format_id) assert metadata_cid == pids[pid]["metadata_cid"] @@ -771,9 +770,9 @@ def test_clean_algorithm(store): algorithm_underscore = "sha_256" algorithm_hyphen = "sha-256" algorithm_other_hyphen = "sha3-256" - cleaned_algo_underscore = store.clean_algorithm(algorithm_underscore) - cleaned_algo_hyphen = store.clean_algorithm(algorithm_hyphen) - cleaned_algo_other_hyphen = store.clean_algorithm(algorithm_other_hyphen) + cleaned_algo_underscore = store._clean_algorithm(algorithm_underscore) + cleaned_algo_hyphen = store._clean_algorithm(algorithm_hyphen) + cleaned_algo_other_hyphen = store._clean_algorithm(algorithm_other_hyphen) assert cleaned_algo_underscore == "sha256" assert cleaned_algo_hyphen == "sha256" assert cleaned_algo_other_hyphen == "sha3_256" @@ -785,7 +784,7 @@ def test_computehash(pids, store): for pid in pids.keys(): path 
= test_dir + pid.replace("/", "_") obj_stream = io.open(path, "rb") - obj_sha256_hash = store.computehash(obj_stream, "sha256") + obj_sha256_hash = store._computehash(obj_stream, "sha256") obj_stream.close() assert pids[pid]["sha256"] == obj_sha256_hash @@ -793,7 +792,7 @@ def test_computehash(pids, store): def test_get_store_path_object(store): """Check get_store_path for object path.""" # pylint: disable=W0212 - path_objects = store.get_store_path("objects") + path_objects = store._get_store_path("objects") path_objects_string = str(path_objects) assert path_objects_string.endswith("/metacat/objects") @@ -801,7 +800,7 @@ def test_get_store_path_object(store): def test_get_store_path_metadata(store): """Check get_store_path for metadata path.""" # pylint: disable=W0212 - path_metadata = store.get_store_path("metadata") + path_metadata = store._get_store_path("metadata") path_metadata_string = str(path_metadata) assert path_metadata_string.endswith("/metacat/metadata") @@ -809,7 +808,7 @@ def test_get_store_path_metadata(store): def test_get_store_path_refs(store): """Check get_store_path for refs path.""" # pylint: disable=W0212 - path_metadata = store.get_store_path("refs") + path_metadata = store._get_store_path("refs") path_metadata_string = str(path_metadata) assert path_metadata_string.endswith("/metacat/refs") @@ -820,8 +819,8 @@ def test_exists_with_object_metadata_id(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) - assert store.exists(entity, object_metadata.cid) + object_metadata = store._store_and_validate_data(pid, path) + assert store._exists(entity, object_metadata.cid) def test_exists_with_sharded_path(pids, store): @@ -830,17 +829,17 @@ def test_exists_with_sharded_path(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) - 
object_metadata_shard = store.shard(object_metadata.cid) + object_metadata = store._store_and_validate_data(pid, path) + object_metadata_shard = store._shard(object_metadata.cid) object_metadata_shard_path = "/".join(object_metadata_shard) - assert store.exists(entity, object_metadata_shard_path) + assert store._exists(entity, object_metadata_shard_path) def test_exists_with_nonexistent_file(store): """Test exists method with a nonexistent file.""" entity = "objects" non_existent_file = "tests/testdata/filedoesnotexist" - does_not_exist = store.exists(entity, non_existent_file) + does_not_exist = store._exists(entity, non_existent_file) assert does_not_exist is False @@ -853,7 +852,7 @@ def test_shard(store): "5e", "d77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e", ] - sharded_list = store.shard(hash_id) + sharded_list = store._shard(hash_id) assert predefined_list == sharded_list @@ -863,9 +862,9 @@ def test_open_objects(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) + object_metadata = store._store_and_validate_data(pid, path) object_metadata_id = object_metadata.cid - io_buffer = store.open(entity, object_metadata_id) + io_buffer = store._open(entity, object_metadata_id) assert isinstance(io_buffer, io.BufferedReader) io_buffer.close() @@ -876,10 +875,10 @@ def test_delete_by_object_metadata_id(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) + object_metadata = store._store_and_validate_data(pid, path) object_metadata_id = object_metadata.cid - store.delete(entity, object_metadata_id) - assert store.count(entity) == 0 + store._delete(entity, object_metadata_id) + assert store._count(entity) == 0 def test_remove_empty_removes_empty_folders_string(store): @@ -927,8 +926,8 @@ def test_remove_empty_does_not_remove_nonempty_folders(pids, 
store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) - object_metadata_shard = store.shard(object_metadata.cid) + object_metadata = store._store_and_validate_data(pid, path) + object_metadata_shard = store._shard(object_metadata.cid) object_metadata_shard_path = "/".join(object_metadata_shard) # Get parent directory of the relative path parent_dir = os.path.dirname(object_metadata_shard_path) @@ -973,7 +972,7 @@ def test_create_path(pids, store): root_directory = store.root pid_hex_digest_directory = pids[pid]["metadata_cid"][:2] pid_directory = root_directory + pid_hex_digest_directory - store.create_path(pid_directory) + store._create_path(pid_directory) assert os.path.isdir(pid_directory) @@ -981,7 +980,7 @@ def test_get_real_path_file_does_not_exist(store): """Test get_real_path returns None when object does not exist.""" entity = "objects" test_path = "tests/testdata/helloworld.txt" - real_path_exists = store.resolve_path(entity, test_path) + real_path_exists = store._resolve_path(entity, test_path) assert real_path_exists is None @@ -991,8 +990,8 @@ def test_get_real_path_with_object_id(store, pids): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) - obj_abs_path = store.resolve_path(entity, object_metadata.cid) + object_metadata = store._store_and_validate_data(pid, path) + obj_abs_path = store._resolve_path(entity, object_metadata.cid) assert os.path.exists(obj_abs_path) @@ -1002,10 +1001,10 @@ def test_get_real_path_with_object_id_sharded(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) - object_metadata_shard = store.shard(object_metadata.cid) + object_metadata = store._store_and_validate_data(pid, path) + object_metadata_shard = 
store._shard(object_metadata.cid) object_metadata_shard_path = "/".join(object_metadata_shard) - obj_abs_path = store.resolve_path(entity, object_metadata_shard_path) + obj_abs_path = store._resolve_path(entity, object_metadata_shard_path) assert os.path.exists(obj_abs_path) @@ -1018,7 +1017,7 @@ def test_get_real_path_with_metadata_id(store, pids): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename metadata_cid = store.store_metadata(pid, syspath, format_id) - metadata_abs_path = store.resolve_path(entity, metadata_cid) + metadata_abs_path = store._resolve_path(entity, metadata_cid) assert os.path.exists(metadata_abs_path) @@ -1028,9 +1027,9 @@ def test_get_real_path_with_bad_entity(store, pids): entity = "bad_entity" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) + object_metadata = store._store_and_validate_data(pid, path) with pytest.raises(ValueError): - store.resolve_path(entity, object_metadata.cid) + store._resolve_path(entity, object_metadata.cid) def test_build_path(store, pids): @@ -1039,9 +1038,9 @@ def test_build_path(store, pids): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - _ = store.store_and_validate_data(pid, path) + _ = store._store_and_validate_data(pid, path) # pylint: disable=W0212 - abs_path = store.build_path(entity, pids[pid][store.algorithm]) + abs_path = store._build_path(entity, pids[pid][store.algorithm]) assert os.path.exists(abs_path) @@ -1051,8 +1050,8 @@ def test_count(pids, store): entity = "objects" for pid in pids.keys(): path_string = test_dir + pid.replace("/", "_") - store.store_and_validate_data(pid, path_string) - assert store.count(entity) == 3 + store._store_and_validate_data(pid, path_string) + assert store._count(entity) == 3 def test_cast_to_bytes(store): diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 238d4e80..40dc99f0 100644 
--- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -8,6 +8,9 @@ import time import pytest +# pylint: disable=W0212 + + # Define a mark to be used to label slow tests slow_test = pytest.mark.skipif( "not config.getoption('--run-slow')", @@ -38,7 +41,7 @@ def test_store_object(pids, store): path = Path(test_dir + pid.replace("/", "_")) object_metadata = store.store_object(pid, path) assert object_metadata.cid == pids[pid][store.algorithm] - assert store.count(entity) == 3 + assert store._count(entity) == 3 def test_store_object_files_path(pids, store): @@ -48,8 +51,8 @@ def test_store_object_files_path(pids, store): for pid in pids.keys(): path = Path(test_dir + pid.replace("/", "_")) _object_metadata = store.store_object(pid, path) - assert store.exists(entity, pids[pid][store.algorithm]) - assert store.count(entity) == 3 + assert store._exists(entity, pids[pid][store.algorithm]) + assert store._count(entity) == 3 def test_store_object_files_string(pids, store): @@ -59,8 +62,8 @@ def test_store_object_files_string(pids, store): for pid in pids.keys(): path_string = test_dir + pid.replace("/", "_") _object_metadata = store.store_object(pid, path_string) - assert store.exists(entity, pids[pid][store.algorithm]) - assert store.count(entity) == 3 + assert store._exists(entity, pids[pid][store.algorithm]) + assert store._count(entity) == 3 def test_store_object_files_input_stream(pids, store): @@ -72,8 +75,8 @@ def test_store_object_files_input_stream(pids, store): input_stream = io.open(path, "rb") _object_metadata = store.store_object(pid, input_stream) input_stream.close() - assert store.exists(entity, pids[pid][store.algorithm]) - assert store.count(entity) == 3 + assert store._exists(entity, pids[pid][store.algorithm]) + assert store._count(entity) == 3 def test_store_object_id(pids, store): @@ -170,7 +173,7 @@ def test_store_object_additional_algorithm_hyphen_uppercase(pids, store): object_metadata = store.store_object(pid, 
path, algorithm_with_hyphen_and_upper) sha256_cid = object_metadata.hex_digests.get("sha384") assert sha256_cid == pids[pid]["sha384"] - assert store.exists(entity, pids[pid][store.algorithm]) + assert store._exists(entity, pids[pid][store.algorithm]) def test_store_object_additional_algorithm_hyphen_lowercase(pids, store): @@ -186,7 +189,7 @@ def test_store_object_additional_algorithm_hyphen_lowercase(pids, store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) assert additional_sha3_256_hex_digest == sha3_256_checksum - assert store.exists(entity, pids[pid][store.algorithm]) + assert store._exists(entity, pids[pid][store.algorithm]) def test_store_object_additional_algorithm_underscore(pids, store): @@ -202,7 +205,7 @@ def test_store_object_additional_algorithm_underscore(pids, store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) assert additional_sha3_256_hex_digest == sha3_256_checksum - assert store.exists(entity, pids[pid][store.algorithm]) + assert store._exists(entity, pids[pid][store.algorithm]) def test_store_object_checksum_correct(store): @@ -218,7 +221,7 @@ def test_store_object_checksum_correct(store): _object_metadata = store.store_object( pid, path, checksum=checksum_correct, checksum_algorithm=checksum_algo ) - assert store.count(entity) == 1 + assert store._count(entity) == 1 def test_store_object_checksum_correct_and_additional_algo(store): @@ -343,7 +346,7 @@ def test_store_object_duplicate_does_not_store_duplicate(store): pid_that_refs_existing_cid = "dou.test.1" _object_metadata_two = store.store_object(pid_that_refs_existing_cid, path) # Confirm only one object exists and the tmp file created is deleted - assert store.count(entity) == 1 + assert store._count(entity) == 1 def test_store_object_duplicate_references_files(pids, store): @@ -361,11 +364,11 @@ def test_store_object_duplicate_references_files(pids, store): pid_three = "dou.test.2" _object_metadata_three = store.store_object(pid_three, 
path) # Confirm that there are 3 pid reference files - assert store.count("pid") == 3 + assert store._count("pid") == 3 # Confirm that there are 1 cid reference files - assert store.count("cid") == 1 + assert store._count("cid") == 1 # Confirm the content of the cid refence files - cid_ref_abs_path = store.resolve_path("cid", pids[pid][store.algorithm]) + cid_ref_abs_path = store._resolve_path("cid", pids[pid][store.algorithm]) with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): value = line.strip() @@ -387,7 +390,7 @@ def test_store_object_duplicate_references_content(pids, store): pid_three = "dou.test.2" store.store_object(pid_three, path) # Confirm the content of the cid refence files - cid_ref_abs_path = store.resolve_path("cid", pids[pid][store.algorithm]) + cid_ref_abs_path = store._resolve_path("cid", pids[pid][store.algorithm]) with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): value = line.strip() @@ -411,8 +414,8 @@ def test_store_object_duplicate_raises_error_with_bad_validation_data(pids, stor _object_metadata_two = store.store_object( pid, path, checksum="nonmatchingchecksum", checksum_algorithm="sha256" ) - assert store.count(entity) == 1 - assert store.exists(entity, pids[pid][store.algorithm]) + assert store._count(entity) == 1 + assert store._exists(entity, pids[pid][store.algorithm]) def test_store_object_with_obj_file_size(store, pids): @@ -484,8 +487,8 @@ def store_object_wrapper(obj_pid, obj_path): thread2.join() thread3.join() # One thread will succeed, file count must still be 1 - assert store.count(entity) == 1 - assert store.exists(entity, pids[pid][store.algorithm]) + assert store._count(entity) == 1 + assert store._exists(entity, pids[pid][store.algorithm]) assert file_exists_error_flag @@ -600,7 +603,7 @@ def test_find_object_pid_refs_cid_not_found(pids, store): _object_metadata = store.store_object(pid, path) # Place the wrong cid into the pid refs 
file that has already been created - pid_ref_abs_path = store.resolve_path("pid", pid) + pid_ref_abs_path = store._resolve_path("pid", pid) with open(pid_ref_abs_path, "w", encoding="utf8") as pid_ref_file: pid_ref_file.seek(0) pid_ref_file.write("intentionally.wrong.pid") @@ -658,9 +661,9 @@ def test_store_metadata_files_path(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename metadata_cid = store.store_metadata(pid, syspath, format_id) - assert store.exists(entity, metadata_cid) + assert store._exists(entity, metadata_cid) assert metadata_cid == pids[pid]["metadata_cid"] - assert store.count(entity) == 3 + assert store._count(entity) == 3 def test_store_metadata_files_string(pids, store): @@ -672,8 +675,8 @@ def test_store_metadata_files_string(pids, store): filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) metadata_cid = store.store_metadata(pid, syspath_string, format_id) - assert store.exists(entity, metadata_cid) - assert store.count(entity) == 3 + assert store._exists(entity, metadata_cid) + assert store._count(entity) == 3 def test_store_metadata_files_input_stream(pids, store): @@ -687,7 +690,7 @@ def test_store_metadata_files_input_stream(pids, store): syspath_stream = io.open(syspath_string, "rb") _metadata_cid = store.store_metadata(pid, syspath_stream, format_id) syspath_stream.close() - assert store.count(entity) == 3 + assert store._count(entity) == 3 def test_store_metadata_pid_empty(store): @@ -778,7 +781,7 @@ def test_store_metadata_thread_lock(store): thread2.join() thread3.join() thread4.join() - assert store.count(entity) == 1 + assert store._count(entity) == 1 def test_retrieve_object(pids, store): @@ -792,7 +795,7 @@ def test_retrieve_object(pids, store): object_metadata = store.store_object(pid, path) store.store_metadata(pid, syspath, format_id) obj_stream = store.retrieve_object(pid) - sha256_hex = store.computehash(obj_stream) + sha256_hex = 
store._computehash(obj_stream) obj_stream.close() assert sha256_hex == object_metadata.hex_digests.get("sha256") @@ -890,7 +893,7 @@ def test_delete_object(pids, store): _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) - assert store.count(entity) == 0 + assert store._count(entity) == 0 def test_delete_object_pid_refs_file(pids, store): @@ -904,7 +907,7 @@ def test_delete_object_pid_refs_file(pids, store): _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) - pid_refs_file_path = store.resolve_path("pid", pid) + pid_refs_file_path = store._resolve_path("pid", pid) assert not os.path.exists(pid_refs_file_path) @@ -920,7 +923,7 @@ def test_delete_object_cid_refs_file(pids, store): _metadata_cid = store.store_metadata(pid, syspath, format_id) cid = object_metadata.cid store.delete_object(pid) - cid_refs_file_path = store.resolve_path("cid", cid) + cid_refs_file_path = store._resolve_path("cid", cid) assert not os.path.exists(cid_refs_file_path) @@ -931,11 +934,11 @@ def test_delete_object_cid_refs_file_with_pid_refs_remaining(pids, store): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(pid, path) cid = object_metadata.cid - cid_refs_abs_path = store.resolve_path("cid", cid) + cid_refs_abs_path = store._resolve_path("cid", cid) # pylint: disable=W0212 store._update_cid_refs(cid_refs_abs_path, "dou.test.1") store.delete_object(pid) - cid_refs_file_path = store.resolve_path("cid", cid) + cid_refs_file_path = store._resolve_path("cid", cid) assert os.path.exists(cid_refs_file_path) @@ -966,7 +969,7 @@ def test_delete_metadata(pids, store): _metadata_cid = store.store_metadata(pid, syspath, format_id) is_deleted = store.delete_metadata(pid, format_id) assert is_deleted - assert store.count(entity) == 0 + assert store._count(entity) == 0 def 
test_delete_metadata_does_not_exist(pids, store): @@ -989,7 +992,7 @@ def test_delete_metadata_default_format_id(store, pids): _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath) store.delete_metadata(pid) - assert store.count(entity) == 0 + assert store._count(entity) == 0 def test_delete_metadata_pid_empty(store): @@ -1100,14 +1103,14 @@ def test_store_and_delete_objects_100_pids_1_cid(store): store.store_object(pid_modified, path) assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/pid")]) == 100 assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/cid")]) == 1 - assert store.count("objects") == 1 + assert store._count("objects") == 1 # Delete for i in range(1, upper_limit): pid_modified = f"dou.test.{str(i)}" store.delete_object(pid_modified) assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/pid")]) == 0 assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/cid")]) == 0 - assert store.count("objects") == 0 + assert store._count("objects") == 0 def test_store_and_delete_object_300_pids_1_cid_threads(store): @@ -1159,4 +1162,4 @@ def delete_object_wrapper(pid_var): assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/pid")]) == 0 assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/cid")]) == 0 - assert store.count("objects") == 0 + assert store._count("objects") == 0 diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 5ec56e59..92f5ae59 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -23,7 +23,7 @@ def test_tag_object_pid_refs_file(pids, store): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) store.tag_object(pid, object_metadata.cid) - pid_refs_file_path = store.resolve_path("pid", pid) + pid_refs_file_path = store._resolve_path("pid", pid) assert 
os.path.exists(pid_refs_file_path) @@ -35,9 +35,9 @@ def test_tag_object_pid_refs_file_exists(pids, store): object_metadata = store.store_object(None, path) cid = object_metadata.cid store.tag_object(pid, cid) - pid_refs_file_path = store.resolve_path("pid", pid) + pid_refs_file_path = store._resolve_path("pid", pid) assert os.path.exists(pid_refs_file_path) - cid_refs_file_path = store.resolve_path("cid", cid) + cid_refs_file_path = store._resolve_path("cid", cid) assert os.path.exists(cid_refs_file_path) with pytest.raises(FileExistsError): store.tag_object(pid, cid) @@ -50,7 +50,7 @@ def test_tag_object_pid_refs_file_content(pids, store): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) store.tag_object(pid, object_metadata.cid) - pid_refs_file_path = store.resolve_path("pid", pid) + pid_refs_file_path = store._resolve_path("pid", pid) with open(pid_refs_file_path, "r", encoding="utf8") as f: pid_refs_cid = f.read() assert pid_refs_cid == object_metadata.cid @@ -64,7 +64,7 @@ def test_tag_object_cid_refs_file(pids, store): object_metadata = store.store_object(None, path) cid = object_metadata.cid store.tag_object(pid, object_metadata.cid) - cid_refs_file_path = store.resolve_path("cid", cid) + cid_refs_file_path = store._resolve_path("cid", cid) assert os.path.exists(cid_refs_file_path) @@ -75,7 +75,7 @@ def test_tag_object_cid_refs_file_content(pids, store): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) store.tag_object(pid, object_metadata.cid) - cid_refs_file_path = store.resolve_path("cid", object_metadata.cid) + cid_refs_file_path = store._resolve_path("cid", object_metadata.cid) with open(cid_refs_file_path, "r", encoding="utf8") as f: pid_refs_cid = f.read().strip() assert pid_refs_cid == pid @@ -93,7 +93,7 @@ def test_tag_object_cid_refs_file_exists(pids, store): with pytest.raises(FileExistsError): store.tag_object(pid, another_cid) - second_cid_hash = 
store.resolve_path("cid", another_cid) + second_cid_hash = store._resolve_path("cid", another_cid) assert not os.path.exists(second_cid_hash) @@ -112,7 +112,7 @@ def test_tag_object_cid_refs_update_cid_refs_updated(store): store.tag_object(additional_pid, cid) # Read cid file to confirm cid refs file contains the additional pid - cid_ref_abs_path = store.resolve_path("cid", cid) + cid_ref_abs_path = store._resolve_path("cid", cid) with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): value = line.strip() @@ -134,7 +134,7 @@ def test_tag_object_cid_refs_update_pid_refs_created(store): additional_pid = "dou.test.1" store.tag_object(additional_pid, cid) - pid_refs_file_path = store.resolve_path("pid", additional_pid) + pid_refs_file_path = store._resolve_path("pid", additional_pid) assert os.path.exists(pid_refs_file_path) @@ -149,11 +149,11 @@ def test_tag_object_cid_refs_update_pid_found_but_file_missing(store): cid = object_metadata.cid # Manually update the cid refs, pid refs file missing at this point additional_pid = "dou.test.1" - cid_ref_abs_path = store.resolve_path("cid", cid) + cid_ref_abs_path = store._resolve_path("cid", cid) store._update_cid_refs(cid_ref_abs_path, additional_pid) # Confirm the pid refs file is missing - pid_refs_file_path = store.resolve_path("pid", additional_pid) + pid_refs_file_path = store._resolve_path("pid", additional_pid) assert not os.path.exists(pid_refs_file_path) # Call tag_object, this should create the missing pid refs file @@ -209,7 +209,7 @@ def test_verify_object_exception_incorrect_size(pids, store): cid = object_metadata.cid cid = object_metadata.hex_digests[store.algorithm] - cid_abs_path = store.resolve_path("cid", cid) + cid_abs_path = store._resolve_path("cid", cid) assert not os.path.exists(cid_abs_path) @@ -231,7 +231,7 @@ def test_verify_object_exception_incorrect_checksum(pids, store): cid = object_metadata.cid cid = object_metadata.hex_digests[store.algorithm] - 
cid_abs_path = store.resolve_path("cid", cid) + cid_abs_path = store._resolve_path("cid", cid) assert not os.path.exists(cid_abs_path) @@ -249,7 +249,7 @@ def test_verify_object_exception_incorrect_checksum_algo(pids, store): def test_write_cid_refs_file(store): """Test that write_cid_reference writes a reference file.""" - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, "test_pid") assert os.path.exists(tmp_cid_refs_file) @@ -257,7 +257,7 @@ def test_write_cid_refs_file(store): def test_write_cid_refs_file_content(pids, store): """Test that write_cid_ref_file writes the expected content.""" for pid in pids.keys(): - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) with open(tmp_cid_refs_file, "r", encoding="utf8") as f: cid_ref_file_pid = f.read() @@ -268,7 +268,7 @@ def test_write_cid_refs_file_content(pids, store): def test_update_cid_refs_content(pids, store): """Test that update_cid_ref updates the ref file as expected.""" for pid in pids.keys(): - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) pid_other = "dou.test.1" store._update_cid_refs(tmp_cid_refs_file, pid_other) @@ -282,7 +282,7 @@ def test_update_cid_refs_content(pids, store): def test_update_cid_refs_content_multiple(pids, store): """Test that update_cid_refs adds multiple references successfully.""" for pid in pids.keys(): - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) cid_reference_list = [pid] @@ -304,7 +304,7 @@ def test_update_cid_refs_content_pid_exists(pids, store): """Test that 
update_cid_ref does not throw exception if pid already exists and proceeds to complete the tagging process (verify_object)""" for pid in pids.keys(): - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) # Exception should not be thrown store._update_cid_refs(tmp_cid_refs_file, pid) @@ -314,7 +314,7 @@ def test_update_cid_refs_content_cid_refs_does_not_exist(pids, store): """Test that update_cid_ref throws exception if cid refs file doesn't exist.""" for pid in pids.keys(): cid = pids[pid]["sha256"] - cid_ref_abs_path = store.resolve_path("cid", cid) + cid_ref_abs_path = store._resolve_path("cid", cid) with pytest.raises(FileNotFoundError): store._update_cid_refs(cid_ref_abs_path, pid) @@ -322,7 +322,7 @@ def test_update_cid_refs_content_cid_refs_does_not_exist(pids, store): def test_delete_cid_refs_pid(pids, store): """Test that delete_cid_refs_pid deletes the given pid from the ref file.""" for pid in pids.keys(): - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) pid_other = "dou.test.1" @@ -339,7 +339,7 @@ def test_delete_cid_refs_pid(pids, store): def test_delete_cid_refs_pid_file(pids, store): """Test that delete_cid_refs_pid leaves a file empty when removing the last pid.""" for pid in pids.keys(): - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) # First remove the pid store._delete_cid_refs_pid(tmp_cid_refs_file, pid) @@ -352,7 +352,7 @@ def test_write_pid_refs_file(pids, store): """Test that write_pid_refs_file writes a reference file.""" for pid in pids.keys(): cid = pids[pid]["sha256"] - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = 
store._get_store_path("refs") / "tmp" tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) assert os.path.exists(tmp_pid_refs_file) @@ -361,7 +361,7 @@ def test_write_pid_refs_file_content(pids, store): """Test that write_pid_refs_file writes the expected content.""" for pid in pids.keys(): cid = pids[pid]["sha256"] - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) with open(tmp_pid_refs_file, "r", encoding="utf8") as f: pid_refs_cid = f.read() @@ -373,7 +373,7 @@ def test_delete_pid_refs_file(pids, store): """Test that delete_pid_refs_file deletes a reference file.""" for pid in pids.keys(): cid = pids[pid]["sha256"] - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) store._delete_pid_refs_file(tmp_pid_refs_file) @@ -383,7 +383,7 @@ def test_delete_pid_refs_file(pids, store): def test_delete_pid_refs_file_file_not_found(pids, store): """Test that delete_pid_refs_file raises an exception when refs file not found.""" for pid in pids.keys(): - pid_ref_abs_path = store.resolve_path("pid", pid) + pid_ref_abs_path = store._resolve_path("pid", pid) with pytest.raises(FileNotFoundError): store._delete_pid_refs_file(pid_ref_abs_path) @@ -401,15 +401,15 @@ def test_verify_hashstore_references_pid_refs_incorrect_cid(pids, store): for pid in pids.keys(): cid = pids[pid]["sha256"] # Write the cid refs file and move it where it needs to be - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) - cid_ref_abs_path = store.resolve_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) + cid_ref_abs_path = store._resolve_path("cid", cid) + 
store._create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) # Write the pid refs file and move it where it needs to be with a bad cid - pid_ref_abs_path = store.resolve_path("pid", pid) - store.create_path(os.path.dirname(pid_ref_abs_path)) - tmp_root_path = store.get_store_path("refs") / "tmp" + pid_ref_abs_path = store._resolve_path("pid", pid) + store._create_path(os.path.dirname(pid_ref_abs_path)) + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, "bad_cid") shutil.move(tmp_pid_refs_file, pid_ref_abs_path) @@ -421,9 +421,9 @@ def test_verify_hashstore_references_cid_refs_file_missing(pids, store): """Test _verify_hashstore_references throws exception when cid refs file is missing.""" for pid in pids.keys(): cid = pids[pid]["sha256"] - pid_ref_abs_path = store.resolve_path("pid", pid) - store.create_path(os.path.dirname(pid_ref_abs_path)) - tmp_root_path = store.get_store_path("refs") / "tmp" + pid_ref_abs_path = store._resolve_path("pid", pid) + store._create_path(os.path.dirname(pid_ref_abs_path)) + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, "bad_cid") shutil.move(tmp_pid_refs_file, pid_ref_abs_path) @@ -437,15 +437,15 @@ def test_verify_hashstore_references_cid_refs_file_missing_pid(pids, store): for pid in pids.keys(): cid = pids[pid]["sha256"] # Get a tmp cid refs file and write the wrong pid into it - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, "bad pid") - cid_ref_abs_path = store.resolve_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) + cid_ref_abs_path = store._resolve_path("cid", cid) + store._create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) # Now write the pid refs file, both 
cid and pid refs must be present - pid_ref_abs_path = store.resolve_path("pid", pid) - store.create_path(os.path.dirname(pid_ref_abs_path)) - tmp_root_path = store.get_store_path("refs") / "tmp" + pid_ref_abs_path = store._resolve_path("pid", pid) + store._create_path(os.path.dirname(pid_ref_abs_path)) + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) shutil.move(tmp_pid_refs_file, pid_ref_abs_path) @@ -461,15 +461,15 @@ def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pi for pid in pids.keys(): cid = pids[pid]["sha256"] # Write the wrong pid into a cid refs file and move it where it needs to be - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, "bad pid") - cid_ref_abs_path = store.resolve_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) + cid_ref_abs_path = store._resolve_path("cid", cid) + store._create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) # Now write the pid refs with expected values - pid_ref_abs_path = store.resolve_path("pid", pid) - store.create_path(os.path.dirname(pid_ref_abs_path)) - tmp_root_path = store.get_store_path("refs") / "tmp" + pid_ref_abs_path = store._resolve_path("pid", pid) + store._create_path(os.path.dirname(pid_ref_abs_path)) + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) shutil.move(tmp_pid_refs_file, pid_ref_abs_path) diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 3aee347a..3511bbe8 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -132,7 +132,7 @@ def test_store_object(store, pids): sys.argv = chs_args hashstoreclient.main() - assert store.exists("objects", pids[pid][store.algorithm]) + assert 
store._exists("objects", pids[pid][store.algorithm]) def test_store_metadata(store, pids): @@ -164,7 +164,7 @@ def test_store_metadata(store, pids): sys.argv = chs_args hashstoreclient.main() - assert store.exists("metadata", pids[pid]["metadata_cid"]) + assert store._exists("metadata", pids[pid]["metadata_cid"]) def test_retrieve_objects(capsys, pids, store): @@ -272,7 +272,7 @@ def test_delete_objects(pids, store): sys.argv = chs_args hashstoreclient.main() - assert not store.exists("objects", pids[pid][store.algorithm]) + assert not store._exists("objects", pids[pid][store.algorithm]) def test_delete_metadata(pids, store): @@ -304,4 +304,4 @@ def test_delete_metadata(pids, store): sys.argv = chs_args hashstoreclient.main() - assert not store.exists("metadata", pids[pid]["metadata_cid"]) + assert not store._exists("metadata", pids[pid]["metadata_cid"]) From f040cda30b90bb37dd44274e80e820f671b2a375 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 25 Jan 2024 10:53:18 -0800 Subject: [PATCH 143/420] Update inaccurate docstring in 'HashStoreClient' --- src/hashstore/hashstoreclient.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hashstore/hashstoreclient.py b/src/hashstore/hashstoreclient.py index a1457e46..d786682f 100644 --- a/src/hashstore/hashstoreclient.py +++ b/src/hashstore/hashstoreclient.py @@ -245,11 +245,11 @@ class HashStoreClient: MET_TYPE = "metadata" def __init__(self, properties, testflag=None): - """Store objects in a given directory into HashStore. + """Initialize the HashStoreClient with optional flag to test with the + test server at 'test.arcticdata.io' - :param str origin_dir: Directory to convert. - :param str obj_type: Type of objects ('object' or 'metadata'). - :param int num: Number of files to store. 
+ :param dict properties: HashStore properties to initialize with + :param str testflag: 'knbvm' to denote testing on 'test.arcticdata.io' """ factory = HashStoreFactory() From 031d46831fc5d4e469bde231bd382577d11008f4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 25 Jan 2024 11:10:08 -0800 Subject: [PATCH 144/420] Refactor '_load_properties' to be static and update pytests and affected code --- src/hashstore/filehashstore.py | 24 ++++++++++++++---------- tests/test_filehashstore.py | 8 ++++++-- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 1a582f3f..cf6a77ef 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -126,7 +126,8 @@ def __init__(self, properties=None): # Configuration and Related Methods - def _load_properties(self): + @staticmethod + def _load_properties(hahstore_yaml_path, hashstore_required_prop_keys): """Get and return the contents of the current HashStore configuration. :return: HashStore properties with the following keys (and values): @@ -136,7 +137,7 @@ def _load_properties(self): - ``store_metadata_namespace`` (str): Namespace for the HashStore's system metadata. :rtype: dict """ - if not os.path.exists(self.hashstore_configuration_yaml): + if not os.path.exists(hahstore_yaml_path): exception_string = ( "FileHashStore - load_properties: hashstore.yaml not found" + " in store root path." 
@@ -145,14 +146,12 @@ def _load_properties(self): raise FileNotFoundError(exception_string) # Open file - with open( - self.hashstore_configuration_yaml, "r", encoding="utf-8" - ) as hs_yaml_file: + with open(hahstore_yaml_path, "r", encoding="utf-8") as hs_yaml_file: yaml_data = yaml.safe_load(hs_yaml_file) # Get hashstore properties hashstore_yaml_dict = {} - for key in self.property_required_keys: + for key in hashstore_required_prop_keys: if key != "store_path": hashstore_yaml_dict[key] = yaml_data[key] logging.debug( @@ -303,7 +302,9 @@ def _verify_hashstore_properties(self, properties, prop_store_path): self.hashstore_configuration_yaml, ) # If 'hashstore.yaml' is found, verify given properties before init - hashstore_yaml_dict = self._load_properties() + hashstore_yaml_dict = self._load_properties( + self.hashstore_configuration_yaml, self.property_required_keys + ) for key in self.property_required_keys: # 'store_path' is required to init HashStore but not saved in `hashstore.yaml` if key != "store_path": @@ -1305,7 +1306,8 @@ def _is_pid_in_cid_refs_file(self, pid, cid_ref_abs_path): return True return False - def _update_cid_refs(self, cid_ref_abs_path, pid): + @staticmethod + def _update_cid_refs(cid_ref_abs_path, pid): """Update an existing CID reference file with the given PID. :param str cid_ref_abs_path: Absolute path to the CID reference file. @@ -1341,7 +1343,8 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): logging.error(exception_string) raise err - def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): + @staticmethod + def _delete_cid_refs_pid(cid_ref_abs_path, pid): """Delete a PID from a CID reference file. :param str cid_ref_abs_path: Absolute path to the CID reference file. 
@@ -1645,7 +1648,8 @@ def _verify_hashstore_references(self, pid, cid, verify_type): logging.error(exception_string) raise ValueError(exception_string) - def _check_arg_data(self, data): + @staticmethod + def _check_arg_data(data): """Checks a data argument to ensure that it is either a string, path, or stream object. diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 2aafec81..c11726ef 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -135,7 +135,9 @@ def test_init_with_existing_hashstore_missing_yaml(store, pids): def test_load_properties(store): """Verify dictionary returned from _load_properties matches initialization.""" - hashstore_yaml_dict = store._load_properties() + hashstore_yaml_dict = store._load_properties( + store.hashstore_configuration_yaml, store.property_required_keys + ) assert hashstore_yaml_dict.get("store_depth") == 3 assert hashstore_yaml_dict.get("store_width") == 2 assert hashstore_yaml_dict.get("store_algorithm") == "SHA-256" @@ -149,7 +151,9 @@ def test_load_properties_hashstore_yaml_missing(store): """Confirm FileNotFoundError is raised when hashstore.yaml does not exist.""" os.remove(store.hashstore_configuration_yaml) with pytest.raises(FileNotFoundError): - store._load_properties() + store._load_properties( + store.hashstore_configuration_yaml, store.property_required_keys + ) def test_validate_properties(store): From 20a1c32154109576dcf048566af02df2cf5f2994 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 31 Jan 2024 13:59:43 -0800 Subject: [PATCH 145/420] Add new parameter 'id_type' to 'delete_object' method --- src/hashstore/filehashstore.py | 141 +++++++++++++++++---------------- 1 file changed, 72 insertions(+), 69 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index cf6a77ef..1cefbb24 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -770,80 +770,83 @@ def retrieve_metadata(self, pid, 
format_id=None): ) return metadata_stream - def delete_object(self, pid): + def delete_object(self, pid, id_type=None): logging.debug( "FileHashStore - delete_object: Request to delete object for pid: %s", pid ) self._check_string(pid, "pid", "delete_object") - try: - cid = self.find_object(pid) - except FileNotFoundError as fnfe: - if "pid refs file not found" in fnfe: - # Nothing to delete - return - if "cid refs file not found" in fnfe: - # Delete pid refs file - pid_ref_abs_path = self._resolve_path("pid", pid) - self._delete("pid", pid_ref_abs_path) - return - if "object referenced does not exist" in fnfe: - # Delete pid refs file - pid_ref_abs_path = self._resolve_path("pid", pid) - self._delete("pid", pid_ref_abs_path) - return - except ValueError as ve: - if "is missing from cid refs file" in ve: - # Delete pid refs file - pid_ref_abs_path = self._resolve_path("pid", pid) - self._delete("pid", pid_ref_abs_path) - return - - # Proceed with next steps - cid has been retrieved without any errors - while cid in self.reference_locked_cids: - logging.debug( - "FileHashStore - delete_object: (cid) %s is currently locked. 
Waiting", - cid, - ) - time.sleep(self.time_out_sec) - # Modify reference_locked_cids consecutively - with self.reference_lock: - logging.debug( - "FileHashStore - delete_object: Adding cid: %s to reference_locked_cids.", - cid, - ) - self.reference_locked_cids.append(cid) - try: - cid_ref_abs_path = self._resolve_path("cid", cid) - pid_ref_abs_path = self._resolve_path("pid", pid) - # First delete the pid refs file immediately - self._delete_pid_refs_file(pid_ref_abs_path) - # Remove pid from cid reference file - self._delete_cid_refs_pid(cid_ref_abs_path, pid) - # Delete cid reference file and object only if the cid refs file is empty - if os.path.getsize(cid_ref_abs_path) == 0: - self._delete("cid", cid_ref_abs_path) - self._delete("objects", cid) - info_string = ( - "FileHashStore - delete_object: Successfully deleted references and" - + f" object associated with pid: {pid}" - ) - logging.info(info_string) - else: - info_string = ( - "FileHashStore - delete_object: Successfully deleted pid refs file but" - + f" not object with cid ({cid}), cid refs file not empty." - ) - logging.info(info_string) + if id_type is "cid": + # TODO: Delete object only return True - - finally: - # Release cid + else: + # id_type is "pid" + try: + cid = self.find_object(pid) + except FileNotFoundError as fnfe: + if "pid refs file not found" in fnfe: + # Nothing to delete + return + if "cid refs file not found" in fnfe: + # Delete pid refs file + self._delete("pid", pid) + return + if "object referenced does not exist" in fnfe: + # Delete pid refs file + self._delete("pid", pid) + return + except ValueError as ve: + if "is missing from cid refs file" in ve: + # Delete pid refs file + self._delete("pid", pid) + return + + # Proceed with next steps - cid has been retrieved without any errors + while cid in self.reference_locked_cids: + logging.debug( + "FileHashStore - delete_object: (cid) %s is currently locked. 
Waiting", + cid, + ) + time.sleep(self.time_out_sec) + # Modify reference_locked_cids consecutively with self.reference_lock: logging.debug( - "FileHashStore - delete_object: Removing cid: %s from reference_locked_cids.", + "FileHashStore - delete_object: Adding cid: %s to reference_locked_cids.", cid, ) - self.reference_locked_cids.remove(cid) + self.reference_locked_cids.append(cid) + try: + cid_ref_abs_path = self._resolve_path("cid", cid) + pid_ref_abs_path = self._resolve_path("pid", pid) + # First delete the pid refs file immediately + self._delete_pid_refs_file(pid_ref_abs_path) + # Remove pid from cid reference file + self._delete_cid_refs_pid(cid_ref_abs_path, pid) + # Delete cid reference file and object only if the cid refs file is empty + if os.path.getsize(cid_ref_abs_path) == 0: + self._delete("cid", cid_ref_abs_path) + self._delete("objects", cid) + info_string = ( + "FileHashStore - delete_object: Successfully deleted references and" + + f" object associated with pid: {pid}" + ) + logging.info(info_string) + else: + info_string = ( + "FileHashStore - delete_object: Successfully deleted pid refs file but" + + f" not object with cid ({cid}), cid refs file not empty." + ) + logging.info(info_string) + return True + + finally: + # Release cid + with self.reference_lock: + debug_msg = ( + "FileHashStore - delete_object:" + + f" Removing cid: {cid} from reference_locked_cids." + ) + logging.debug(debug_msg) + self.reference_locked_cids.remove(cid) def delete_metadata(self, pid, format_id=None): logging.debug( @@ -1993,13 +1996,13 @@ def _resolve_path(self, entity, file): # Check for sharded path. 
if entity == "cid": # Note, we skip checking whether the file exists for refs - ref_file_abs_path = self._build_path(entity, file) - return ref_file_abs_path + cid_ref_file_abs_path = self._build_path(entity, file) + return cid_ref_file_abs_path elif entity == "pid": # Note, we skip checking whether the file exists for refs hash_id = self._computehash(file, self.algorithm) - ref_file_abs_path = self._build_path(entity, hash_id) - return ref_file_abs_path + pid_ref_file_abs_path = self._build_path(entity, hash_id) + return pid_ref_file_abs_path else: abspath = self._build_path(entity, file) if os.path.isfile(abspath): From 93b65260c660e6f0e37d0d9436a67c28e423b6a7 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 31 Jan 2024 14:27:59 -0800 Subject: [PATCH 146/420] Update HashStore interface docstring for 'delete_object', implement 'delete_object' for when 'id_type' is 'cid' and add new pytests --- src/hashstore/filehashstore.py | 15 +++++++++----- src/hashstore/hashstore.py | 15 +++++++------- tests/test_filehashstore_interface.py | 30 ++++++++++++++++++++++++++- 3 files changed, 47 insertions(+), 13 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 1cefbb24..a59bf56f 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -770,16 +770,21 @@ def retrieve_metadata(self, pid, format_id=None): ) return metadata_stream - def delete_object(self, pid, id_type=None): + def delete_object(self, ab_id, id_type=None): logging.debug( - "FileHashStore - delete_object: Request to delete object for pid: %s", pid + "FileHashStore - delete_object: Request to delete object for id: %s", ab_id ) - self._check_string(pid, "pid", "delete_object") + self._check_string(ab_id, "ab_id", "delete_object") if id_type is "cid": - # TODO: Delete object only - return True + cid_refs_abs_path = self._resolve_path("objects", ab_id) + if os.path.exists(cid_refs_abs_path): + self._delete("objects", ab_id) + return True + else: + 
return False else: # id_type is "pid" + pid = ab_id try: cid = self.find_object(pid) except FileNotFoundError as fnfe: diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index c5019825..5bfabde2 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -162,13 +162,14 @@ def retrieve_metadata(self, pid, format_id): raise NotImplementedError() @abstractmethod - def delete_object(self, pid): - """Delete an object permanently from disk using a persistent identifier (pid). - - The `delete_object` method removes the object associated with the provided `pid` from - disk, resulting in the permanent deletion of the object. - - :param str pid: Authority-based identifier. + def delete_object(self, ab_id, id_type): + """Delete an object and its related data permanently from disk using a given identifier + and 'id_type'. When 'id_type' is 'pid', HashStore will attempt to delete the object and + its associated references files. When 'id_type' is 'cid', HashStore will only attempt to + delete the object. If 'id_type' is not supplied, `delete_object` will assume it is 'pid'. + + :param str ab_id: Authority-based identifier. + :param str id_type: "pid" or "Cid :return: bool - `True` upon successful deletion. 
""" diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 40dc99f0..9405e62e 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -42,6 +42,21 @@ def test_store_object(pids, store): object_metadata = store.store_object(pid, path) assert object_metadata.cid == pids[pid][store.algorithm] assert store._count(entity) == 3 + assert store._count("pid") == 3 + assert store._count("cid") == 3 + + +def test_store_object_only_object(pids, store): + """Test store object stores an object only (no reference files will be created)""" + test_dir = "tests/testdata/" + entity = "objects" + for pid in pids.keys(): + path = Path(test_dir + pid.replace("/", "_")) + object_metadata = store.store_object(data=path) + assert object_metadata.cid == pids[pid][store.algorithm] + assert store._count(entity) == 3 + assert store._count("pid") == 0 + assert store._count("cid") == 0 def test_store_object_files_path(pids, store): @@ -882,7 +897,7 @@ def test_retrieve_metadata_format_id_empty_spaces(store): def test_delete_object(pids, store): - """Test delete_object successfully deletes objects from /objects.""" + """Test delete_object successfully deletes objects from /objects and all refs files.""" test_dir = "tests/testdata/" entity = "objects" format_id = "http://ns.dataone.org/service/types/v2.0" @@ -894,6 +909,8 @@ def test_delete_object(pids, store): _metadata_cid = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) assert store._count(entity) == 0 + assert store._count("pid") == 0 + assert store._count("cid") == 0 def test_delete_object_pid_refs_file(pids, store): @@ -942,6 +959,17 @@ def test_delete_object_cid_refs_file_with_pid_refs_remaining(pids, store): assert os.path.exists(cid_refs_file_path) +def test_delete_object_id_type_cid(pids, store): + """Test delete_object successfully deletes only object.""" + test_dir = "tests/testdata/" + entity = "objects" + for pid in 
pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid=None, data=path) + store.delete_object(object_metadata.cid, "cid") + assert store._count(entity) == 0 + + def test_delete_object_pid_empty(store): """Test delete_object raises error when empty pid supplied.""" pid = " " From 4480fba7e9355126e494bf914cd08e31d6928750 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 31 Jan 2024 14:41:01 -0800 Subject: [PATCH 147/420] Update README.md for updated 'delete_object' functionality and add TODO items --- README.md | 4 +++- src/hashstore/filehashstore.py | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b864c157..3f5681ec 100644 --- a/README.md +++ b/README.md @@ -112,7 +112,9 @@ tag_object(pid, cid) - If desired, this cid can then be used to locate the object on disk by following HashStore's store configuration. **How do I delete an object if I have the pid?** -- To delete an object, call the Public API method `delete_object` which will delete the object and its associated references and reference files where relevant. +- To delete an object and all its associated reference files, call the Public API method `delete_object` with `id_type` 'pid'. +- To delete only an object, call `delete_object` with `id_type` 'cid' which will remove the object if it it is not referenced by any pids. +- To delete an object and all its related data (reference files and system metadata), call the Public API method `delete_object` with `id_type` 'clear'. - Note, `delete_object` and `tag_object` calls are synchronized on their content identifier values so that the shared reference files are not unintentionally modified concurrently. An object that is in the process of being deleted should not be tagged, and vice versa. These calls have been implemented to occur sequentially to improve clarity in the event of an unexpected conflict or issue. 
diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index a59bf56f..e7952450 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -701,6 +701,7 @@ def store_metadata(self, pid, metadata, format_id=None): "FileHashStore - store_metadata: Attempting to store metadata for pid: %s", pid, ) + # TODO: Refactor the way we store metadata and add pytests to confirm metadata_cid = self._put_metadata(metadata, pid, checked_format_id) logging.info( @@ -841,6 +842,7 @@ def delete_object(self, ab_id, id_type=None): + f" not object with cid ({cid}), cid refs file not empty." ) logging.info(info_string) + # TODO:: Check 'id_type' for 'clear' & attempt to remove all metadata docs if so return True finally: @@ -1264,6 +1266,8 @@ def delete_tmp_file(): os.umask(oldmask) return tmp + # TODO: Clean up refs file methods, a lot of redundant code + def _write_cid_refs_file(self, path, pid): """Write the CID reference file in the supplied path to a file. A reference file contains every PID that references a CID, each on its own line. 
This method will From d26b1c32ad0647e35f9875c9a3e2516cf2d279a4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 5 Feb 2024 16:31:11 -0800 Subject: [PATCH 148/420] Update 'ObjectMetadata' class with new attribute 'pid' and revise affected code and docstrings --- src/hashstore/filehashstore.py | 6 ++++-- src/hashstore/hashstore.py | 15 +++++++++------ tests/test_hashstore.py | 4 +++- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index e7952450..2aa4cf48 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -948,7 +948,9 @@ def _store_and_validate_data( file_size_to_validate, ) - object_metadata = ObjectMetadata(object_cid, obj_file_size, hex_digest_dict) + object_metadata = ObjectMetadata( + pid, object_cid, obj_file_size, hex_digest_dict + ) logging.debug( "FileHashStore - put_object: Successfully put object for pid: %s", pid, @@ -985,7 +987,7 @@ def _store_data_only(self, data): ) = self._move_and_get_checksums(None, stream) object_metadata = ObjectMetadata( - object_ref_pid_location, obj_file_size, hex_digest_dict + None, object_ref_pid_location, obj_file_size, hex_digest_dict ) # The permanent address of the data stored is based on the data's checksum cid = hex_digest_dict.get(self.algorithm) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 5bfabde2..0236c453 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -255,12 +255,15 @@ def get_hashstore(module_name, class_name, properties=None): ) -class ObjectMetadata(namedtuple("ObjectMetadata", ["cid", "obj_size", "hex_digests"])): +class ObjectMetadata( + namedtuple("ObjectMetadata", ["pid", "cid", "obj_size", "hex_digests"]) +): """Represents metadata associated with an object. 
- The `ObjectMetadata` class represents metadata associated with an object, - including a content identifier (`cid`), the size of the object in bytes (`obj_size`), - and an optional list of hex digests (`hex_digests`) to assist with validating objects. + The `ObjectMetadata` class represents metadata associated with an object, including + a persistent or authority-based identifier (`pid`), a content identifier (`cid`), + the size of the object in bytes (`obj_size`), and an optional list of hex digests + (`hex_digests`) to assist with validating objects. :param str cid: A unique identifier for the object (Hash ID, hex digest). :param bytes obj_size: The size of the object in bytes. @@ -269,5 +272,5 @@ class ObjectMetadata(namedtuple("ObjectMetadata", ["cid", "obj_size", "hex_diges """ # Default value to prevent dangerous default value - def __new__(cls, cid, obj_size, hex_digests=None): - return super(ObjectMetadata, cls).__new__(cls, cid, obj_size, hex_digests) + def __new__(cls, pid, cid, obj_size, hex_digests=None): + return super(ObjectMetadata, cls).__new__(cls, pid, cid, obj_size, hex_digests) diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index a2d42398..eb79caa8 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -77,6 +77,7 @@ def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory) def test_objectmetadata(): """Test ObjectMetadata class returns correct values via dot notation.""" + pid = "hashstore" ab_id = "hashstoretest" obj_size = 1234 hex_digest_dict = { @@ -86,7 +87,8 @@ def test_objectmetadata(): "sha256": "sha256value", "sha512": "sha512value", } - object_metadata = ObjectMetadata(ab_id, obj_size, hex_digest_dict) + object_metadata = ObjectMetadata(pid, ab_id, obj_size, hex_digest_dict) + assert object_metadata.pid == pid assert object_metadata.cid == ab_id assert object_metadata.obj_size == obj_size assert object_metadata.hex_digests.get("md5") == hex_digest_dict["md5"] From 
371afd2a875019dcbefb1902f7fb358639ef1984 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 5 Feb 2024 16:41:54 -0800 Subject: [PATCH 149/420] Update HashStore interface docstrings --- src/hashstore/filehashstore.py | 2 +- src/hashstore/hashstore.py | 76 ++++++++++++++-------------------- 2 files changed, 33 insertions(+), 45 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 2aa4cf48..b9b6012e 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -842,7 +842,7 @@ def delete_object(self, ab_id, id_type=None): + f" not object with cid ({cid}), cid refs file not empty." ) logging.info(info_string) - # TODO:: Check 'id_type' for 'clear' & attempt to remove all metadata docs if so + # TODO: Check 'id_type' for 'clear' & attempt to remove all metadata docs if so return True finally: diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 0236c453..068f9039 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -25,13 +25,12 @@ def store_object( checksum_algorithm, expected_object_size, ): - """Atomic storage of objects to disk using a given stream. - - The `store_object` method ensures atomic storage of objects to disk. Upon successful - storage, it returns an ObjectMetadata object containing relevant file information, - such as the file's id (used to locate the object on disk), the file's size, and a hex digest - dictionary of algorithms and checksums. The method also tags the object, creating references - for discoverability. + """Atomic storage of objects to disk using a given stream. The `store_object` method + ensures atomic storage of objects to disk. Upon successful storage, it returns an + `ObjectMetadata` object containing relevant file information, such as the file's id + (used to locate the object on disk), the file's size, and a hex digest dictionary of + algorithms and checksums. 
The method also tags the object, creating references for + discoverability. `store_object` ensures that an object is stored only once by synchronizing multiple calls and rejecting attempts to store duplicate objects. If called without a pid, it stores the @@ -68,11 +67,9 @@ def store_object( @abstractmethod def tag_object(self, pid, cid): - """Create references to make objects discoverable in HashStore. - - The `tag_object` method enables operations such as retrieving, deleting, or calculating - a hex digest based on the provided pid argument. To perform these actions, it's crucial - to locate the object associated with the given pid. + """Creates references that allow objects stored in HashStore to be discoverable. Retrieving, + deleting or calculating a hex digest of an object is based on a pid argument; and to + proceed, we must be able to find the object associated with the pid. :param str pid: Authority-based or persistent identifier of the object. :param str cid: Content identifier of the object. @@ -85,10 +82,8 @@ def tag_object(self, pid, cid): def verify_object( self, object_metadata, checksum, checksum_algorithm, expected_file_size ): - """Confirm equality of content in an ObjectMetadata. - - The `verify_object` method verifies that the content in the provided `object_metadata` - matches the specified values. + """Confirm equality of content in an ObjectMetadata. The `verify_object` method verifies + that the content in the provided `object_metadata` matches the specified values. :param ObjectMetadata object_metadata: ObjectMetadata object. :param str checksum: Value of the checksum. @@ -102,7 +97,6 @@ def verify_object( @abstractmethod def find_object(self, pid): """Check if an object referenced by a pid exists and retrieve its content identifier. - The `find_object` method validates the existence of an object based on the provided pid and returns the associated content identifier. 
@@ -114,11 +108,12 @@ def find_object(self, pid): @abstractmethod def store_metadata(self, pid, metadata, format_id): - """Add or update metadata, such as `sysmeta`, to disk using the given path/stream. - - The `store_metadata` method uses a persistent identifier `pid` and a metadata `format_id` - to determine the permanent address of the metadata object. The permanent address is - calculated by obtaining the SHA-256 hex digest of the concatenation of `pid` & `format_id`. + """Add or update metadata, such as `sysmeta`, to disk using the given path/stream. The + `store_metadata` method uses a persistent identifier `pid` and a metadata `format_id` + to determine the permanent address of the metadata object. All metadata documents for a + given `pid` will be stored in a directory (under ../metadata) that is determined by + calculating the hash of the given pid, with the document name being the hash of the + metadata format (`format_id`). Upon successful storage of metadata, the method returns a string representing the file's permanent address. Metadata objects are stored in parallel to objects in the @@ -134,10 +129,9 @@ def store_metadata(self, pid, metadata, format_id): @abstractmethod def retrieve_object(self, pid): - """Retrieve an object from disk using a persistent identifier (pid). - - The `retrieve_object` method opens and returns a buffered object stream ready for reading - if the object associated with the provided `pid` exists on disk. + """Retrieve an object from disk using a persistent identifier (pid). The `retrieve_object` + method opens and returns a buffered object stream ready for reading if the object + associated with the provided `pid` exists on disk. :param str pid: Authority-based identifier. @@ -148,11 +142,8 @@ def retrieve_object(self, pid): @abstractmethod def retrieve_metadata(self, pid, format_id): """Retrieve the metadata object from disk using a persistent identifier (pid) - and metadata namespace (format_id). 
- - The `retrieve_metadata` method calculates the metadata object's permanent address - by hashing the concatenation of the given `pid` and `format_id`. If the object - exists, the method opens and returns a buffered metadata stream ready for reading. + and metadata namespace (format_id). If the metadata document exists, the method opens + and returns a buffered metadata stream ready for reading. :param str pid: Authority-based identifier. :param str format_id: Metadata format. @@ -163,10 +154,11 @@ def retrieve_metadata(self, pid, format_id): @abstractmethod def delete_object(self, ab_id, id_type): - """Delete an object and its related data permanently from disk using a given identifier - and 'id_type'. When 'id_type' is 'pid', HashStore will attempt to delete the object and - its associated references files. When 'id_type' is 'cid', HashStore will only attempt to - delete the object. If 'id_type' is not supplied, `delete_object` will assume it is 'pid'. + """Deletes an object and its related data permanently from HashStore using a given + persistent identifier. If the `id_type` is 'pid', the object associated with the pid will + be deleted if it is not referenced by any other pids, along with its reference files and + all metadata documents found in its respective metadata directory. If the `id_type` is + 'cid', only the object will be deleted if it is not referenced by other pids. :param str ab_id: Authority-based identifier. :param str id_type: "pid" or "Cid @@ -177,11 +169,9 @@ def delete_object(self, ab_id, id_type): @abstractmethod def delete_metadata(self, pid, format_id): - """Delete a metadata document permanently from disk using a persistent identifier (pid) - and metadata namespace (format_id). - - The `delete_metadata` method removes the metadata document associated with the provided - `pid` and `format_id` from disk, resulting in its permanent deletion. + """Deletes a metadata document (ex. 
`sysmeta`) permanently from HashStore using a given + persistent identifier and its respective metadata namespace. If a `format_id` is supplied, + only the metadata document associated with the `format_id` will be deleted. :param str pid: Authority-based identifier. :param str format_id: Metadata format. @@ -192,10 +182,8 @@ def delete_metadata(self, pid, format_id): @abstractmethod def get_hex_digest(self, pid, algorithm): - """Calculate the hex digest of an object in HashStore. - - The `get_hex_digest` method calculates the hex digest of an object that exists - in HashStore using a given persistent identifier and hash algorithm. + """Calculates the hex digest of an object that exists in HashStore using a given persistent + identifier and hash algorithm. :param str pid: Authority-based identifier. :param str algorithm: Algorithm of hex digest to generate. From 826ca4deb24c9154fe29047838a46052b9f4b0d6 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 5 Feb 2024 18:16:11 -0800 Subject: [PATCH 150/420] Refactor 'store_metadata' to store documents under a directory formed by sharding the hash of the 'pid', with metadata doc name being the hash of the 'format_id' and update related pytests --- src/hashstore/filehashstore.py | 9 +++++---- tests/test_filehashstore.py | 19 ++++++++++++++---- tests/test_filehashstore_interface.py | 29 ++++++++++++++++++++++++--- 3 files changed, 46 insertions(+), 11 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index b9b6012e..acab3bb1 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1472,9 +1472,10 @@ def _put_metadata(self, metadata, pid, format_id): metadata_tmp = self._mktmpmetadata(metadata_stream) # Get target and related paths (permanent location) - metadata_cid = self._computehash(pid + format_id) - rel_path = "/".join(self._shard(metadata_cid)) - full_path = self._get_store_path("metadata") / rel_path + metadata_directory = self._computehash(pid) + 
metadata_document_name = self._computehash(format_id) + rel_path = "/".join(self._shard(metadata_directory)) + full_path = self._get_store_path("metadata") / rel_path / metadata_document_name # Move metadata to target path if os.path.exists(metadata_tmp): @@ -1487,7 +1488,7 @@ def _put_metadata(self, metadata, pid, format_id): "FileHashStore - _put_metadata: Successfully put metadata for pid: %s", pid, ) - return metadata_cid + return full_path except Exception as err: exception_string = ( f"FileHashStore - _put_metadata: Unexpected {err=}, {type(err)=}" diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index c11726ef..14b6adf0 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -650,7 +650,15 @@ def test_put_metadata_cid(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename metadata_cid = store._put_metadata(syspath, pid, format_id) - assert metadata_cid == pids[pid]["metadata_cid"] + + # Manually calculate expected path + metadata_directory = store._computehash(pid) + metadata_document_name = store._computehash(format_id) + rel_path = "/".join(store._shard(metadata_directory)) + full_path = ( + store._get_store_path("metadata") / rel_path / metadata_document_name + ) + assert metadata_cid == full_path def test_mktmpmetadata(pids, store): @@ -817,7 +825,7 @@ def test_get_store_path_refs(store): assert path_metadata_string.endswith("/metacat/refs") -def test_exists_with_object_metadata_id(pids, store): +def test_exists_object_with_object_metadata_id(pids, store): """Test exists method with an absolute file path.""" test_dir = "tests/testdata/" entity = "objects" @@ -827,7 +835,7 @@ def test_exists_with_object_metadata_id(pids, store): assert store._exists(entity, object_metadata.cid) -def test_exists_with_sharded_path(pids, store): +def test_exists_object_with_sharded_path(pids, store): """Test exists method with an absolute file path.""" test_dir = "tests/testdata/" entity = 
"objects" @@ -839,7 +847,10 @@ def test_exists_with_sharded_path(pids, store): assert store._exists(entity, object_metadata_shard_path) -def test_exists_with_nonexistent_file(store): +# TODO: Test exists for metadata + + +def test_exists_object_with_nonexistent_file(store): """Test exists method with a nonexistent file.""" entity = "objects" non_existent_file = "tests/testdata/filedoesnotexist" diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 9405e62e..293fe264 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -654,17 +654,32 @@ def test_store_metadata(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename metadata_cid = store.store_metadata(pid, syspath, format_id) - assert metadata_cid == pids[pid]["metadata_cid"] + # Manually calculate expected path + metadata_directory = store._computehash(pid) + metadata_document_name = store._computehash(format_id) + rel_path = "/".join(store._shard(metadata_directory)) + full_path = ( + store._get_store_path("metadata") / rel_path / metadata_document_name + ) + assert metadata_cid == full_path def test_store_metadata_default_format_id(pids, store): """Test store metadata returns expected id when storing with default format_id.""" test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename metadata_cid = store.store_metadata(pid, syspath) - assert metadata_cid == pids[pid]["metadata_cid"] + # Manually calculate expected path + metadata_directory = store._computehash(pid) + metadata_document_name = store._computehash(format_id) + rel_path = "/".join(store._shard(metadata_directory)) + full_path = ( + store._get_store_path("metadata") / rel_path / metadata_document_name + ) + assert metadata_cid == full_path def test_store_metadata_files_path(pids, store): @@ -676,8 
+691,16 @@ def test_store_metadata_files_path(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename metadata_cid = store.store_metadata(pid, syspath, format_id) + # TODO: Ensure exists works as expected for metadata assert store._exists(entity, metadata_cid) - assert metadata_cid == pids[pid]["metadata_cid"] + # Manually calculate expected path + metadata_directory = store._computehash(pid) + metadata_document_name = store._computehash(format_id) + rel_path = "/".join(store._shard(metadata_directory)) + full_path = ( + store._get_store_path("metadata") / rel_path / metadata_document_name + ) + assert metadata_cid == full_path assert store._count(entity) == 3 From 07cb891d14357cfe98725aad1edba51c31355595 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 09:49:22 -0800 Subject: [PATCH 151/420] Update HashStore client module and revise pytests --- src/hashstore/hashstoreclient.py | 2 +- tests/test_filehashstore.py | 11 +++++++++- tests/test_filehashstore_interface.py | 29 +++++---------------------- tests/test_hashstore_client.py | 17 ++++++++++++++-- 4 files changed, 31 insertions(+), 28 deletions(-) diff --git a/src/hashstore/hashstoreclient.py b/src/hashstore/hashstoreclient.py index d786682f..c4e26474 100644 --- a/src/hashstore/hashstoreclient.py +++ b/src/hashstore/hashstoreclient.py @@ -850,7 +850,7 @@ def main(): raise ValueError("'-path' option is required") # Store metadata to HashStore metadata_cid = hashstore_c.hashstore.store_metadata(pid, path, formatid) - print(f"Metadata ID: {metadata_cid}") + print(f"Metadata Path: {metadata_cid}") elif getattr(args, "client_retrieveobject"): if pid is None: diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 14b6adf0..05afb37d 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -847,7 +847,16 @@ def test_exists_object_with_sharded_path(pids, store): assert store._exists(entity, object_metadata_shard_path) -# TODO: Test 
exists for metadata +def test_exists_metadata_files_path(pids, store): + """Test exists works as expected for metadata.""" + test_dir = "tests/testdata/" + entity = "metadata" + format_id = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + metadata_cid = store.store_metadata(pid, syspath, format_id) + assert store._exists(entity, metadata_cid) def test_exists_object_with_nonexistent_file(store): diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 293fe264..0f1df1cf 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -649,6 +649,7 @@ def test_find_object_pid_empty(store): def test_store_metadata(pids, store): """Test store metadata.""" test_dir = "tests/testdata/" + entity = "metadata" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" @@ -662,6 +663,7 @@ def test_store_metadata(pids, store): store._get_store_path("metadata") / rel_path / metadata_document_name ) assert metadata_cid == full_path + assert store._count(entity) == 3 def test_store_metadata_default_format_id(pids, store): @@ -682,28 +684,6 @@ def test_store_metadata_default_format_id(pids, store): assert metadata_cid == full_path -def test_store_metadata_files_path(pids, store): - """Test store metadata with path.""" - test_dir = "tests/testdata/" - entity = "metadata" - format_id = "http://ns.dataone.org/service/types/v2.0" - for pid in pids.keys(): - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename - metadata_cid = store.store_metadata(pid, syspath, format_id) - # TODO: Ensure exists works as expected for metadata - assert store._exists(entity, metadata_cid) - # Manually calculate expected path - metadata_directory = store._computehash(pid) - metadata_document_name = store._computehash(format_id) - rel_path = 
"/".join(store._shard(metadata_directory)) - full_path = ( - store._get_store_path("metadata") / rel_path / metadata_document_name - ) - assert metadata_cid == full_path - assert store._count(entity) == 3 - - def test_store_metadata_files_string(pids, store): """Test store metadata with string.""" test_dir = "tests/testdata/" @@ -782,7 +762,7 @@ def test_store_metadata_metadata_none(store): store.store_metadata(pid, syspath_string, format_id) -def test_store_metadata_metadata_cid(pids, store): +def test_store_metadata_metadata_path(pids, store): """Test store metadata returns expected metadata_cid.""" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" @@ -792,7 +772,8 @@ def test_store_metadata_metadata_cid(pids, store): syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) metadata_cid = store.store_metadata(pid, syspath, format_id) - assert metadata_cid == pids[pid]["metadata_cid"] + metadata_path = store._resolve_path("metadata", metadata_cid) + assert metadata_cid == metadata_path def test_store_metadata_thread_lock(store): diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 3511bbe8..be4e5f7a 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -4,6 +4,8 @@ from pathlib import Path from hashstore import hashstoreclient +# pylint: disable=W0212 + def test_create_hashstore(tmp_path): """Test creating a HashStore through the client.""" @@ -135,11 +137,12 @@ def test_store_object(store, pids): assert store._exists("objects", pids[pid][store.algorithm]) -def test_store_metadata(store, pids): +def test_store_metadata(capsys, store, pids): """Test storing metadata to HashStore through client.""" client_directory = os.getcwd() + "/src/hashstore" test_dir = "tests/testdata/" namespace = "http://ns.dataone.org/service/types/v2.0" + entity = "metadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / 
filename @@ -164,7 +167,17 @@ def test_store_metadata(store, pids): sys.argv = chs_args hashstoreclient.main() - assert store._exists("metadata", pids[pid]["metadata_cid"]) + metadata_directory = store._computehash(pid) + metadata_document_name = store._computehash(namespace) + rel_path = "/".join(store._shard(metadata_directory)) + full_path = ( + store._get_store_path("metadata") / rel_path / metadata_document_name + ) + capsystext = capsys.readouterr().out + expected_output = f"Metadata Path: {full_path}\n" + assert capsystext == expected_output + + assert store._count(entity) == 3 def test_retrieve_objects(capsys, pids, store): From 89fd718482b08fd4f5539c4705e6acaa89344ab2 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 10:12:10 -0800 Subject: [PATCH 152/420] Refactor 'retrieve_metadata' & 'delete_metadata' and add TODO item --- src/hashstore/filehashstore.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index acab3bb1..1cfebe39 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -755,10 +755,17 @@ def retrieve_metadata(self, pid, format_id=None): checked_format_id = self._check_arg_format_id(format_id, "retrieve_metadata") entity = "metadata" - metadata_cid = self._computehash(pid + checked_format_id) - metadata_exists = self._exists(entity, metadata_cid) + metadata_directory = self._computehash(pid) + if format_id is None: + metadata_document_name = self._computehash(self.sysmeta_ns) + else: + metadata_document_name = self._computehash(checked_format_id) + rel_path = "/".join(self._shard(metadata_directory)) + full_path_without_directory = rel_path + "/" + metadata_document_name + metadata_exists = self._exists(entity, full_path_without_directory) + if metadata_exists: - metadata_stream = self._open(entity, metadata_cid) + metadata_stream = self._open(entity, full_path_without_directory) else: 
exception_string = ( f"FileHashStore - retrieve_metadata: No metadata found for pid: {pid}" @@ -862,10 +869,19 @@ def delete_metadata(self, pid, format_id=None): ) self._check_string(pid, "pid", "delete_metadata") checked_format_id = self._check_arg_format_id(format_id, "delete_metadata") - + # TODO: Delete all metadata related to the given pid when format_id is None entity = "metadata" - metadata_cid = self._computehash(pid + checked_format_id) - self._delete(entity, metadata_cid) + metadata_directory = self._computehash(pid) + if format_id is None: + metadata_document_name = self._computehash(self.sysmeta_ns) + else: + metadata_document_name = self._computehash(checked_format_id) + rel_path = "/".join(self._shard(metadata_directory)) + full_path_without_directory = rel_path + "/" + metadata_document_name + metadata_exists = self._exists(entity, full_path_without_directory) + + if metadata_exists: + self._delete(entity, full_path_without_directory) logging.info( "FileHashStore - delete_metadata: Successfully deleted metadata for pid: %s", @@ -1982,7 +1998,8 @@ def _build_path(self, entity, hash_id, extension=""): def _resolve_path(self, entity, file): """Attempt to determine the absolute path of a file ID or path through - successive checking of candidate paths. + successive checking of candidate paths - first by checking whether the 'file' + exists, followed by checking the entity type with respect to the file. :param str entity: Desired entity type ("objects", "metadata", "cid", "pid"), where "cid" & "pid" represents resolving the path to the refs files. 
From bc0560ca1c558f65f589edb2cb3990f2c2b5fea2 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 11:05:10 -0800 Subject: [PATCH 153/420] Refactor 'delete_metadata' to delete all metadata documents for a given pid if no 'format_id' is supplied, and add new pytests --- src/hashstore/filehashstore.py | 45 ++++++++++++++++++--------- tests/test_filehashstore_interface.py | 45 +++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 14 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 1cfebe39..14a45f18 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -869,25 +869,42 @@ def delete_metadata(self, pid, format_id=None): ) self._check_string(pid, "pid", "delete_metadata") checked_format_id = self._check_arg_format_id(format_id, "delete_metadata") - # TODO: Delete all metadata related to the given pid when format_id is None + # Get the metadata directory path for the given pid entity = "metadata" metadata_directory = self._computehash(pid) + rel_path = "/".join(self._shard(metadata_directory)) + metadata_rel_path = self._get_store_path("metadata") / rel_path if format_id is None: - metadata_document_name = self._computehash(self.sysmeta_ns) + # Delete all metadata files + metadata_files = os.listdir(metadata_rel_path) + metadata_file_paths = [ + metadata_rel_path / file + for file in metadata_files + if os.path.isfile(metadata_rel_path / file) + ] + for file_path in metadata_file_paths: + os.remove(file_path) + + info_string = ( + "FileHashStore - delete_metadata: Successfully deleted all metadata for pid: %s", + pid, + ) + logging.info(info_string) + return True else: + # Delete a specific metadata file metadata_document_name = self._computehash(checked_format_id) - rel_path = "/".join(self._shard(metadata_directory)) - full_path_without_directory = rel_path + "/" + metadata_document_name - metadata_exists = self._exists(entity, full_path_without_directory) - - if 
metadata_exists: - self._delete(entity, full_path_without_directory) - - logging.info( - "FileHashStore - delete_metadata: Successfully deleted metadata for pid: %s", - pid, - ) - return True + full_path_without_directory = rel_path + "/" + metadata_document_name + metadata_exists = self._exists(entity, full_path_without_directory) + if metadata_exists: + self._delete(entity, full_path_without_directory) + + info_string = ( + "FileHashStore - delete_metadata: Successfully deleted metadata for pid:" + + f" {pid} for format_id: {format_id}" + ) + logging.info(info_string) + return True def get_hex_digest(self, pid, algorithm): logging.debug( diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 0f1df1cf..a71a8289 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -666,6 +666,33 @@ def test_store_metadata(pids, store): assert store._count(entity) == 3 +def test_store_metadata_one_pid_multiple_metadata_documents(store): + """Test store metadata for a pid with multiple metadata documents.""" + test_dir = "tests/testdata/" + entity = "metadata" + pid = "jtao.1700.1" + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + metadata_directory = store._computehash(pid) + rel_path = "/".join(store._shard(metadata_directory)) + format_id = "http://ns.dataone.org/service/types/v2.0" + format_id3 = "http://ns.dataone.org/service/types/v3.0" + format_id4 = "http://ns.dataone.org/service/types/v4.0" + metadata_cid = store.store_metadata(pid, syspath, format_id) + metadata_cid3 = store.store_metadata(pid, syspath, format_id3) + metadata_cid4 = store.store_metadata(pid, syspath, format_id4) + metadata_document_name = store._computehash(format_id) + metadata_document_name3 = store._computehash(format_id3) + metadata_document_name4 = store._computehash(format_id4) + full_path = store._get_store_path("metadata") / rel_path / metadata_document_name + full_path3 = 
store._get_store_path("metadata") / rel_path / metadata_document_name3 + full_path4 = store._get_store_path("metadata") / rel_path / metadata_document_name4 + assert metadata_cid == full_path + assert metadata_cid3 == full_path3 + assert metadata_cid4 == full_path4 + assert store._count(entity) == 3 + + def test_store_metadata_default_format_id(pids, store): """Test store metadata returns expected id when storing with default format_id.""" test_dir = "tests/testdata/" @@ -1004,6 +1031,24 @@ def test_delete_metadata(pids, store): assert store._count(entity) == 0 +def test_delete_metadata_one_pid_multiple_metadata_documents(store): + """Test delete_metadata for a pid with multiple metadata documents deletes + all metadata files as expected.""" + test_dir = "tests/testdata/" + entity = "metadata" + pid = "jtao.1700.1" + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + format_id = "http://ns.dataone.org/service/types/v2.0" + format_id3 = "http://ns.dataone.org/service/types/v3.0" + format_id4 = "http://ns.dataone.org/service/types/v4.0" + _metadata_cid = store.store_metadata(pid, syspath, format_id) + _metadata_cid3 = store.store_metadata(pid, syspath, format_id3) + _metadata_cid4 = store.store_metadata(pid, syspath, format_id4) + store.delete_metadata(pid) + assert store._count(entity) == 0 + + def test_delete_metadata_does_not_exist(pids, store): """Test delete_metadata does not throw exception when called to delete metadata that does not exist.""" From 278314da8bb4b1039227571572741fa8d8afb698 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 12:22:52 -0800 Subject: [PATCH 154/420] Update HashStore interface 'delete_object' and 'delete_metadata' docstrings, remove boolean return values and update pytests --- src/hashstore/filehashstore.py | 16 +++++++--------- src/hashstore/hashstore.py | 4 ---- tests/test_filehashstore_interface.py | 25 ++++++++++++++++++++----- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git 
a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 14a45f18..a50655aa 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -784,12 +784,11 @@ def delete_object(self, ab_id, id_type=None): ) self._check_string(ab_id, "ab_id", "delete_object") if id_type is "cid": - cid_refs_abs_path = self._resolve_path("objects", ab_id) - if os.path.exists(cid_refs_abs_path): + cid_refs_abs_path = self._resolve_path("cid", ab_id) + # If the refs file still exists, do not delete the object + if not os.path.exists(cid_refs_abs_path): self._delete("objects", ab_id) - return True - else: - return False + return else: # id_type is "pid" pid = ab_id @@ -849,8 +848,7 @@ def delete_object(self, ab_id, id_type=None): + f" not object with cid ({cid}), cid refs file not empty." ) logging.info(info_string) - # TODO: Check 'id_type' for 'clear' & attempt to remove all metadata docs if so - return True + return finally: # Release cid @@ -890,7 +888,7 @@ def delete_metadata(self, pid, format_id=None): pid, ) logging.info(info_string) - return True + return else: # Delete a specific metadata file metadata_document_name = self._computehash(checked_format_id) @@ -904,7 +902,7 @@ def delete_metadata(self, pid, format_id=None): + f" {pid} for format_id: {format_id}" ) logging.info(info_string) - return True + return def get_hex_digest(self, pid, algorithm): logging.debug( diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 068f9039..1e6e50c5 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -162,8 +162,6 @@ def delete_object(self, ab_id, id_type): :param str ab_id: Authority-based identifier. :param str id_type: "pid" or "Cid - - :return: bool - `True` upon successful deletion. """ raise NotImplementedError() @@ -175,8 +173,6 @@ def delete_metadata(self, pid, format_id): :param str pid: Authority-based identifier. :param str format_id: Metadata format. 
- - :return: bool - `True` upon successful deletion. """ raise NotImplementedError() diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index a71a8289..ef2944bf 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -990,7 +990,7 @@ def test_delete_object_cid_refs_file_with_pid_refs_remaining(pids, store): assert os.path.exists(cid_refs_file_path) -def test_delete_object_id_type_cid(pids, store): +def test_delete_object_idtype_cid(pids, store): """Test delete_object successfully deletes only object.""" test_dir = "tests/testdata/" entity = "objects" @@ -1001,6 +1001,23 @@ def test_delete_object_id_type_cid(pids, store): assert store._count(entity) == 0 +def test_delete_object_idtype_cid_refs_file_exists(pids, store): + """Test delete_object does not delete object if a cid refs file still exists.""" + test_dir = "tests/testdata/" + entity = "objects" + format_id = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + object_metadata = store.store_object(pid, path) + _metadata_cid = store.store_metadata(pid, syspath, format_id) + store.delete_object(object_metadata.cid, "cid") + assert store._count(entity) == 3 + assert store._count("pid") == 3 + assert store._count("cid") == 3 + + def test_delete_object_pid_empty(store): """Test delete_object raises error when empty pid supplied.""" pid = " " @@ -1026,8 +1043,7 @@ def test_delete_metadata(pids, store): syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) - is_deleted = store.delete_metadata(pid, format_id) - assert is_deleted + store.delete_metadata(pid, format_id) assert store._count(entity) == 0 @@ -1054,8 +1070,7 @@ def test_delete_metadata_does_not_exist(pids, store): metadata that does not exist.""" 
format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): - is_deleted = store.delete_metadata(pid, format_id) - assert is_deleted + store.delete_metadata(pid, format_id) def test_delete_metadata_default_format_id(store, pids): From 7755a62cb2f829378a5e61a4f2cc0ccd7e8c3227 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 12:37:09 -0800 Subject: [PATCH 155/420] Add new static method '_get_file_paths' and refactor 'delete_metadata'; and clean up code --- src/hashstore/filehashstore.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index a50655aa..cfd53235 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -701,7 +701,6 @@ def store_metadata(self, pid, metadata, format_id=None): "FileHashStore - store_metadata: Attempting to store metadata for pid: %s", pid, ) - # TODO: Refactor the way we store metadata and add pytests to confirm metadata_cid = self._put_metadata(metadata, pid, checked_format_id) logging.info( @@ -874,14 +873,9 @@ def delete_metadata(self, pid, format_id=None): metadata_rel_path = self._get_store_path("metadata") / rel_path if format_id is None: # Delete all metadata files - metadata_files = os.listdir(metadata_rel_path) - metadata_file_paths = [ - metadata_rel_path / file - for file in metadata_files - if os.path.isfile(metadata_rel_path / file) - ] + metadata_file_paths = self._get_file_paths(metadata_rel_path) for file_path in metadata_file_paths: - os.remove(file_path) + self._delete(entity, file_path) info_string = ( "FileHashStore - delete_metadata: Successfully deleted all metadata for pid: %s", @@ -2075,6 +2069,30 @@ def _get_store_path(self, entity): f"entity: {entity} does not exist. Do you mean 'objects', 'metadata' or 'refs'?" 
) + @staticmethod + def _get_file_paths(directory): + """Get the file paths of a given directory + + :param mixed directory: String or path to directory. + + :raises FileNotFoundError: If the directory doesn't exist + + :return: file_paths - File paths of the given directory + :rtype: List + """ + if os.path.exists(directory): + files = os.listdir(directory) + file_paths = [ + directory / file for file in files if os.path.isfile(directory / file) + ] + return file_paths + else: + err_msg = ( + "FileHashStore - _get_file_paths: Directory does not exist: %s", + directory, + ) + raise FileNotFoundError(err_msg) + def _count(self, entity): """Return the count of the number of files in the `root` directory. From cec0c5e8579705bdd4fac018a12a41204c8bfb11 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 13:55:29 -0800 Subject: [PATCH 156/420] Refactor 'delete_object' to coordinate deletion of files to improve atomicity of the process, and to also delete all metadata documents when 'id_type' is pid --- src/hashstore/filehashstore.py | 101 +++++++++++++++++++++++++-------- src/hashstore/hashstore.py | 4 +- 2 files changed, 79 insertions(+), 26 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index cfd53235..94b10820 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -791,6 +791,15 @@ def delete_object(self, ab_id, id_type=None): else: # id_type is "pid" pid = ab_id + objects_to_delete = [] + rel_path = "/".join(self._shard(self._computehash(pid))) + metadata_rel_path = self._get_store_path("metadata") / rel_path + metadata_file_paths = self._get_file_paths(metadata_rel_path) + # Rename paths by appending _delete to the file name + if metadata_file_paths is not None: + for path in metadata_file_paths: + objects_to_delete.append(self._rename_path_for_deletion(path)) + try: cid = self.find_object(pid) except FileNotFoundError as fnfe: @@ -799,19 +808,46 @@ def delete_object(self, ab_id, 
id_type=None): return if "cid refs file not found" in fnfe: # Delete pid refs file - self._delete("pid", pid) + objects_to_delete.append( + self._rename_path_for_deletion(self._resolve_path("pid", pid)) + ) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) return if "object referenced does not exist" in fnfe: # Delete pid refs file + pid_ref_abs_path = self._resolve_path("pid", pid) + # Add pid refs file to delete + objects_to_delete.append( + self._rename_path_for_deletion(pid_ref_abs_path) + ) + # Remove pid from cid refs file + # Retrieve the cid from the pid refs file + with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: + pid_refs_cid = pid_ref_file.read() + cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) + # Remove if the pid refs is found + if self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path): + self._delete_cid_refs_pid(cid_ref_abs_path, pid) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) self._delete("pid", pid) return except ValueError as ve: if "is missing from cid refs file" in ve: # Delete pid refs file - self._delete("pid", pid) + pid_ref_abs_path = self._resolve_path("pid", pid) + objects_to_delete.append( + self._rename_path_for_deletion(pid_ref_abs_path) + ) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) return - # Proceed with next steps - cid has been retrieved without any errors + # Proceed with next steps - cid has been retrieved without any issues while cid in self.reference_locked_cids: logging.debug( "FileHashStore - delete_object: (cid) %s is currently locked. 
Waiting", @@ -829,24 +865,29 @@ def delete_object(self, ab_id, id_type=None): cid_ref_abs_path = self._resolve_path("cid", cid) pid_ref_abs_path = self._resolve_path("pid", pid) # First delete the pid refs file immediately - self._delete_pid_refs_file(pid_ref_abs_path) + objects_to_delete.append( + self._rename_path_for_deletion(pid_ref_abs_path) + ) # Remove pid from cid reference file self._delete_cid_refs_pid(cid_ref_abs_path, pid) # Delete cid reference file and object only if the cid refs file is empty if os.path.getsize(cid_ref_abs_path) == 0: - self._delete("cid", cid_ref_abs_path) - self._delete("objects", cid) - info_string = ( - "FileHashStore - delete_object: Successfully deleted references and" - + f" object associated with pid: {pid}" + objects_to_delete.append( + self._rename_path_for_deletion(cid_ref_abs_path) ) - logging.info(info_string) - else: - info_string = ( - "FileHashStore - delete_object: Successfully deleted pid refs file but" - + f" not object with cid ({cid}), cid refs file not empty." 
+ obj_real_path = self._resolve_path("objects", cid) + objects_to_delete.append( + self._rename_path_for_deletion(obj_real_path) ) - logging.info(info_string) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) + + info_string = ( + "FileHashStore - delete_object: Successfully deleted references, metadata and" + + f" object associated with pid: {pid}" + ) + logging.info(info_string) return finally: @@ -874,8 +915,9 @@ def delete_metadata(self, pid, format_id=None): if format_id is None: # Delete all metadata files metadata_file_paths = self._get_file_paths(metadata_rel_path) - for file_path in metadata_file_paths: - self._delete(entity, file_path) + if metadata_file_paths is not None: + for file_path in metadata_file_paths: + self._delete(entity, file_path) info_string = ( "FileHashStore - delete_metadata: Successfully deleted all metadata for pid: %s", @@ -1941,6 +1983,21 @@ def _delete(self, entity, file): logging.error(exception_string) raise err + @staticmethod + def _rename_path_for_deletion(path): + """Move and rename a given path by appending '_delete' to the file name + + :param Path path: Path to file to rename + + :return: Path to the renamed file + :rtype: str + """ + if isinstance(path, str): + path = Path(path) + delete_path = path.with_name(path.stem + "_delete" + path.suffix) + shutil.move(path, delete_path) + return delete_path + def _remove_empty(self, subpath): """Successively remove all empty folders starting with `subpath` and proceeding "up" through directory tree until reaching the `root` @@ -2071,13 +2128,13 @@ def _get_store_path(self, entity): @staticmethod def _get_file_paths(directory): - """Get the file paths of a given directory + """Get the file paths of a given directory if it exists :param mixed directory: String or path to directory. 
:raises FileNotFoundError: If the directory doesn't exist - :return: file_paths - File paths of the given directory + :return: file_paths - File paths of the given directory or None if directory doesn't exist :rtype: List """ if os.path.exists(directory): @@ -2087,11 +2144,7 @@ def _get_file_paths(directory): ] return file_paths else: - err_msg = ( - "FileHashStore - _get_file_paths: Directory does not exist: %s", - directory, - ) - raise FileNotFoundError(err_msg) + return None def _count(self, entity): """Return the count of the number of files in the `root` directory. diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 1e6e50c5..b94762a3 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -168,8 +168,8 @@ def delete_object(self, ab_id, id_type): @abstractmethod def delete_metadata(self, pid, format_id): """Deletes a metadata document (ex. `sysmeta`) permanently from HashStore using a given - persistent identifier and its respective metadata namespace. If a `format_id` is supplied, - only the metadata document associated with the `format_id` will be deleted. + persistent identifier (`pid`) and format_id (metadata namespace). If a `format_id` is + not supplied, all metadata documents associated with the given `pid` will be deleted. :param str pid: Authority-based identifier. :param str format_id: Metadata format. 
From caa522bba3189aa22e61634cb20317800892b868 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 14:05:00 -0800 Subject: [PATCH 157/420] Refactor 'delete_metadata' to be more atomic --- src/hashstore/filehashstore.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 94b10820..aa6ece13 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -914,10 +914,13 @@ def delete_metadata(self, pid, format_id=None): metadata_rel_path = self._get_store_path("metadata") / rel_path if format_id is None: # Delete all metadata files + objects_to_delete = [] metadata_file_paths = self._get_file_paths(metadata_rel_path) if metadata_file_paths is not None: - for file_path in metadata_file_paths: - self._delete(entity, file_path) + for path in metadata_file_paths: + objects_to_delete.append(self._rename_path_for_deletion(path)) + for obj in objects_to_delete: + os.remove(obj) info_string = ( "FileHashStore - delete_metadata: Successfully deleted all metadata for pid: %s", From 4707283669871fcf35d9dda49552abffd14b8bbf Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 14:22:11 -0800 Subject: [PATCH 158/420] Merge methods '_write_cid_refs_file' and '_write_pid_refs_file' into one method '_write_refs_file' and update pytests --- src/hashstore/filehashstore.py | 74 ++++++++------------------ tests/test_filehashstore_references.py | 42 +++++++-------- 2 files changed, 44 insertions(+), 72 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index aa6ece13..ec0767d2 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -578,7 +578,7 @@ def tag_object(self, pid, cid): raise FileExistsError(exception_string) elif os.path.exists(cid_ref_abs_path): # Create the pid refs file - pid_tmp_file_path = self._write_pid_refs_file(tmp_root_path, cid) + pid_tmp_file_path = 
self._write_refs_file(tmp_root_path, cid, "pid") self._create_path(os.path.dirname(pid_ref_abs_path)) shutil.move(pid_tmp_file_path, pid_ref_abs_path) # Update cid ref files as it already exists @@ -594,8 +594,8 @@ def tag_object(self, pid, cid): else: # All ref files begin as tmp files and get moved sequentially at once # Get tmp files with the expected cid and pid refs content - pid_tmp_file_path = self._write_pid_refs_file(tmp_root_path, cid) - cid_tmp_file_path = self._write_cid_refs_file(tmp_root_path, pid) + pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") + cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' self._create_path(os.path.dirname(pid_ref_abs_path)) self._create_path(os.path.dirname(cid_ref_abs_path)) @@ -1340,35 +1340,37 @@ def delete_tmp_file(): # TODO: Clean up refs file methods, a lot of redundant code - def _write_cid_refs_file(self, path, pid): - """Write the CID reference file in the supplied path to a file. A reference file - contains every PID that references a CID, each on its own line. This method will - only write into an empty file and will not overwrite an existing one. + def _write_refs_file(self, path, ref_id, ref_type): + """Write a reference file in the supplied path into a temporary file. + All `pid` or `cid` reference files begin with a single identifier, with the + primary difference being that a cid reference file can contain multiple lines + of `pid`s that reference the `cid`. - :param str path: Path of the file to be written into. - :param str pid: Authority-based or persistent identifier of the object. 
+ :param str path: Directory to write the temporary file + :param str ref_id: Authority-based, persistent or content identifier - :return: cid_tmp_file_path - Path to the cid tmp file + :return: tmp_file_path - Path to the tmp refs file :rtype: string """ logging.debug( - "FileHashStore - write_cid_refs_file: Writing pid (%s) into file: %s", - pid, + "FileHashStore - write_cid_refs_file: Writing id (%s) into file: %s", + ref_id, path, ) try: - with self._mktmpfile(path) as cid_tmp_file: - cid_tmp_file_path = cid_tmp_file.name - with open(cid_tmp_file_path, "w", encoding="utf8") as tmp_cid_ref_file: - tmp_cid_ref_file.write(pid + "\n") - # Ensure that file is immediately written to and not held in memory - # tmp_cid_ref_file.flush() - return cid_tmp_file_path + with self._mktmpfile(path) as tmp_file: + tmp_file_path = tmp_file.name + with open(tmp_file_path, "w", encoding="utf8") as tmp_cid_ref_file: + if ref_type is "cid": + tmp_cid_ref_file.write(ref_id + "\n") + if ref_type is "pid": + tmp_cid_ref_file.write(ref_id) + return tmp_file_path except Exception as err: exception_string = ( - "FileHashStore - write_cid_refs_file: failed to write cid refs file for pid:" - + f" {pid} into path: {path}. Unexpected {err=}, {type(err)=}" + "FileHashStore - _write_refs_file: failed to write cid refs file for pid:" + + f" {ref_id} into path: {path}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) raise err @@ -1464,36 +1466,6 @@ def _delete_cid_refs_pid(cid_ref_abs_path, pid): logging.error(exception_string) raise err - def _write_pid_refs_file(self, path, cid): - """Generate a tmp pid refs file into the given path for the given CID (content - identifier). A reference file for a PID contains the CID that it references. - - :param str path: Path of the file to be written into. - :param str cid: Content identifier. 
- - :return: pid_tmp_file_path - :rtype: string - """ - logging.debug( - "FileHashStore - _write_pid_refs_file: Writing cid (%s) into file: %s", - cid, - path, - ) - try: - with self._mktmpfile(path) as pid_tmp_file: - pid_tmp_file_path = pid_tmp_file.name - with open(pid_tmp_file_path, "w", encoding="utf8") as pid_ref_file: - pid_ref_file.write(cid) - return pid_tmp_file_path - - except Exception as err: - exception_string = ( - f"FileHashStore - _write_pid_refs_file: failed to write cid ({cid})" - + f" into pid refs file: {path}. Unexpected {err=}, {type(err)=}" - ) - logging.error(exception_string) - raise err - def _delete_pid_refs_file(self, pid_ref_abs_path): """Delete a PID reference file. diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 92f5ae59..08d0c79a 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -247,18 +247,18 @@ def test_verify_object_exception_incorrect_checksum_algo(pids, store): store.verify_object(object_metadata, checksum, "md2", expected_file_size) -def test_write_cid_refs_file(store): +def test_write_refs_file_cid(store): """Test that write_cid_reference writes a reference file.""" tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, "test_pid") + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, "test_pid", "cid") assert os.path.exists(tmp_cid_refs_file) -def test_write_cid_refs_file_content(pids, store): +def test_write_refs_file_cid_content(pids, store): """Test that write_cid_ref_file writes the expected content.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") with open(tmp_cid_refs_file, "r", encoding="utf8") as f: cid_ref_file_pid = f.read() @@ -269,7 +269,7 @@ def 
test_update_cid_refs_content(pids, store): """Test that update_cid_ref updates the ref file as expected.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") pid_other = "dou.test.1" store._update_cid_refs(tmp_cid_refs_file, pid_other) @@ -283,7 +283,7 @@ def test_update_cid_refs_content_multiple(pids, store): """Test that update_cid_refs adds multiple references successfully.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") cid_reference_list = [pid] for i in range(0, 5): @@ -305,7 +305,7 @@ def test_update_cid_refs_content_pid_exists(pids, store): and proceeds to complete the tagging process (verify_object)""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") # Exception should not be thrown store._update_cid_refs(tmp_cid_refs_file, pid) @@ -323,7 +323,7 @@ def test_delete_cid_refs_pid(pids, store): """Test that delete_cid_refs_pid deletes the given pid from the ref file.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") pid_other = "dou.test.1" store._update_cid_refs(tmp_cid_refs_file, pid_other) @@ -340,7 +340,7 @@ def test_delete_cid_refs_pid_file(pids, store): """Test that delete_cid_refs_pid leaves a file empty when removing the last pid.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) + 
tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") # First remove the pid store._delete_cid_refs_pid(tmp_cid_refs_file, pid) @@ -348,21 +348,21 @@ def test_delete_cid_refs_pid_file(pids, store): assert os.path.getsize(tmp_cid_refs_file) == 0 -def test_write_pid_refs_file(pids, store): +def test_write_refs_file_pid(pids, store): """Test that write_pid_refs_file writes a reference file.""" for pid in pids.keys(): cid = pids[pid]["sha256"] tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) + tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") assert os.path.exists(tmp_pid_refs_file) -def test_write_pid_refs_file_content(pids, store): +def test_write_refs_file_content_pid(pids, store): """Test that write_pid_refs_file writes the expected content.""" for pid in pids.keys(): cid = pids[pid]["sha256"] tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) + tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") with open(tmp_pid_refs_file, "r", encoding="utf8") as f: pid_refs_cid = f.read() @@ -374,7 +374,7 @@ def test_delete_pid_refs_file(pids, store): for pid in pids.keys(): cid = pids[pid]["sha256"] tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) + tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") store._delete_pid_refs_file(tmp_pid_refs_file) assert not os.path.exists(tmp_pid_refs_file) @@ -402,7 +402,7 @@ def test_verify_hashstore_references_pid_refs_incorrect_cid(pids, store): cid = pids[pid]["sha256"] # Write the cid refs file and move it where it needs to be tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") cid_ref_abs_path = 
store._resolve_path("cid", cid) store._create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) @@ -410,7 +410,7 @@ def test_verify_hashstore_references_pid_refs_incorrect_cid(pids, store): pid_ref_abs_path = store._resolve_path("pid", pid) store._create_path(os.path.dirname(pid_ref_abs_path)) tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, "bad_cid") + tmp_pid_refs_file = store._write_refs_file(tmp_root_path, "bad_cid", "pid") shutil.move(tmp_pid_refs_file, pid_ref_abs_path) with pytest.raises(ValueError): @@ -424,7 +424,7 @@ def test_verify_hashstore_references_cid_refs_file_missing(pids, store): pid_ref_abs_path = store._resolve_path("pid", pid) store._create_path(os.path.dirname(pid_ref_abs_path)) tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, "bad_cid") + tmp_pid_refs_file = store._write_refs_file(tmp_root_path, "bad_cid", "pid") shutil.move(tmp_pid_refs_file, pid_ref_abs_path) with pytest.raises(FileNotFoundError): @@ -438,7 +438,7 @@ def test_verify_hashstore_references_cid_refs_file_missing_pid(pids, store): cid = pids[pid]["sha256"] # Get a tmp cid refs file and write the wrong pid into it tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, "bad pid") + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, "bad pid", "cid") cid_ref_abs_path = store._resolve_path("cid", cid) store._create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) @@ -446,7 +446,7 @@ def test_verify_hashstore_references_cid_refs_file_missing_pid(pids, store): pid_ref_abs_path = store._resolve_path("pid", pid) store._create_path(os.path.dirname(pid_ref_abs_path)) tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) + 
tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") shutil.move(tmp_pid_refs_file, pid_ref_abs_path) with pytest.raises(ValueError): @@ -462,7 +462,7 @@ def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pi cid = pids[pid]["sha256"] # Write the wrong pid into a cid refs file and move it where it needs to be tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, "bad pid") + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, "bad pid", "cid") cid_ref_abs_path = store._resolve_path("cid", cid) store._create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) @@ -470,7 +470,7 @@ def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pi pid_ref_abs_path = store._resolve_path("pid", pid) store._create_path(os.path.dirname(pid_ref_abs_path)) tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) + tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") shutil.move(tmp_pid_refs_file, pid_ref_abs_path) cid_reference_list = [pid] From 1bb5dc5fc2a77275c2819124e0e01ffbe69660f4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 14:23:49 -0800 Subject: [PATCH 159/420] Remove redundant '_delete_pid_refs_file' method and update pytests --- src/hashstore/filehashstore.py | 28 -------------------------- tests/test_filehashstore_references.py | 19 ----------------- 2 files changed, 47 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index ec0767d2..b5d96bb6 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1466,34 +1466,6 @@ def _delete_cid_refs_pid(cid_ref_abs_path, pid): logging.error(exception_string) raise err - def _delete_pid_refs_file(self, pid_ref_abs_path): - """Delete a PID reference file. 
- - :param str pid_ref_abs_path: Absolute path to the PID reference file. - """ - logging.debug( - "FileHashStore - _delete_pid_refs_file: Deleting reference file: %s", - pid_ref_abs_path, - ) - - try: - if not os.path.exists(pid_ref_abs_path): - err_msg = ( - "FileHashStore - _delete_pid_refs_file: pid reference file not found: %s", - pid_ref_abs_path, - ) - raise FileNotFoundError(err_msg) - else: - self._delete("pid", pid_ref_abs_path) - - except Exception as err: - exception_string = ( - "FileHashStore - _delete_pid_refs_file: failed to delete pid refs file:" - + f" {pid_ref_abs_path}. Unexpected {err=}, {type(err)=}" - ) - logging.error(exception_string) - raise err - def _put_metadata(self, metadata, pid, format_id): """Store contents of metadata to `[self.root]/metadata` using the hash of the given PID and format ID as the permanent address. diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 08d0c79a..ef2cecb4 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -369,25 +369,6 @@ def test_write_refs_file_content_pid(pids, store): assert cid == pid_refs_cid -def test_delete_pid_refs_file(pids, store): - """Test that delete_pid_refs_file deletes a reference file.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") - store._delete_pid_refs_file(tmp_pid_refs_file) - - assert not os.path.exists(tmp_pid_refs_file) - - -def test_delete_pid_refs_file_file_not_found(pids, store): - """Test that delete_pid_refs_file raises an exception when refs file not found.""" - for pid in pids.keys(): - pid_ref_abs_path = store._resolve_path("pid", pid) - with pytest.raises(FileNotFoundError): - store._delete_pid_refs_file(pid_ref_abs_path) - - def test_verify_hashstore_references_pid_refs_file_missing(pids, store): """Test _verify_hashstore_references 
throws exception when pid refs file is missing.""" for pid in pids.keys(): From 0c38cc77084f3eadb0e89e71173ba588522f6a16 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 14:55:18 -0800 Subject: [PATCH 160/420] Merge methods '_delete_cid_refs_pid' and '_update_cid_refs' into one method '_update_refs_file' and update pytests --- src/hashstore/filehashstore.py | 98 ++++++++++---------------- tests/test_filehashstore_interface.py | 2 +- tests/test_filehashstore_references.py | 32 ++++----- 3 files changed, 54 insertions(+), 78 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index b5d96bb6..d02ab961 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -583,7 +583,7 @@ def tag_object(self, pid, cid): shutil.move(pid_tmp_file_path, pid_ref_abs_path) # Update cid ref files as it already exists if not self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path): - self._update_cid_refs(cid_ref_abs_path, pid) + self._update_refs_file(cid_ref_abs_path, pid, "add") self._verify_hashstore_references(pid, cid, "update") logging.info( "FileHashStore - tag_object: Successfully updated cid: %s with pid: %s", @@ -829,7 +829,7 @@ def delete_object(self, ab_id, id_type=None): cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) # Remove if the pid refs is found if self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path): - self._delete_cid_refs_pid(cid_ref_abs_path, pid) + self._update_refs_file(cid_ref_abs_path, pid, "remove") # Remove all files confirmed for deletion for obj in objects_to_delete: os.remove(obj) @@ -869,7 +869,7 @@ def delete_object(self, ab_id, id_type=None): self._rename_path_for_deletion(pid_ref_abs_path) ) # Remove pid from cid reference file - self._delete_cid_refs_pid(cid_ref_abs_path, pid) + self._update_refs_file(cid_ref_abs_path, pid, "remove") # Delete cid reference file and object only if the cid refs file is empty if os.path.getsize(cid_ref_abs_path) == 0: 
objects_to_delete.append( @@ -1393,75 +1393,51 @@ def _is_pid_in_cid_refs_file(self, pid, cid_ref_abs_path): return False @staticmethod - def _update_cid_refs(cid_ref_abs_path, pid): - """Update an existing CID reference file with the given PID. + def _update_refs_file(refs_file_path, ref_id, update_type): + """Add or remove an existing ref from a refs file. - :param str cid_ref_abs_path: Absolute path to the CID reference file. - :param str pid: Authority-based or persistent identifier of the object. + :param str refs_file_path: Absolute path to the refs file. + :param str ref_id: Authority-based or persistent identifier of the object. + :param str update_type: 'add' or 'remove' """ - logging.debug( - "FileHashStore - update_cid_refs: Adding pid (%s) into cid reference file: %s", - pid, - cid_ref_abs_path, + debug_msg = ( + f"FileHashStore - _update_refs_file: Updating ({update_type}) for ref_id: {ref_id}" + + f" at refs file: {refs_file_path}." ) - if not os.path.exists(cid_ref_abs_path): + logging.debug(debug_msg) + if not os.path.exists(refs_file_path): exception_string = ( - f"FileHashStore - update_cid_refs: {cid_ref_abs_path} does not exist." - + f" Cannot write pid: {[pid]}" + f"FileHashStore - _update_refs_file: {refs_file_path} does not exist." + + f" Cannot {update_type} ref_id: {ref_id}" ) logging.error(exception_string) raise FileNotFoundError(exception_string) try: - with open(cid_ref_abs_path, "a", encoding="utf8") as cid_ref_file: - # Lock file for the shortest amount of time possible - file_descriptor = cid_ref_file.fileno() - fcntl.flock(file_descriptor, fcntl.LOCK_EX) - cid_ref_file.write(pid + "\n") - # The context manager will take care of releasing the lock - # But the code to explicitly release the lock if desired is below - # fcntl.flock(f, fcntl.LOCK_UN) - except Exception as err: - exception_string = ( - "FileHashStore - update_cid_refs: failed to update reference for cid:" - + f" {cid_ref_abs_path} for pid: {pid}. 
Unexpected {err=}, {type(err)=}" - ) - logging.error(exception_string) - raise err - - @staticmethod - def _delete_cid_refs_pid(cid_ref_abs_path, pid): - """Delete a PID from a CID reference file. - - :param str cid_ref_abs_path: Absolute path to the CID reference file. - :param str pid: Authority-based or persistent identifier of the object. - """ - logging.debug( - "FileHashStore - _delete_cid_refs_pid: Deleting pid (%s) from cid reference file: %s", - pid, - cid_ref_abs_path, - ) - try: - with open(cid_ref_abs_path, "r+", encoding="utf8") as cid_ref_file: - # Lock file immediately, this process needs to complete - # before any others read/modify the content of cid_ref_file - file_descriptor = cid_ref_file.fileno() - fcntl.flock(file_descriptor, fcntl.LOCK_EX) - new_pid_lines = [ - cid_pid_line - for cid_pid_line in cid_ref_file.readlines() - if cid_pid_line.strip() != pid - ] - cid_ref_file.seek(0) - cid_ref_file.writelines(new_pid_lines) - cid_ref_file.truncate() - # The context manager will take care of releasing the lock - # But the code to explicitly release the lock if desired is below - # fcntl.flock(f, fcntl.LOCK_UN) + if update_type is "add": + with open(refs_file_path, "a", encoding="utf8") as ref_file: + # Lock file for the shortest amount of time possible + file_descriptor = ref_file.fileno() + fcntl.flock(file_descriptor, fcntl.LOCK_EX) + ref_file.write(ref_id + "\n") + if update_type is "remove": + with open(refs_file_path, "r+", encoding="utf8") as ref_file: + # Lock file immediately, this process needs to complete + # before any others read/modify the content of resf file + file_descriptor = ref_file.fileno() + fcntl.flock(file_descriptor, fcntl.LOCK_EX) + new_pid_lines = [ + cid_pid_line + for cid_pid_line in ref_file.readlines() + if cid_pid_line.strip() != ref_id + ] + ref_file.seek(0) + ref_file.writelines(new_pid_lines) + ref_file.truncate() except Exception as err: exception_string = ( - "FileHashStore - _delete_cid_refs_pid: failed to 
remove pid from cid refs file:" - + f" {cid_ref_abs_path} for pid: {pid}. Unexpected {err=}, {type(err)=}" + f"FileHashStore - _update_refs_file: failed to {update_type} for ref_id: {ref_id}" + + f" at refs file: {refs_file_path}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) raise err diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index ef2944bf..c165cec8 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -984,7 +984,7 @@ def test_delete_object_cid_refs_file_with_pid_refs_remaining(pids, store): cid = object_metadata.cid cid_refs_abs_path = store._resolve_path("cid", cid) # pylint: disable=W0212 - store._update_cid_refs(cid_refs_abs_path, "dou.test.1") + store._update_refs_file(cid_refs_abs_path, "dou.test.1", "add") store.delete_object(pid) cid_refs_file_path = store._resolve_path("cid", cid) assert os.path.exists(cid_refs_file_path) diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index ef2cecb4..28c9a08d 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -97,7 +97,7 @@ def test_tag_object_cid_refs_file_exists(pids, store): assert not os.path.exists(second_cid_hash) -def test_tag_object_cid_refs_update_cid_refs_updated(store): +def test_tag_object_cid_refs_update_refs_file_updated(store): """Test tag object updates a cid reference file that already exists.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" @@ -150,7 +150,7 @@ def test_tag_object_cid_refs_update_pid_found_but_file_missing(store): # Manually update the cid refs, pid refs file missing at this point additional_pid = "dou.test.1" cid_ref_abs_path = store._resolve_path("cid", cid) - store._update_cid_refs(cid_ref_abs_path, additional_pid) + store._update_refs_file(cid_ref_abs_path, additional_pid, "add") # Confirm the pid refs file is missing pid_refs_file_path = store._resolve_path("pid", 
additional_pid) @@ -265,13 +265,13 @@ def test_write_refs_file_cid_content(pids, store): assert pid == cid_ref_file_pid.strip() -def test_update_cid_refs_content(pids, store): +def test_update_refs_file_content(pids, store): """Test that update_cid_ref updates the ref file as expected.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") pid_other = "dou.test.1" - store._update_cid_refs(tmp_cid_refs_file, pid_other) + store._update_refs_file(tmp_cid_refs_file, pid_other, "add") with open(tmp_cid_refs_file, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): @@ -279,7 +279,7 @@ def test_update_cid_refs_content(pids, store): assert value == pid or value == pid_other -def test_update_cid_refs_content_multiple(pids, store): +def test_update_refs_file_content_multiple(pids, store): """Test that update_cid_refs adds multiple references successfully.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" @@ -287,7 +287,7 @@ def test_update_cid_refs_content_multiple(pids, store): cid_reference_list = [pid] for i in range(0, 5): - store._update_cid_refs(tmp_cid_refs_file, f"dou.test.{i}") + store._update_refs_file(tmp_cid_refs_file, f"dou.test.{i}", "add") cid_reference_list.append(f"dou.test.{i}") line_count = 0 @@ -300,34 +300,34 @@ def test_update_cid_refs_content_multiple(pids, store): assert line_count == 6 -def test_update_cid_refs_content_pid_exists(pids, store): +def test_update_refs_file_content_pid_exists(pids, store): """Test that update_cid_ref does not throw exception if pid already exists and proceeds to complete the tagging process (verify_object)""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") # Exception should not be thrown - store._update_cid_refs(tmp_cid_refs_file, pid) + store._update_refs_file(tmp_cid_refs_file, 
pid, "add") -def test_update_cid_refs_content_cid_refs_does_not_exist(pids, store): +def test_update_refs_file_content_cid_refs_does_not_exist(pids, store): """Test that update_cid_ref throws exception if cid refs file doesn't exist.""" for pid in pids.keys(): cid = pids[pid]["sha256"] cid_ref_abs_path = store._resolve_path("cid", cid) with pytest.raises(FileNotFoundError): - store._update_cid_refs(cid_ref_abs_path, pid) + store._update_refs_file(cid_ref_abs_path, pid, "add") -def test_delete_cid_refs_pid(pids, store): +def test_update_refs_file_remove(pids, store): """Test that delete_cid_refs_pid deletes the given pid from the ref file.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") pid_other = "dou.test.1" - store._update_cid_refs(tmp_cid_refs_file, pid_other) - store._delete_cid_refs_pid(tmp_cid_refs_file, pid) + store._update_refs_file(tmp_cid_refs_file, pid_other, "add") + store._update_refs_file(tmp_cid_refs_file, pid, "remove") with open(tmp_cid_refs_file, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): @@ -336,13 +336,13 @@ def test_delete_cid_refs_pid(pids, store): assert value == pid_other -def test_delete_cid_refs_pid_file(pids, store): +def test_update_refs_file_remove_file(pids, store): """Test that delete_cid_refs_pid leaves a file empty when removing the last pid.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") # First remove the pid - store._delete_cid_refs_pid(tmp_cid_refs_file, pid) + store._update_refs_file(tmp_cid_refs_file, pid, "remove") assert os.path.exists(tmp_cid_refs_file) assert os.path.getsize(tmp_cid_refs_file) == 0 @@ -456,7 +456,7 @@ def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pi cid_reference_list = [pid] for i in range(0, 5): - store._update_cid_refs(cid_ref_abs_path, 
f"dou.test.{i}") + store._update_refs_file(cid_ref_abs_path, f"dou.test.{i}", "add") cid_reference_list.append(f"dou.test.{i}") with pytest.raises(ValueError): From 5cd09082eab9c9ecd3a07a9f2f0d9117b412cd49 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 14:59:00 -0800 Subject: [PATCH 161/420] Rename '_is_pid_in_cid_refs_file' to '_is_string_in_refs_file' --- src/hashstore/filehashstore.py | 43 +++++++++++++++++----------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index d02ab961..7584e873 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -582,7 +582,7 @@ def tag_object(self, pid, cid): self._create_path(os.path.dirname(pid_ref_abs_path)) shutil.move(pid_tmp_file_path, pid_ref_abs_path) # Update cid ref files as it already exists - if not self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path): + if not self._is_string_in_refs_file(pid, cid_ref_abs_path): self._update_refs_file(cid_ref_abs_path, pid, "add") self._verify_hashstore_references(pid, cid, "update") logging.info( @@ -637,7 +637,7 @@ def find_object(self, pid): cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) if os.path.exists(cid_ref_abs_path): # Check that the pid is actually found in the cid reference file - if self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path): + if self._is_string_in_refs_file(pid, cid_ref_abs_path): # Object must also exist in order to return the cid retrieved if not self._exists("objects", pid_refs_cid): err_msg = ( @@ -828,7 +828,7 @@ def delete_object(self, ab_id, id_type=None): pid_refs_cid = pid_ref_file.read() cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) # Remove if the pid refs is found - if self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path): + if self._is_string_in_refs_file(pid, cid_ref_abs_path): self._update_refs_file(cid_ref_abs_path, pid, "remove") # Remove all files confirmed for deletion for obj in 
objects_to_delete: @@ -1375,23 +1375,6 @@ def _write_refs_file(self, path, ref_id, ref_type): logging.error(exception_string) raise err - def _is_pid_in_cid_refs_file(self, pid, cid_ref_abs_path): - """Check a cid reference file for a pid. - - :param str pid: Authority-based or persistent identifier of the object. - :param str cid_ref_abs_path: Path to the cid refs file - - :return: pid_found - :rtype: boolean - """ - with open(cid_ref_abs_path, "r", encoding="utf8") as cid_ref_file: - # Confirm that pid is not currently already tagged - for line in cid_ref_file: - value = line.strip() - if pid == value: - return True - return False - @staticmethod def _update_refs_file(refs_file_path, ref_id, update_type): """Add or remove an existing ref from a refs file. @@ -1442,6 +1425,24 @@ def _update_refs_file(refs_file_path, ref_id, update_type): logging.error(exception_string) raise err + @staticmethod + def _is_string_in_refs_file(ref_id, refs_file_path): + """Check a reference file for a ref_id (`cid` or `pid`). + + :param str pid: Authority-based, persistent identifier or content identifier + :param str refs_file_path: Path to the refs file + + :return: pid_found + :rtype: boolean + """ + with open(refs_file_path, "r", encoding="utf8") as ref_file: + # Confirm that pid is not currently already tagged + for line in ref_file: + value = line.strip() + if ref_id == value: + return True + return False + def _put_metadata(self, metadata, pid, format_id): """Store contents of metadata to `[self.root]/metadata` using the hash of the given PID and format ID as the permanent address. 
@@ -1643,7 +1644,7 @@ def _verify_hashstore_references(self, pid, cid, verify_type): logging.error(exception_string) raise ValueError(exception_string) # Then the pid - pid_found = self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path) + pid_found = self._is_string_in_refs_file(pid, cid_ref_abs_path) if not pid_found: exception_string = ( "FileHashStore - _verify_hashstore_references: Cid refs file exists" From 10a51445928c596728d811ddd89b579e7bc8570b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 15:11:32 -0800 Subject: [PATCH 162/420] Cleanup 'FileHashStore' module --- src/hashstore/filehashstore.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 7584e873..fb227eb7 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -787,17 +787,19 @@ def delete_object(self, ab_id, id_type=None): # If the refs file still exists, do not delete the object if not os.path.exists(cid_refs_abs_path): self._delete("objects", ab_id) - return else: # id_type is "pid" pid = ab_id + # Create a list of objects to delete to minimize delay objects_to_delete = [] + # Get the metadata documents to delete rel_path = "/".join(self._shard(self._computehash(pid))) metadata_rel_path = self._get_store_path("metadata") / rel_path metadata_file_paths = self._get_file_paths(metadata_rel_path) - # Rename paths by appending _delete to the file name + # Add these files to be permanently deleted if metadata_file_paths is not None: for path in metadata_file_paths: + # Rename files by appending _delete to the file name objects_to_delete.append(self._rename_path_for_deletion(path)) try: @@ -816,15 +818,14 @@ def delete_object(self, ab_id, id_type=None): os.remove(obj) return if "object referenced does not exist" in fnfe: - # Delete pid refs file + # Add pid refs file to be permanently deleted pid_ref_abs_path = self._resolve_path("pid", pid) - # Add pid 
refs file to delete objects_to_delete.append( self._rename_path_for_deletion(pid_ref_abs_path) ) # Remove pid from cid refs file - # Retrieve the cid from the pid refs file with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: + # Retrieve the cid pid_refs_cid = pid_ref_file.read() cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) # Remove if the pid refs is found @@ -833,11 +834,10 @@ def delete_object(self, ab_id, id_type=None): # Remove all files confirmed for deletion for obj in objects_to_delete: os.remove(obj) - self._delete("pid", pid) return except ValueError as ve: if "is missing from cid refs file" in ve: - # Delete pid refs file + # Add pid refs file to be permanently deleted pid_ref_abs_path = self._resolve_path("pid", pid) objects_to_delete.append( self._rename_path_for_deletion(pid_ref_abs_path) @@ -864,7 +864,7 @@ def delete_object(self, ab_id, id_type=None): try: cid_ref_abs_path = self._resolve_path("cid", cid) pid_ref_abs_path = self._resolve_path("pid", pid) - # First delete the pid refs file immediately + # Add pid refs file to be permanently deleted objects_to_delete.append( self._rename_path_for_deletion(pid_ref_abs_path) ) @@ -1343,11 +1343,12 @@ def delete_tmp_file(): def _write_refs_file(self, path, ref_id, ref_type): """Write a reference file in the supplied path into a temporary file. All `pid` or `cid` reference files begin with a single identifier, with the - primary difference being that a cid reference file can contain multiple lines - of `pid`s that reference the `cid`. + difference being that a cid reference file can potentially contain multiple + lines of `pid`s that reference the `cid`. 
:param str path: Directory to write the temporary file :param str ref_id: Authority-based, persistent or content identifier + :param str ref_type: 'cid' or 'pid' :return: tmp_file_path - Path to the tmp refs file :rtype: string @@ -1429,7 +1430,7 @@ def _update_refs_file(refs_file_path, ref_id, update_type): def _is_string_in_refs_file(ref_id, refs_file_path): """Check a reference file for a ref_id (`cid` or `pid`). - :param str pid: Authority-based, persistent identifier or content identifier + :param str ref_id: Authority-based, persistent identifier or content identifier :param str refs_file_path: Path to the refs file :return: pid_found @@ -1909,7 +1910,7 @@ def _delete(self, entity, file): @staticmethod def _rename_path_for_deletion(path): - """Move and rename a given path by appending '_delete' to the file name + """Rename a given path by appending '_delete' and move it to the renamed path. :param Path path: Path to file to rename From 77150db8ac3cd2b42ea680a158269ce6de87d2ca Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 16:23:12 -0800 Subject: [PATCH 163/420] Review & clean up 'test_filehashstore_interface' module and revise exception class thrown in FileHashStore for clarity --- src/hashstore/filehashstore.py | 8 +- tests/test_filehashstore.py | 2 +- tests/test_filehashstore_interface.py | 162 ++++++++++++++------------ 3 files changed, 93 insertions(+), 79 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index fb227eb7..194fdb00 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1204,14 +1204,14 @@ def _move_and_get_checksums( tmp_file_size, file_size_to_validate, ) - except Exception as ge: + except ValueError as ve: # If any exception is thrown during validation, exception_string = ( "FileHashStore - _move_and_get_checksums: Object exists but cannot be verified" - + f" (validation error): {abs_file_path}, deleting temporary file. 
Error: {ge}" + + f" (validation error): {abs_file_path}, deleting temporary file. Error: {ve}" ) logging.error(exception_string) - raise FileExistsError from ge + raise ValueError from ve finally: # Delete the temporary file, it already exists so it is redundant self._delete(entity, tmp_file_name) @@ -1338,8 +1338,6 @@ def delete_tmp_file(): os.umask(oldmask) return tmp - # TODO: Clean up refs file methods, a lot of redundant code - def _write_refs_file(self, path, ref_id, ref_type): """Write a reference file in the supplied path into a temporary file. All `pid` or `cid` reference files begin with a single identifier, with the diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 05afb37d..05888251 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -431,7 +431,7 @@ def test_move_and_get_checksums_duplicates_raises_error(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - with pytest.raises(FileExistsError): + with pytest.raises(ValueError): # pylint: disable=W0212 store._move_and_get_checksums( pid, diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index c165cec8..d0c21713 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -1,4 +1,5 @@ """Test module for FileHashStore HashStore interface methods.""" + import io import os from pathlib import Path @@ -18,23 +19,8 @@ ) -def test_pids_length(pids): - """Ensure test harness pids are present.""" - assert len(pids) == 3 - - -def test_store_address_length(pids, store): - """Test store object object_cid length is 64 characters.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - object_cid = object_metadata.cid - assert len(object_cid) == 64 - - -def test_store_object(pids, store): - """Test store object.""" +def 
test_store_object_refs_files_and_object(pids, store): + """Test store object stores objects and creates reference files.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -60,7 +46,7 @@ def test_store_object_only_object(pids, store): def test_store_object_files_path(pids, store): - """Test store object when given a path.""" + """Test store object when given a path object.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -71,7 +57,7 @@ def test_store_object_files_path(pids, store): def test_store_object_files_string(pids, store): - """Test store object when given a string.""" + """Test store object when given a string object.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -82,7 +68,7 @@ def test_store_object_files_string(pids, store): def test_store_object_files_input_stream(pids, store): - """Test store object given an input stream.""" + """Test store object when given a stream object.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -94,8 +80,8 @@ def test_store_object_files_input_stream(pids, store): assert store._count(entity) == 3 -def test_store_object_id(pids, store): - """Test store object returns expected id.""" +def test_store_object_cid(pids, store): + """Test store object returns expected content identifier.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -103,6 +89,15 @@ def test_store_object_id(pids, store): assert object_metadata.cid == pids[pid][store.algorithm] +def test_store_object_pid(pids, store): + """Test store object returns expected persistent identifier.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + assert object_metadata.pid == pid + + def test_store_object_obj_size(pids, store): """Test store object returns expected file size.""" test_dir = "tests/testdata/" @@ -149,7 
+144,7 @@ def test_store_object_data_incorrect_type_none(store): pid = "jtao.1700.1" path = None with pytest.raises(TypeError): - store.store_object(pid, path) + store.store_object(pid, data=path) def test_store_object_data_incorrect_type_empty(store): @@ -157,7 +152,7 @@ def test_store_object_data_incorrect_type_empty(store): pid = "jtao.1700.1" path = "" with pytest.raises(TypeError): - store.store_object(pid, path) + store.store_object(pid, data=path) def test_store_object_data_incorrect_type_empty_spaces(store): @@ -165,7 +160,7 @@ def test_store_object_data_incorrect_type_empty_spaces(store): pid = "jtao.1700.1" path = " " with pytest.raises(TypeError): - store.store_object(pid, path) + store.store_object(pid, data=path) def test_store_object_additional_algorithm_invalid(store): @@ -179,7 +174,7 @@ def test_store_object_additional_algorithm_invalid(store): def test_store_object_additional_algorithm_hyphen_uppercase(pids, store): - """Test store object formats a given algorithm that's in uppercase.""" + """Test store object formats an additional algorithm in uppercase.""" test_dir = "tests/testdata/" entity = "objects" pid = "jtao.1700.1" @@ -192,7 +187,7 @@ def test_store_object_additional_algorithm_hyphen_uppercase(pids, store): def test_store_object_additional_algorithm_hyphen_lowercase(pids, store): - """Test store object with additional algorithm in lowercase.""" + """Test store object formats an with additional algorithm in lowercase.""" test_dir = "tests/testdata/" entity = "objects" pid = "jtao.1700.1" @@ -208,7 +203,7 @@ def test_store_object_additional_algorithm_hyphen_lowercase(pids, store): def test_store_object_additional_algorithm_underscore(pids, store): - """Test store object with additional algorithm with underscore.""" + """Test store object with formats an additional algorithm with underscore.""" test_dir = "tests/testdata/" entity = "objects" pid = "jtao.1700.1" @@ -224,7 +219,7 @@ def test_store_object_additional_algorithm_underscore(pids, 
store): def test_store_object_checksum_correct(store): - """Test store object successfully stores with good checksum.""" + """Test store object does not throw exception with good checksum.""" test_dir = "tests/testdata/" entity = "objects" pid = "jtao.1700.1" @@ -240,7 +235,7 @@ def test_store_object_checksum_correct(store): def test_store_object_checksum_correct_and_additional_algo(store): - """Test store object successfully stores with good checksum and same additional algorithm.""" + """Test store object with good checksum and an additional algorithm.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -264,7 +259,7 @@ def test_store_object_checksum_correct_and_additional_algo(store): def test_store_object_checksum_correct_and_additional_algo_duplicate(store): - """Test store object successfully stores with good checksum and same additional algorithm.""" + """Test store object does not throw exception with duplicate algorithms.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -296,7 +291,8 @@ def test_store_object_checksum_algorithm_empty(store): def test_store_object_checksum_empty(store): - """Test store object raises error when checksum_algorithm supplied with empty checksum.""" + """Test store object raises error when checksum_algorithm supplied with + an empty checksum.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -308,8 +304,8 @@ def test_store_object_checksum_empty(store): def test_store_object_checksum_empty_spaces(store): - """Test store object raises error when checksum_algorithm supplied and checksum is empty - with spaces.""" + """Test store object raises error when checksum_algorithm supplied and + checksum is empty with spaces.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -321,7 +317,8 @@ def test_store_object_checksum_empty_spaces(store): def test_store_object_checksum_algorithm_empty_spaces(store): - """Test store object raises 
error when checksum supplied with no checksum_algorithm.""" + """Test store object raises error when checksum is supplied and with empty + spaces as the checksum_algorithm.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -335,7 +332,7 @@ def test_store_object_checksum_algorithm_empty_spaces(store): def test_store_object_checksum_incorrect_checksum(store): - """Test store object raises error when supplied with bad checksum.""" + """Test store object raises error when supplied with incorrect checksum.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -364,8 +361,8 @@ def test_store_object_duplicate_does_not_store_duplicate(store): assert store._count(entity) == 1 -def test_store_object_duplicate_references_files(pids, store): - """Test that storing duplicate object but different pid creates the expected +def test_store_object_duplicate_object_references_file_count(store): + """Test that storing a duplicate object but with different pids creates the expected amount of reference files.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" @@ -382,15 +379,9 @@ def test_store_object_duplicate_references_files(pids, store): assert store._count("pid") == 3 # Confirm that there are 1 cid reference files assert store._count("cid") == 1 - # Confirm the content of the cid refence files - cid_ref_abs_path = store._resolve_path("cid", pids[pid][store.algorithm]) - with open(cid_ref_abs_path, "r", encoding="utf8") as f: - for _, line in enumerate(f, start=1): - value = line.strip() - assert value == pid or value == pid_two or value == pid_three -def test_store_object_duplicate_references_content(pids, store): +def test_store_object_duplicate_object_references_file_content(pids, store): """Test that storing duplicate object but different pid creates the expected amount of reference files.""" test_dir = "tests/testdata/" @@ -410,13 +401,10 @@ def test_store_object_duplicate_references_content(pids, store): for _, line in 
enumerate(f, start=1): value = line.strip() assert value == pid or value == pid_two or value == pid_three - print(os.listdir(store.root + "/refs/pid/")) - assert len(os.listdir(store.root + "/refs/pid")) == 3 - assert len(os.listdir(store.root + "/refs/cid")) == 1 def test_store_object_duplicate_raises_error_with_bad_validation_data(pids, store): - """Test store duplicate object throws FileExistsError when object exists + """Test store duplicate object throws ValueError when object exists but the data to validate against is incorrect.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" @@ -425,7 +413,7 @@ def test_store_object_duplicate_raises_error_with_bad_validation_data(pids, stor # Store first blob _object_metadata_one = store.store_object(pid, path) # Store second blob - with pytest.raises(FileExistsError): + with pytest.raises(ValueError): _object_metadata_two = store.store_object( pid, path, checksum="nonmatchingchecksum", checksum_algorithm="sha256" ) @@ -434,7 +422,7 @@ def test_store_object_duplicate_raises_error_with_bad_validation_data(pids, stor def test_store_object_with_obj_file_size(store, pids): - """Test store object with correct file sizes.""" + """Test store object stores object with correct file sizes.""" test_dir = "tests/testdata/" for pid in pids.keys(): obj_file_size = pids[pid]["file_size_bytes"] @@ -447,7 +435,7 @@ def test_store_object_with_obj_file_size(store, pids): def test_store_object_with_obj_file_size_incorrect(store, pids): - """Test store object throws exception with incorrect file size.""" + """Test store object throws exception with incorrect file sizes.""" test_dir = "tests/testdata/" for pid in pids.keys(): obj_file_size = 1234 @@ -457,7 +445,8 @@ def test_store_object_with_obj_file_size_incorrect(store, pids): def test_store_object_with_obj_file_size_non_integer(store, pids): - """Test store object throws exception with a non integer value as the file size.""" + """Test store object throws exception with a non integer 
value (ex. a stirng) + as the file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): obj_file_size = "Bob" @@ -666,7 +655,7 @@ def test_store_metadata(pids, store): assert store._count(entity) == 3 -def test_store_metadata_one_pid_multiple_metadata_documents(store): +def test_store_metadata_one_pid_multiple_docs_correct_location(store): """Test store metadata for a pid with multiple metadata documents.""" test_dir = "tests/testdata/" entity = "metadata" @@ -712,7 +701,7 @@ def test_store_metadata_default_format_id(pids, store): def test_store_metadata_files_string(pids, store): - """Test store metadata with string.""" + """Test store metadata with a string object to the metadata.""" test_dir = "tests/testdata/" entity = "metadata" format_id = "http://ns.dataone.org/service/types/v2.0" @@ -739,7 +728,7 @@ def test_store_metadata_files_input_stream(pids, store): def test_store_metadata_pid_empty(store): - """Test store metadata raises error with empty string.""" + """Test store metadata raises error with an empty string as the pid.""" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" pid = "" @@ -750,7 +739,7 @@ def test_store_metadata_pid_empty(store): def test_store_metadata_pid_empty_spaces(store): - """Test store metadata raises error with empty spaces.""" + """Test store metadata raises error with empty spaces as the pid.""" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" pid = " " @@ -761,7 +750,7 @@ def test_store_metadata_pid_empty_spaces(store): def test_store_metadata_pid_format_id_spaces(store): - """Test store metadata raises error with empty spaces.""" + """Test store metadata raises error with empty spaces as the format_id.""" test_dir = "tests/testdata/" format_id = " " pid = "jtao.1700.1" @@ -772,7 +761,7 @@ def test_store_metadata_pid_format_id_spaces(store): def test_store_metadata_metadata_empty(store): - """Test store metadata raises error with empty metadata 
string.""" + """Test store metadata raises error with empty spaces as the metadata path.""" pid = "jtao.1700.1" format_id = "http://ns.dataone.org/service/types/v2.0" syspath_string = " " @@ -781,7 +770,7 @@ def test_store_metadata_metadata_empty(store): def test_store_metadata_metadata_none(store): - """Test store metadata raises error with empty None metadata.""" + """Test store metadata raises error with empty None metadata path.""" pid = "jtao.1700.1" format_id = "http://ns.dataone.org/service/types/v2.0" syspath_string = None @@ -790,7 +779,7 @@ def test_store_metadata_metadata_none(store): def test_store_metadata_metadata_path(pids, store): - """Test store metadata returns expected metadata_cid.""" + """Test store metadata returns expected path to metadata document.""" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): @@ -831,7 +820,7 @@ def test_store_metadata_thread_lock(store): def test_retrieve_object(pids, store): - """Test retrieve_object returns correct object data.""" + """Test retrieve_object returns a stream to the correct object data.""" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): @@ -862,7 +851,7 @@ def test_retrieve_object_pid_invalid(store): def test_retrieve_metadata(store): - """Test retrieve_metadata returns correct metadata.""" + """Test retrieve_metadata returns a stream to the correct metadata.""" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" pid = "jtao.1700.1" @@ -879,7 +868,7 @@ def test_retrieve_metadata(store): def test_retrieve_metadata_default_format_id(store): - """Test retrieve_metadata retrieves expected metadata with default format_id.""" + """Test retrieve_metadata retrieves expected metadata without a format_id.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -912,7 +901,7 @@ def test_retrieve_metadata_bytes_pid_empty(store): def 
test_retrieve_metadata_format_id_empty(store): - """Test retrieve_metadata raises error when supplied with empty format_id.""" + """Test retrieve_metadata raises error when supplied with an empty format_id.""" format_id = "" pid = "jtao.1700.1" with pytest.raises(ValueError): @@ -920,17 +909,45 @@ def test_retrieve_metadata_format_id_empty(store): def test_retrieve_metadata_format_id_empty_spaces(store): - """Test retrieve_metadata raises error when supplied with empty spaces format_id.""" + """Test retrieve_metadata raises error when supplied with empty spaces asthe format_id.""" format_id = " " pid = "jtao.1700.1" with pytest.raises(ValueError): store.retrieve_metadata(pid, format_id) -def test_delete_object(pids, store): - """Test delete_object successfully deletes objects from /objects and all refs files.""" +def test_delete_object_object_deleted(pids, store): + """Test delete_object successfully deletes object.""" + test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + _object_metadata = store.store_object(pid, path) + _metadata_cid = store.store_metadata(pid, syspath, format_id) + store.delete_object(pid) + assert store._count("objects") == 0 + + +def test_delete_object_metadata_deleted(pids, store): + """Test delete_object successfully deletes relevant metadata + files and refs files.""" + test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + _object_metadata = store.store_object(pid, path) + _metadata_cid = store.store_metadata(pid, syspath, format_id) + store.delete_object(pid) + assert store._count("metadata") == 0 + + +def test_delete_object_refs_files_deleted(pids, store): + """Test 
delete_object successfully deletes refs files.""" test_dir = "tests/testdata/" - entity = "objects" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -939,12 +956,11 @@ def test_delete_object(pids, store): _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) - assert store._count(entity) == 0 assert store._count("pid") == 0 assert store._count("cid") == 0 -def test_delete_object_pid_refs_file(pids, store): +def test_delete_object_pid_refs_file_deleted(pids, store): """Test delete_object deletes the associated pid refs file for the object.""" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" @@ -959,7 +975,7 @@ def test_delete_object_pid_refs_file(pids, store): assert not os.path.exists(pid_refs_file_path) -def test_delete_object_cid_refs_file(pids, store): +def test_delete_object_cid_refs_file_deleted(pids, store): """Test delete_object deletes the associated cid refs file for the object.""" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" @@ -976,7 +992,7 @@ def test_delete_object_cid_refs_file(pids, store): def test_delete_object_cid_refs_file_with_pid_refs_remaining(pids, store): - """Test delete_object does not delete the cid refs file that still contains ref.""" + """Test delete_object does not delete the cid refs file that still contains refs.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") From 83f8b60972a40a4eab1e58f8cd59b51747951eb0 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 17:06:54 -0800 Subject: [PATCH 164/420] Review & clean up 'test_filehashstore_references' module and add guard rail to '_update_refs_file' to ensure duplicate pids are not written when 'update_type' is 'add' --- src/hashstore/filehashstore.py | 17 ++++---- tests/test_filehashstore_references.py | 57 
+++++++++++++++----------- 2 files changed, 41 insertions(+), 33 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 194fdb00..79576ac1 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1,4 +1,5 @@ """Core module for FileHashStore""" + import atexit import io import shutil @@ -1374,8 +1375,7 @@ def _write_refs_file(self, path, ref_id, ref_type): logging.error(exception_string) raise err - @staticmethod - def _update_refs_file(refs_file_path, ref_id, update_type): + def _update_refs_file(self, refs_file_path, ref_id, update_type): """Add or remove an existing ref from a refs file. :param str refs_file_path: Absolute path to the refs file. @@ -1394,14 +1394,15 @@ def _update_refs_file(refs_file_path, ref_id, update_type): ) logging.error(exception_string) raise FileNotFoundError(exception_string) - try: if update_type is "add": - with open(refs_file_path, "a", encoding="utf8") as ref_file: - # Lock file for the shortest amount of time possible - file_descriptor = ref_file.fileno() - fcntl.flock(file_descriptor, fcntl.LOCK_EX) - ref_file.write(ref_id + "\n") + pid_found = self._is_string_in_refs_file(ref_id, refs_file_path) + if not pid_found: + with open(refs_file_path, "a", encoding="utf8") as ref_file: + # Lock file for the shortest amount of time possible + file_descriptor = ref_file.fileno() + fcntl.flock(file_descriptor, fcntl.LOCK_EX) + ref_file.write(ref_id + "\n") if update_type is "remove": with open(refs_file_path, "r+", encoding="utf8") as ref_file: # Lock file immediately, this process needs to complete diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 28c9a08d..be8bfa30 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -1,4 +1,5 @@ """Test module for FileHashStore's reference system to tag stored objects.""" + import os import shutil import pytest @@ -7,7 +8,7 @@ def 
test_tag_object(pids, store): - """Test tag object returns boolean.""" + """Test tag object returns true boolean when successful.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -17,7 +18,7 @@ def test_tag_object(pids, store): def test_tag_object_pid_refs_file(pids, store): - """Test tag object creates the pid reference file.""" + """Test tag object creates the expected pid reference file.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -44,7 +45,7 @@ def test_tag_object_pid_refs_file_exists(pids, store): def test_tag_object_pid_refs_file_content(pids, store): - """Test tag object creates the pid reference file contains the correct cid.""" + """Test tag object created the pid reference file with the expected cid.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -69,7 +70,7 @@ def test_tag_object_cid_refs_file(pids, store): def test_tag_object_cid_refs_file_content(pids, store): - """Test tag object tags cid reference file successfully with pid.""" + """Test tag object creates the cid reference file successfully with pid.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -82,8 +83,8 @@ def test_tag_object_cid_refs_file_content(pids, store): def test_tag_object_cid_refs_file_exists(pids, store): - """Test tag object raises exception when trying to add another cid to an - existing pid reference file and that a cid reference file is not created.""" + """Test tag object raises exception when trying to tag a pid that already + has a pid refs file, and that a cid reference file is not created.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -164,7 +165,7 @@ def test_tag_object_cid_refs_update_pid_found_but_file_missing(store): def test_verify_object(pids, store): - """Test verify object succeeds given good arguments.""" + 
"""Test verify_object succeeds given good arguments.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -178,7 +179,7 @@ def test_verify_object(pids, store): def test_verify_object_exception_incorrect_object_metadata_type(pids, store): - """Test verify object raises exception when incorrect object is given to + """Test verify_object returns false when incorrect object is given to object_metadata arg.""" test_dir = "tests/testdata/" for pid in pids.keys(): @@ -194,7 +195,7 @@ def test_verify_object_exception_incorrect_object_metadata_type(pids, store): def test_verify_object_exception_incorrect_size(pids, store): - """Test verify object raises exception when incorrect size is supplied.""" + """Test verify_object returns false when incorrect size is supplied.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -214,7 +215,7 @@ def test_verify_object_exception_incorrect_size(pids, store): def test_verify_object_exception_incorrect_checksum(pids, store): - """Test verify object raises exception when incorrect checksum is supplied.""" + """Test verify_object returns false when incorrect checksum is supplied.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -236,7 +237,7 @@ def test_verify_object_exception_incorrect_checksum(pids, store): def test_verify_object_exception_incorrect_checksum_algo(pids, store): - """Test verify object raises exception when incorrect algorithm is supplied.""" + """Test verify_object returns false when incorrect algorithm is supplied.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -247,15 +248,15 @@ def test_verify_object_exception_incorrect_checksum_algo(pids, store): store.verify_object(object_metadata, checksum, "md2", expected_file_size) -def test_write_refs_file_cid(store): - """Test that write_cid_reference writes a reference file.""" +def 
test_write_refs_file_ref_type_cid(store): + """Test that write_refs_file writes a reference file.""" tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_refs_file(tmp_root_path, "test_pid", "cid") assert os.path.exists(tmp_cid_refs_file) -def test_write_refs_file_cid_content(pids, store): - """Test that write_cid_ref_file writes the expected content.""" +def test_write_refs_file_ref_type_cid_content(pids, store): + """Test that write_refs_file writes the expected content.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") @@ -266,7 +267,7 @@ def test_write_refs_file_cid_content(pids, store): def test_update_refs_file_content(pids, store): - """Test that update_cid_ref updates the ref file as expected.""" + """Test that update_refs_file updates the ref file as expected.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") @@ -280,7 +281,7 @@ def test_update_refs_file_content(pids, store): def test_update_refs_file_content_multiple(pids, store): - """Test that update_cid_refs adds multiple references successfully.""" + """Test that _update_refs_file adds multiple references successfully.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") @@ -301,17 +302,23 @@ def test_update_refs_file_content_multiple(pids, store): def test_update_refs_file_content_pid_exists(pids, store): - """Test that update_cid_ref does not throw exception if pid already exists - and proceeds to complete the tagging process (verify_object)""" + """Test that _update_refs_file does add a pid to a refs file that already + contains the pid.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = 
store._write_refs_file(tmp_root_path, pid, "cid") # Exception should not be thrown store._update_refs_file(tmp_cid_refs_file, pid, "add") + line_count = 0 + with open(tmp_cid_refs_file, "r", encoding="utf8") as ref_file: + for _line in ref_file: + line_count += 1 + assert line_count == 1 + def test_update_refs_file_content_cid_refs_does_not_exist(pids, store): - """Test that update_cid_ref throws exception if cid refs file doesn't exist.""" + """Test that _update_refs_file throws exception if refs file doesn't exist.""" for pid in pids.keys(): cid = pids[pid]["sha256"] cid_ref_abs_path = store._resolve_path("cid", cid) @@ -320,7 +327,7 @@ def test_update_refs_file_content_cid_refs_does_not_exist(pids, store): def test_update_refs_file_remove(pids, store): - """Test that delete_cid_refs_pid deletes the given pid from the ref file.""" + """Test that _update_refs_file deletes the given pid from the ref file.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") @@ -336,8 +343,8 @@ def test_update_refs_file_remove(pids, store): assert value == pid_other -def test_update_refs_file_remove_file(pids, store): - """Test that delete_cid_refs_pid leaves a file empty when removing the last pid.""" +def test_update_refs_file_empty_file(pids, store): + """Test that _update_refs_file leaves a file empty when removing the last pid.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") @@ -348,7 +355,7 @@ def test_update_refs_file_remove_file(pids, store): assert os.path.getsize(tmp_cid_refs_file) == 0 -def test_write_refs_file_pid(pids, store): +def test_write_refs_file_ref_type_pid(pids, store): """Test that write_pid_refs_file writes a reference file.""" for pid in pids.keys(): cid = pids[pid]["sha256"] @@ -357,7 +364,7 @@ def test_write_refs_file_pid(pids, store): assert 
os.path.exists(tmp_pid_refs_file) -def test_write_refs_file_content_pid(pids, store): +def test_write_refs_file_ref_type_content_pid(pids, store): """Test that write_pid_refs_file writes the expected content.""" for pid in pids.keys(): cid = pids[pid]["sha256"] From 0baaafb7b35096aaa43209967c12fbef2bd1493e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 7 Feb 2024 09:19:19 -0800 Subject: [PATCH 165/420] Fix incorrect usage of python 'is' and replace with '==' --- src/hashstore/filehashstore.py | 10 +++++----- tests/test_filehashstore.py | 7 ++----- tests/test_hashstore_client.py | 2 ++ 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 79576ac1..94d835c3 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -783,7 +783,7 @@ def delete_object(self, ab_id, id_type=None): "FileHashStore - delete_object: Request to delete object for id: %s", ab_id ) self._check_string(ab_id, "ab_id", "delete_object") - if id_type is "cid": + if id_type == "cid": cid_refs_abs_path = self._resolve_path("cid", ab_id) # If the refs file still exists, do not delete the object if not os.path.exists(cid_refs_abs_path): @@ -1361,9 +1361,9 @@ def _write_refs_file(self, path, ref_id, ref_type): with self._mktmpfile(path) as tmp_file: tmp_file_path = tmp_file.name with open(tmp_file_path, "w", encoding="utf8") as tmp_cid_ref_file: - if ref_type is "cid": + if ref_type == "cid": tmp_cid_ref_file.write(ref_id + "\n") - if ref_type is "pid": + if ref_type == "pid": tmp_cid_ref_file.write(ref_id) return tmp_file_path @@ -1395,7 +1395,7 @@ def _update_refs_file(self, refs_file_path, ref_id, update_type): logging.error(exception_string) raise FileNotFoundError(exception_string) try: - if update_type is "add": + if update_type == "add": pid_found = self._is_string_in_refs_file(ref_id, refs_file_path) if not pid_found: with open(refs_file_path, "a", encoding="utf8") as ref_file: @@ -1403,7 
+1403,7 @@ def _update_refs_file(self, refs_file_path, ref_id, update_type): file_descriptor = ref_file.fileno() fcntl.flock(file_descriptor, fcntl.LOCK_EX) ref_file.write(ref_id + "\n") - if update_type is "remove": + if update_type == "remove": with open(refs_file_path, "r+", encoding="utf8") as ref_file: # Lock file immediately, this process needs to complete # before any others read/modify the content of resf file diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 05888251..1f6a2b01 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1,4 +1,5 @@ """Test module for FileHashStore init, core, utility and supporting methods.""" + import io import os from pathlib import Path @@ -6,11 +7,7 @@ from hashstore.filehashstore import FileHashStore # pylint: disable=W0212 - - -def test_pids_length(pids): - """Ensure test harness pids are present.""" - assert len(pids) == 3 +# TODO: To Review def test_init_directories_created(store): diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index be4e5f7a..1b270c7e 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -1,10 +1,12 @@ """Test module for the Python client (Public API calls only).""" + import sys import os from pathlib import Path from hashstore import hashstoreclient # pylint: disable=W0212 +# TODO: To Review def test_create_hashstore(tmp_path): From e4ca96741edb03bd436c884e197d5b339e4c2547 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 7 Feb 2024 14:07:12 -0800 Subject: [PATCH 166/420] Refactor 'tag_object' to handle orphaned pid refs file, add new method '_tag_pid_cid_and_verify_refs_files' and update/add new pytests --- src/hashstore/filehashstore.py | 110 +++++++++++++--- tests/test_filehashstore.py | 2 +- tests/test_filehashstore_interface.py | 9 +- tests/test_filehashstore_references.py | 166 +++++++++++++++++++------ tests/test_hashstore_client.py | 1 - 5 files changed, 217 insertions(+), 71 
deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 94d835c3..1168464f 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -570,13 +570,68 @@ def tag_object(self, pid, cid): # Proceed to tagging process if os.path.exists(pid_ref_abs_path): - # A pid reference file can only contain one cid - exception_string = ( - "FileHashStore - write_pid_refs_file: pid ref file already exists for" - + pid_ref_abs_path + # A pid reference file can only contain and reference one cid + # First, confirm that the expected cid refs file exists by getting the cid + with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: + pid_refs_cid = pid_ref_file.read() + + if pid_refs_cid != cid: + # If it's not equal to the given cid, determine if it's an orphan file + expected_cid_refs_path = self._resolve_path("cid", pid_refs_cid) + if os.path.exists( + expected_cid_refs_path + ) and self._is_string_in_refs_file(pid, expected_cid_refs_path): + # Throw exception, this pid is accounted for + exception_string = ( + "FileHashStore - tag_object: Pid refs file exists with valid pid" + + f" and cid reference files for pid: {pid} with cid: {cid}." 
+ ) + logging.error(exception_string) + raise FileExistsError(exception_string) + # Now check the expected cid refs file + cid_ref_exists = os.path.exists(cid_ref_abs_path) + if cid_ref_exists and self._is_string_in_refs_file( + pid, cid_ref_abs_path + ): + self._verify_hashstore_references(pid, cid, "update") + else: + # Pid is not found in the cid reference file + if cid_ref_exists: + self._update_refs_file(cid_ref_abs_path, pid, "add") + else: + # Overwrite existing pid refs file, it is an orphaned file + print("****OVERWRITING EXISTING FILES****") + self._tag_pid_cid_and_verify_refs_files( + pid, + cid, + pid_ref_abs_path, + cid_ref_abs_path, + tmp_root_path, + ) + logging.info( + "FileHashStore - tag_object: Successfully tagged cid: %s with pid %s", + cid, + pid, + ) + return True + + # Check to see if the given cid's respective refs file exists + if os.path.exists(cid_ref_abs_path): + if not self._is_string_in_refs_file(pid, cid_ref_abs_path): + self._update_refs_file(cid_ref_abs_path, pid, "add") + else: + # Create cid refs file + cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") + self._create_path(os.path.dirname(cid_ref_abs_path)) + shutil.move(cid_tmp_file_path, cid_ref_abs_path) + # Ensure everything is where it needs to be + self._verify_hashstore_references(pid, cid, "update") + logging.info( + "FileHashStore - tag_object: Successfully updated cid: %s with pid: %s", + cid, + pid, ) - logging.error(exception_string) - raise FileExistsError(exception_string) + return True elif os.path.exists(cid_ref_abs_path): # Create the pid refs file pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") @@ -593,20 +648,9 @@ def tag_object(self, pid, cid): ) return True else: - # All ref files begin as tmp files and get moved sequentially at once - # Get tmp files with the expected cid and pid refs content - pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") - cid_tmp_file_path = self._write_refs_file(tmp_root_path, 
pid, "cid") - # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' - self._create_path(os.path.dirname(pid_ref_abs_path)) - self._create_path(os.path.dirname(cid_ref_abs_path)) - # Move both files - shutil.move(pid_tmp_file_path, pid_ref_abs_path) - shutil.move(cid_tmp_file_path, cid_ref_abs_path) - # Ensure that the reference files have been written as expected - # If there is an issue, client or user will have to manually review - self._verify_hashstore_references(pid, cid, "create") - + self._tag_pid_cid_and_verify_refs_files( + pid, cid, pid_ref_abs_path, cid_ref_abs_path, tmp_root_path + ) logging.info( "FileHashStore - tag_object: Successfully tagged cid: %s with pid %s", cid, @@ -1339,6 +1383,32 @@ def delete_tmp_file(): os.umask(oldmask) return tmp + def _tag_pid_cid_and_verify_refs_files( + self, pid, cid, pid_ref_abs_path, cid_ref_abs_path, tmp_root_path + ): + """Create temporary pid and cid reference files, move them into their expected + locations and verify the content. 
+ + :param str pid: Authority-based or persistent identifier + :param str cid: Content identifier + :param str pid_ref_abs_path: Permanent address to pid refs file + :param str pid_ref_abs_path: Permanent address to pid refs file + :param str tmp_root_path: Path to folder to create temporary ref files + """ + # All ref files begin as tmp files and get moved sequentially at once + # Get tmp files with the expected cid and pid refs content + pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") + cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") + # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' + self._create_path(os.path.dirname(pid_ref_abs_path)) + self._create_path(os.path.dirname(cid_ref_abs_path)) + # Move both files + shutil.move(pid_tmp_file_path, pid_ref_abs_path) + shutil.move(cid_tmp_file_path, cid_ref_abs_path) + # Ensure that the reference files have been written as expected + # If there is an issue, client or user will have to manually review + self._verify_hashstore_references(pid, cid, "create") + def _write_refs_file(self, path, ref_id, ref_type): """Write a reference file in the supplied path into a temporary file. 
All `pid` or `cid` reference files begin with a single identifier, with the diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 1f6a2b01..32b4d029 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -890,7 +890,7 @@ def test_open_objects(pids, store): io_buffer.close() -def test_delete_by_object_metadata_id(pids, store): +def test_delete_with_object_metadata_id(pids, store): """Check objects are deleted after calling delete with object id.""" test_dir = "tests/testdata/" entity = "objects" diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index d0c21713..7dd20324 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -472,14 +472,8 @@ def test_store_object_duplicates_threads(pids, store): path = test_dir + pid entity = "objects" - file_exists_error_flag = False - def store_object_wrapper(obj_pid, obj_path): - nonlocal file_exists_error_flag - try: - store.store_object(obj_pid, obj_path) # Call store_object inside the thread - except FileExistsError: - file_exists_error_flag = True + store.store_object(obj_pid, obj_path) # Call store_object inside the thread thread1 = Thread(target=store_object_wrapper, args=(pid, path)) thread2 = Thread(target=store_object_wrapper, args=(pid, path)) @@ -493,7 +487,6 @@ def store_object_wrapper(obj_pid, obj_path): # One thread will succeed, file count must still be 1 assert store._count(entity) == 1 assert store._exists(entity, pids[pid][store.algorithm]) - assert file_exists_error_flag @slow_test diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index be8bfa30..a39a6ee1 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -7,8 +7,20 @@ # pylint: disable=W0212 +def test_tag_pid_cid_and_verify_refs_files(store): + """Check that refs files are moved to where they are expected to be.""" + pid = "dou.test.pid" + cid 
= "dou.test.cid" + pid_refs_file_path = store._resolve_path("pid", pid) + cid_refs_file_path = store._resolve_path("cid", cid) + tmp_root_path = store._get_store_path("refs") / "tmp" + store._tag_pid_cid_and_verify_refs_files( + pid, cid, pid_refs_file_path, cid_refs_file_path, tmp_root_path + ) + + def test_tag_object(pids, store): - """Test tag object returns true boolean when successful.""" + """Test tag_object returns true boolean when successful.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -17,8 +29,8 @@ def test_tag_object(pids, store): assert object_tagged -def test_tag_object_pid_refs_file(pids, store): - """Test tag object creates the expected pid reference file.""" +def test_tag_object_pid_refs_file_exists(pids, store): + """Test tag_object creates the expected pid reference file.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -28,8 +40,21 @@ def test_tag_object_pid_refs_file(pids, store): assert os.path.exists(pid_refs_file_path) -def test_tag_object_pid_refs_file_exists(pids, store): - """Test tag object throws exception when pid refs file already exists.""" +def test_tag_object_cid_refs_file_exists(pids, store): + """Test tag_object creates the cid reference file.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(None, path) + cid = object_metadata.cid + store.tag_object(pid, object_metadata.cid) + cid_refs_file_path = store._resolve_path("cid", cid) + assert os.path.exists(cid_refs_file_path) + + +def test_tag_object_refs_file_exists(pids, store): + """Test tag_object does not throws exception when pid refs file already exists + and verifies the content.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -40,37 +65,42 @@ def test_tag_object_pid_refs_file_exists(pids, store): assert 
os.path.exists(pid_refs_file_path) cid_refs_file_path = store._resolve_path("cid", cid) assert os.path.exists(cid_refs_file_path) - with pytest.raises(FileExistsError): - store.tag_object(pid, cid) + store.tag_object(pid, cid) -def test_tag_object_pid_refs_file_content(pids, store): - """Test tag object created the pid reference file with the expected cid.""" +def test_tag_object_refs_file_exists_cid_is_not_double_tagged(pids, store): + """Test tag_object succeeds when trying to tag a pid that already has a pid + refs file, and that a cid reference file that already contains cid.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) store.tag_object(pid, object_metadata.cid) - pid_refs_file_path = store._resolve_path("pid", pid) - with open(pid_refs_file_path, "r", encoding="utf8") as f: - pid_refs_cid = f.read() - assert pid_refs_cid == object_metadata.cid + store.tag_object(pid, object_metadata.cid) + + cid_refs_file_path = store._resolve_path("cid", object_metadata.cid) + line_count = 0 + with open(cid_refs_file_path, "r", encoding="utf8") as ref_file: + for _line in ref_file: + line_count += 1 + assert line_count == 1 -def test_tag_object_cid_refs_file(pids, store): - """Test tag object creates the cid reference file.""" +def test_tag_object_pid_refs_file_content(pids, store): + """Test tag_object created the pid reference file with the expected cid.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) - cid = object_metadata.cid store.tag_object(pid, object_metadata.cid) - cid_refs_file_path = store._resolve_path("cid", cid) - assert os.path.exists(cid_refs_file_path) + pid_refs_file_path = store._resolve_path("pid", pid) + with open(pid_refs_file_path, "r", encoding="utf8") as f: + pid_refs_cid = f.read() + assert pid_refs_cid == object_metadata.cid def 
test_tag_object_cid_refs_file_content(pids, store): - """Test tag object creates the cid reference file successfully with pid.""" + """Test tag_object creates the cid reference file successfully with pid.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -82,24 +112,8 @@ def test_tag_object_cid_refs_file_content(pids, store): assert pid_refs_cid == pid -def test_tag_object_cid_refs_file_exists(pids, store): - """Test tag object raises exception when trying to tag a pid that already - has a pid refs file, and that a cid reference file is not created.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(None, path) - store.tag_object(pid, object_metadata.cid) - another_cid = "dou.test.1" - with pytest.raises(FileExistsError): - store.tag_object(pid, another_cid) - - second_cid_hash = store._resolve_path("cid", another_cid) - assert not os.path.exists(second_cid_hash) - - def test_tag_object_cid_refs_update_refs_file_updated(store): - """Test tag object updates a cid reference file that already exists.""" + """Test tag_object updates a cid reference file that already exists.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid.replace("/", "_") @@ -113,15 +127,18 @@ def test_tag_object_cid_refs_update_refs_file_updated(store): store.tag_object(additional_pid, cid) # Read cid file to confirm cid refs file contains the additional pid + line_count = 0 cid_ref_abs_path = store._resolve_path("cid", cid) with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): value = line.strip() + line_count += 1 assert value == pid or value == additional_pid + assert line_count == 2 def test_tag_object_cid_refs_update_pid_refs_created(store): - """Test tag object creates a pid reference file when called to tag an object + """Test tag_object creates a pid reference file when called to tag an 
object that already exists.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" @@ -140,15 +157,15 @@ def test_tag_object_cid_refs_update_pid_refs_created(store): def test_tag_object_cid_refs_update_pid_found_but_file_missing(store): - """Test that tag_object creates a missing pid refs file that somehow disappeared + """Test tag_object creates a missing pid refs file that somehow disappeared when called to tag a cid that already contains the pid.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(None, path) - store.tag_object(pid, object_metadata.cid) + object_metadata = store.store_object(pid, path) cid = object_metadata.cid - # Manually update the cid refs, pid refs file missing at this point + # Manually update the cid refs + # This means that the pid refs file missing at this point additional_pid = "dou.test.1" cid_ref_abs_path = store._resolve_path("cid", cid) store._update_refs_file(cid_ref_abs_path, additional_pid, "add") @@ -164,6 +181,73 @@ def test_tag_object_cid_refs_update_pid_found_but_file_missing(store): assert os.path.exists(pid_refs_file_path) +def test_tag_object_pid_refs_found_but_cid_arg_is_different(store): + """Test that tag_object throws an exception when pid refs file exists, contains a + different cid, and is correctly referenced in the associated cid refs file""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + pid_ref_abs_path = store._resolve_path("pid", pid) + tmp_root_path = store._get_store_path("refs") / "tmp" + tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") + shutil.move(tmp_pid_refs_file, pid_ref_abs_path) + + # Attempt to tag the existing pid with valid refs file + with pytest.raises(FileExistsError): + store.tag_object(pid, "bad_cid_value") + + +def 
test_tag_object_pid_refs_found_but_missing_pid_in_cid_refs_file(store): + """Test tag_object completes as expected when pid refs file exists but is missing + (not tagged) from expected cid refs file.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + # Remove the pid from the cid refs file + cid_ref_abs_path = store._resolve_path("cid", cid) + store._update_refs_file(cid_ref_abs_path, pid, "remove") + assert not store._is_string_in_refs_file(pid, cid_ref_abs_path) + + # Tag object, this should add the missing pid to the cid refs file + store.tag_object(pid, cid) + assert store._is_string_in_refs_file(pid, cid_ref_abs_path) + + # Confirm that there is only 1 of each expected file + assert store._count("objects") == 1 + assert store._count("pid") == 1 + assert store._count("cid") == 1 + + +def test_tag_object_pid_refs_found_but_cid_refs_file_not_found(store): + """Test tag_object completes when a pid refs file exists but the expected + cid refs file somehow disappeared.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + # Delete the cid refs file + cid_ref_abs_path = store._resolve_path("cid", cid) + os.remove(cid_ref_abs_path) + assert not os.path.exists(cid_ref_abs_path) + + # Tag object, this should create the missing pid refs file + store.tag_object(pid, cid) + assert os.path.exists(cid_ref_abs_path) + + # Confirm that there is only 1 of each expected file + assert store._count("objects") == 1 + assert store._count("pid") == 1 + assert store._count("cid") == 1 + + def test_verify_object(pids, store): """Test verify_object succeeds given good arguments.""" test_dir = "tests/testdata/" diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 1b270c7e..af9997ea 100644 --- 
a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -6,7 +6,6 @@ from hashstore import hashstoreclient # pylint: disable=W0212 -# TODO: To Review def test_create_hashstore(tmp_path): From 67e8c08643dede2208b244f7c1c4cb8a682334c5 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 7 Feb 2024 16:22:50 -0800 Subject: [PATCH 167/420] Fix bug in 'delete_object' where exception cannot be parsed (cast str() on fnfe), revise '_verify_hashstor_references' to improve clarity and update affected code --- src/hashstore/filehashstore.py | 51 +++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 16 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 1168464f..1cb4ac6d 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -593,14 +593,20 @@ def tag_object(self, pid, cid): if cid_ref_exists and self._is_string_in_refs_file( pid, cid_ref_abs_path ): - self._verify_hashstore_references(pid, cid, "update") + self._verify_hashstore_references( + pid, cid, "Pid and cid refs found, verify refs files." 
+ ) else: # Pid is not found in the cid reference file if cid_ref_exists: self._update_refs_file(cid_ref_abs_path, pid, "add") + self._verify_hashstore_references( + pid, + cid, + "Orphan pid refs file found, updating cid refs file.", + ) else: # Overwrite existing pid refs file, it is an orphaned file - print("****OVERWRITING EXISTING FILES****") self._tag_pid_cid_and_verify_refs_files( pid, cid, @@ -619,13 +625,20 @@ def tag_object(self, pid, cid): if os.path.exists(cid_ref_abs_path): if not self._is_string_in_refs_file(pid, cid_ref_abs_path): self._update_refs_file(cid_ref_abs_path, pid, "add") + self._verify_hashstore_references( + pid, + cid, + "Pid and cid refs file exists, update cid refs file.", + ) else: # Create cid refs file cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") self._create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(cid_tmp_file_path, cid_ref_abs_path) - # Ensure everything is where it needs to be - self._verify_hashstore_references(pid, cid, "update") + # Ensure everything is where it needs to be + self._verify_hashstore_references( + pid, cid, "Pid refs file exists, create cid refs file." 
+ ) logging.info( "FileHashStore - tag_object: Successfully updated cid: %s with pid: %s", cid, @@ -640,7 +653,11 @@ def tag_object(self, pid, cid): # Update cid ref files as it already exists if not self._is_string_in_refs_file(pid, cid_ref_abs_path): self._update_refs_file(cid_ref_abs_path, pid, "add") - self._verify_hashstore_references(pid, cid, "update") + self._verify_hashstore_references( + pid, + cid, + "Pid refs file doesn't exist, but cid refs exists.", + ) logging.info( "FileHashStore - tag_object: Successfully updated cid: %s with pid: %s", cid, @@ -850,10 +867,11 @@ def delete_object(self, ab_id, id_type=None): try: cid = self.find_object(pid) except FileNotFoundError as fnfe: - if "pid refs file not found" in fnfe: + fnfe_string = str(fnfe) + if "pid refs file not found" in fnfe_string: # Nothing to delete return - if "cid refs file not found" in fnfe: + if "cid refs file not found" in fnfe_string: # Delete pid refs file objects_to_delete.append( self._rename_path_for_deletion(self._resolve_path("pid", pid)) @@ -862,7 +880,7 @@ def delete_object(self, ab_id, id_type=None): for obj in objects_to_delete: os.remove(obj) return - if "object referenced does not exist" in fnfe: + if "object referenced does not exist" in fnfe_string: # Add pid refs file to be permanently deleted pid_ref_abs_path = self._resolve_path("pid", pid) objects_to_delete.append( @@ -881,7 +899,8 @@ def delete_object(self, ab_id, id_type=None): os.remove(obj) return except ValueError as ve: - if "is missing from cid refs file" in ve: + ve_string = str(ve) + if "is missing from cid refs file" in ve_string: # Add pid refs file to be permanently deleted pid_ref_abs_path = self._resolve_path("pid", pid) objects_to_delete.append( @@ -1407,7 +1426,7 @@ def _tag_pid_cid_and_verify_refs_files( shutil.move(cid_tmp_file_path, cid_ref_abs_path) # Ensure that the reference files have been written as expected # If there is an issue, client or user will have to manually review - 
self._verify_hashstore_references(pid, cid, "create") + self._verify_hashstore_references(pid, cid, "Created all refs files") def _write_refs_file(self, path, ref_id, ref_type): """Write a reference file in the supplied path into a temporary file. @@ -1674,13 +1693,13 @@ def _verify_object_information( logging.error(exception_string) raise ValueError(exception_string) - def _verify_hashstore_references(self, pid, cid, verify_type): + def _verify_hashstore_references(self, pid, cid, additional_log_string): """Verifies that the supplied pid and pid reference file and content have been written successfully. :param str pid: Authority-based or persistent identifier. :param str cid: Content identifier. - :param str verify_type: "update" or "create" + :param str additional_log_string: String to append to exception statement """ # Check that reference files were created pid_ref_abs_path = self._resolve_path("pid", pid) @@ -1689,7 +1708,7 @@ def _verify_hashstore_references(self, pid, cid, verify_type): exception_string = ( "FileHashStore - _verify_hashstore_references: Pid refs file missing: " + pid_ref_abs_path - + f" . Verify type {verify_type}" + + f" . Additional Context: {additional_log_string}" ) logging.error(exception_string) raise FileNotFoundError(exception_string) @@ -1697,7 +1716,7 @@ def _verify_hashstore_references(self, pid, cid, verify_type): exception_string = ( "FileHashStore - _verify_hashstore_references: Cid refs file missing: " + cid_ref_abs_path - + f" . Verify type {verify_type}" + + f" . Additional Context: {additional_log_string}" ) logging.error(exception_string) raise FileNotFoundError(exception_string) @@ -1709,7 +1728,7 @@ def _verify_hashstore_references(self, pid, cid, verify_type): exception_string = ( "FileHashStore - _verify_hashstore_references: Pid refs file exists" + f" ({pid_ref_abs_path}) but cid ({cid}) does not match." 
- + f"Verify type {verify_type}" + + f" Additional Context: {additional_log_string}" ) logging.error(exception_string) raise ValueError(exception_string) @@ -1719,7 +1738,7 @@ def _verify_hashstore_references(self, pid, cid, verify_type): exception_string = ( "FileHashStore - _verify_hashstore_references: Cid refs file exists" + f" ({cid_ref_abs_path}) but pid ({pid}) not found." - + f" Verify type {verify_type}" + + f" Additional Context: {additional_log_string}" ) logging.error(exception_string) raise ValueError(exception_string) From b0afa840ccd32673ddb07e48c69ca47bf0a32cc0 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 8 Feb 2024 15:47:15 -0800 Subject: [PATCH 168/420] Refactor 'tag_object' to improve clarity, remove redundant method and update pytests --- src/hashstore/filehashstore.py | 156 +++++++------------- tests/test_filehashstore_references.py | 195 ++++++------------------- 2 files changed, 98 insertions(+), 253 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 1cb4ac6d..960b4583 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -564,23 +564,43 @@ def tag_object(self, pid, cid): ) self.reference_locked_cids.append(cid) try: + tmp_root_path = self._get_store_path("refs") / "tmp" pid_ref_abs_path = self._resolve_path("pid", pid) cid_ref_abs_path = self._resolve_path("cid", cid) - tmp_root_path = self._get_store_path("refs") / "tmp" + pid_ref_abs_path_exists = os.path.exists(pid_ref_abs_path) + cid_ref_abs_path_exists = os.path.exists(cid_ref_abs_path) - # Proceed to tagging process - if os.path.exists(pid_ref_abs_path): + if pid_ref_abs_path_exists and cid_ref_abs_path_exists: + self._verify_hashstore_references( + pid, + cid, + "Refs file already exists, verifying.", + ) + return True + elif pid_ref_abs_path_exists and not cid_ref_abs_path_exists: # A pid reference file can only contain and reference one cid # First, confirm that the expected cid refs file exists by 
getting the cid with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: pid_refs_cid = pid_ref_file.read() - if pid_refs_cid != cid: - # If it's not equal to the given cid, determine if it's an orphan file - expected_cid_refs_path = self._resolve_path("cid", pid_refs_cid) - if os.path.exists( - expected_cid_refs_path - ) and self._is_string_in_refs_file(pid, expected_cid_refs_path): + if pid_refs_cid == cid: + # The pid correctly references the given cid, but the cid refs file is missing + cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") + self._create_path(os.path.dirname(cid_ref_abs_path)) + shutil.move(cid_tmp_file_path, cid_ref_abs_path) + self._verify_hashstore_references( + pid, cid, "Created missing cid refs file" + ) + return True + else: + # Check if the retrieved cid refs file exists and pid is referenced + retrieved_cid_refs_path = self._resolve_path("cid", pid_refs_cid) + retrieved_cid_refs_path_exists = os.path.exists( + retrieved_cid_refs_path + ) + if retrieved_cid_refs_path_exists and self._is_string_in_refs_file( + pid, retrieved_cid_refs_path + ): # Throw exception, this pid is accounted for exception_string = ( "FileHashStore - tag_object: Pid refs file exists with valid pid" @@ -588,64 +608,10 @@ def tag_object(self, pid, cid): ) logging.error(exception_string) raise FileExistsError(exception_string) - # Now check the expected cid refs file - cid_ref_exists = os.path.exists(cid_ref_abs_path) - if cid_ref_exists and self._is_string_in_refs_file( - pid, cid_ref_abs_path - ): - self._verify_hashstore_references( - pid, cid, "Pid and cid refs found, verify refs files." 
- ) - else: - # Pid is not found in the cid reference file - if cid_ref_exists: - self._update_refs_file(cid_ref_abs_path, pid, "add") - self._verify_hashstore_references( - pid, - cid, - "Orphan pid refs file found, updating cid refs file.", - ) - else: - # Overwrite existing pid refs file, it is an orphaned file - self._tag_pid_cid_and_verify_refs_files( - pid, - cid, - pid_ref_abs_path, - cid_ref_abs_path, - tmp_root_path, - ) - logging.info( - "FileHashStore - tag_object: Successfully tagged cid: %s with pid %s", - cid, - pid, - ) - return True - - # Check to see if the given cid's respective refs file exists - if os.path.exists(cid_ref_abs_path): - if not self._is_string_in_refs_file(pid, cid_ref_abs_path): - self._update_refs_file(cid_ref_abs_path, pid, "add") - self._verify_hashstore_references( - pid, - cid, - "Pid and cid refs file exists, update cid refs file.", - ) - else: - # Create cid refs file - cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") - self._create_path(os.path.dirname(cid_ref_abs_path)) - shutil.move(cid_tmp_file_path, cid_ref_abs_path) - # Ensure everything is where it needs to be - self._verify_hashstore_references( - pid, cid, "Pid refs file exists, create cid refs file." - ) - logging.info( - "FileHashStore - tag_object: Successfully updated cid: %s with pid: %s", - cid, - pid, - ) - return True - elif os.path.exists(cid_ref_abs_path): + # Orphaned pid refs file found, the retrieved cid refs file exists + # but doesn't contain the cid. Proceed to overwrite the pid refs file. + # There is no return statement, so we move out of this if block. 
+ elif not pid_ref_abs_path_exists and cid_ref_abs_path_exists: # Create the pid refs file pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") self._create_path(os.path.dirname(pid_ref_abs_path)) @@ -664,16 +630,26 @@ def tag_object(self, pid, cid): pid, ) return True - else: - self._tag_pid_cid_and_verify_refs_files( - pid, cid, pid_ref_abs_path, cid_ref_abs_path, tmp_root_path - ) - logging.info( - "FileHashStore - tag_object: Successfully tagged cid: %s with pid %s", - cid, - pid, - ) - return True + + # All ref files begin as tmp files and get moved sequentially at once + # Get tmp files with the expected cid and pid refs content + pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") + cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") + # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' + self._create_path(os.path.dirname(pid_ref_abs_path)) + self._create_path(os.path.dirname(cid_ref_abs_path)) + # Move both files + shutil.move(pid_tmp_file_path, pid_ref_abs_path) + shutil.move(cid_tmp_file_path, cid_ref_abs_path) + # Ensure that the reference files have been written as expected + # If there is an issue, client or user will have to manually review + self._verify_hashstore_references(pid, cid, "Created all refs files") + logging.info( + "FileHashStore - tag_object: Successfully tagged cid: %s with pid %s", + cid, + pid, + ) + return True finally: # Release cid with self.reference_lock: @@ -1402,32 +1378,6 @@ def delete_tmp_file(): os.umask(oldmask) return tmp - def _tag_pid_cid_and_verify_refs_files( - self, pid, cid, pid_ref_abs_path, cid_ref_abs_path, tmp_root_path - ): - """Create temporary pid and cid reference files, move them into their expected - locations and verify the content. 
- - :param str pid: Authority-based or persistent identifier - :param str cid: Content identifier - :param str pid_ref_abs_path: Permanent address to pid refs file - :param str pid_ref_abs_path: Permanent address to pid refs file - :param str tmp_root_path: Path to folder to create temporary ref files - """ - # All ref files begin as tmp files and get moved sequentially at once - # Get tmp files with the expected cid and pid refs content - pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") - cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") - # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' - self._create_path(os.path.dirname(pid_ref_abs_path)) - self._create_path(os.path.dirname(cid_ref_abs_path)) - # Move both files - shutil.move(pid_tmp_file_path, pid_ref_abs_path) - shutil.move(cid_tmp_file_path, cid_ref_abs_path) - # Ensure that the reference files have been written as expected - # If there is an issue, client or user will have to manually review - self._verify_hashstore_references(pid, cid, "Created all refs files") - def _write_refs_file(self, path, ref_id, ref_type): """Write a reference file in the supplied path into a temporary file. 
All `pid` or `cid` reference files begin with a single identifier, with the diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index a39a6ee1..87a30f26 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -7,18 +7,6 @@ # pylint: disable=W0212 -def test_tag_pid_cid_and_verify_refs_files(store): - """Check that refs files are moved to where they are expected to be.""" - pid = "dou.test.pid" - cid = "dou.test.cid" - pid_refs_file_path = store._resolve_path("pid", pid) - cid_refs_file_path = store._resolve_path("cid", cid) - tmp_root_path = store._get_store_path("refs") / "tmp" - store._tag_pid_cid_and_verify_refs_files( - pid, cid, pid_refs_file_path, cid_refs_file_path, tmp_root_path - ) - - def test_tag_object(pids, store): """Test tag_object returns true boolean when successful.""" test_dir = "tests/testdata/" @@ -52,40 +40,6 @@ def test_tag_object_cid_refs_file_exists(pids, store): assert os.path.exists(cid_refs_file_path) -def test_tag_object_refs_file_exists(pids, store): - """Test tag_object does not throws exception when pid refs file already exists - and verifies the content.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(None, path) - cid = object_metadata.cid - store.tag_object(pid, cid) - pid_refs_file_path = store._resolve_path("pid", pid) - assert os.path.exists(pid_refs_file_path) - cid_refs_file_path = store._resolve_path("cid", cid) - assert os.path.exists(cid_refs_file_path) - store.tag_object(pid, cid) - - -def test_tag_object_refs_file_exists_cid_is_not_double_tagged(pids, store): - """Test tag_object succeeds when trying to tag a pid that already has a pid - refs file, and that a cid reference file that already contains cid.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = 
store.store_object(None, path) - store.tag_object(pid, object_metadata.cid) - store.tag_object(pid, object_metadata.cid) - - cid_refs_file_path = store._resolve_path("cid", object_metadata.cid) - line_count = 0 - with open(cid_refs_file_path, "r", encoding="utf8") as ref_file: - for _line in ref_file: - line_count += 1 - assert line_count == 1 - - def test_tag_object_pid_refs_file_content(pids, store): """Test tag_object created the pid reference file with the expected cid.""" test_dir = "tests/testdata/" @@ -112,139 +66,80 @@ def test_tag_object_cid_refs_file_content(pids, store): assert pid_refs_cid == pid -def test_tag_object_cid_refs_update_refs_file_updated(store): - """Test tag_object updates a cid reference file that already exists.""" +def test_tag_object_pid_refs_found_cid_refs_found(pids, store): + """Test tag_object does not throws exception when the refs files already exist + and verifies the content, and does not double tag the cid refs file.""" test_dir = "tests/testdata/" - pid = "jtao.1700.1" - path = test_dir + pid.replace("/", "_") - # Store data only - object_metadata = store.store_object(None, path) - cid = object_metadata.cid - # Tag object - store.tag_object(pid, cid) - # Tag the cid with another pid - additional_pid = "dou.test.1" - store.tag_object(additional_pid, cid) - - # Read cid file to confirm cid refs file contains the additional pid - line_count = 0 - cid_ref_abs_path = store._resolve_path("cid", cid) - with open(cid_ref_abs_path, "r", encoding="utf8") as f: - for _, line in enumerate(f, start=1): - value = line.strip() - line_count += 1 - assert value == pid or value == additional_pid - assert line_count == 2 - - -def test_tag_object_cid_refs_update_pid_refs_created(store): - """Test tag_object creates a pid reference file when called to tag an object - that already exists.""" - test_dir = "tests/testdata/" - pid = "jtao.1700.1" - path = test_dir + pid.replace("/", "_") - # Store data only - object_metadata = 
store.store_object(None, path) - cid = object_metadata.cid - # Tag object - store.tag_object(pid, cid) - # Tag the cid with another pid - additional_pid = "dou.test.1" - store.tag_object(additional_pid, cid) + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(None, path) + cid = object_metadata.cid + store.tag_object(pid, cid) + store.tag_object(pid, cid) - pid_refs_file_path = store._resolve_path("pid", additional_pid) - assert os.path.exists(pid_refs_file_path) + cid_refs_file_path = store._resolve_path("cid", object_metadata.cid) + line_count = 0 + with open(cid_refs_file_path, "r", encoding="utf8") as ref_file: + for _line in ref_file: + line_count += 1 + assert line_count == 1 -def test_tag_object_cid_refs_update_pid_found_but_file_missing(store): - """Test tag_object creates a missing pid refs file that somehow disappeared - when called to tag a cid that already contains the pid.""" +def test_tag_object_pid_refs_found_cid_refs_not_found(store): + """Test that tag_object creates a missing cid refs file when called to tag a cid + with a pid whose associated pid refs file contains the given cid.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(pid, path) cid = object_metadata.cid - # Manually update the cid refs - # This means that the pid refs file missing at this point - additional_pid = "dou.test.1" - cid_ref_abs_path = store._resolve_path("cid", cid) - store._update_refs_file(cid_ref_abs_path, additional_pid, "add") - - # Confirm the pid refs file is missing - pid_refs_file_path = store._resolve_path("pid", additional_pid) - assert not os.path.exists(pid_refs_file_path) - # Call tag_object, this should create the missing pid refs file - store.tag_object(additional_pid, cid) + # Manually delete the cid refs file, creating an orphaned pid + cid_ref_abs_path = store._resolve_path("cid", cid) + os.remove(cid_ref_abs_path) + 
assert store._count("cid") == 0 - # Confirm it has been created - assert os.path.exists(pid_refs_file_path) + store.tag_object(pid, cid) + assert store._count("pid") == 1 + assert store._count("cid") == 1 -def test_tag_object_pid_refs_found_but_cid_arg_is_different(store): +def test_tag_object_pid_refs_found_cid_refs_not_found_different_cid_retrieved(store): """Test that tag_object throws an exception when pid refs file exists, contains a different cid, and is correctly referenced in the associated cid refs file""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - cid = object_metadata.cid - - pid_ref_abs_path = store._resolve_path("pid", pid) - tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") - shutil.move(tmp_pid_refs_file, pid_ref_abs_path) + _object_metadata = store.store_object(pid, path) - # Attempt to tag the existing pid with valid refs file with pytest.raises(FileExistsError): - store.tag_object(pid, "bad_cid_value") + store.tag_object(pid, "another_cid_value_that_is_not_found") -def test_tag_object_pid_refs_found_but_missing_pid_in_cid_refs_file(store): - """Test tag_object completes as expected when pid refs file exists but is missing - (not tagged) from expected cid refs file.""" +def test_tag_object_pid_refs_not_found_cid_refs_found(store): + """Test tag_object updates a cid reference file that already exists.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) + # Store data only + object_metadata = store.store_object(None, path) cid = object_metadata.cid - - # Remove the pid from the cid refs file - cid_ref_abs_path = store._resolve_path("cid", cid) - store._update_refs_file(cid_ref_abs_path, pid, "remove") - assert not store._is_string_in_refs_file(pid, cid_ref_abs_path) - - # Tag object, this 
should add the missing pid to the cid refs file + # Tag object store.tag_object(pid, cid) - assert store._is_string_in_refs_file(pid, cid_ref_abs_path) - - # Confirm that there is only 1 of each expected file - assert store._count("objects") == 1 - assert store._count("pid") == 1 - assert store._count("cid") == 1 - - -def test_tag_object_pid_refs_found_but_cid_refs_file_not_found(store): - """Test tag_object completes when a pid refs file exists but the expected - cid refs file somehow disappeared.""" - test_dir = "tests/testdata/" - pid = "jtao.1700.1" - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - cid = object_metadata.cid + # Tag the cid with another pid + additional_pid = "dou.test.1" + store.tag_object(additional_pid, cid) - # Delete the cid refs file + # Read cid file to confirm cid refs file contains the additional pid + line_count = 0 cid_ref_abs_path = store._resolve_path("cid", cid) - os.remove(cid_ref_abs_path) - assert not os.path.exists(cid_ref_abs_path) - - # Tag object, this should create the missing pid refs file - store.tag_object(pid, cid) - assert os.path.exists(cid_ref_abs_path) - - # Confirm that there is only 1 of each expected file - assert store._count("objects") == 1 - assert store._count("pid") == 1 + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + value = line.strip() + line_count += 1 + assert value == pid or value == additional_pid + assert line_count == 2 + assert store._count("pid") == 2 assert store._count("cid") == 1 From 3eda728760b3484bd03db7832d4586697d43a7b7 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 14 May 2024 15:06:53 -0700 Subject: [PATCH 169/420] Update docstrings in hashstore.py --- src/hashstore/hashstore.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index b94762a3..cdb0b2bf 100644 --- a/src/hashstore/hashstore.py 
+++ b/src/hashstore/hashstore.py @@ -1,4 +1,5 @@ """Hashstore Interface""" + from abc import ABC, abstractmethod from collections import namedtuple import importlib.metadata @@ -25,12 +26,10 @@ def store_object( checksum_algorithm, expected_object_size, ): - """Atomic storage of objects to disk using a given stream. The `store_object` method - ensures atomic storage of objects to disk. Upon successful storage, it returns an - `ObjectMetadata` object containing relevant file information, such as the file's id - (used to locate the object on disk), the file's size, and a hex digest dictionary of - algorithms and checksums. The method also tags the object, creating references for - discoverability. + """Atomic storage of objects to disk using a given stream. Upon successful storage, + it returns an `ObjectMetadata` object containing relevant file information, such as + the file's id, the file's size, and a hex digest dictionary of algorithms and checksums. + The method also tags the object, creating references for discoverability. `store_object` ensures that an object is stored only once by synchronizing multiple calls and rejecting attempts to store duplicate objects. If called without a pid, it stores the @@ -67,9 +66,9 @@ def store_object( @abstractmethod def tag_object(self, pid, cid): - """Creates references that allow objects stored in HashStore to be discoverable. Retrieving, - deleting or calculating a hex digest of an object is based on a pid argument; and to - proceed, we must be able to find the object associated with the pid. + """Creates references that allow objects stored in HashStore to be discoverable. + Retrieving, deleting or calculating a hex digest of an object is based on a pid + argument, to proceed, we must be able to find the object associated with the pid. :param str pid: Authority-based or persistent identifier of the object. :param str cid: Content identifier of the object. 
@@ -112,8 +111,8 @@ def store_metadata(self, pid, metadata, format_id): `store_metadata` method uses a persistent identifier `pid` and a metadata `format_id` to determine the permanent address of the metadata object. All metadata documents for a given `pid` will be stored in a directory (under ../metadata) that is determined by - calculating the hash of the given pid, with the document name being the hash of the - metadata format (`format_id`). + calculating the hash of the given pid, with the document name being the hash of the pid + and metadata format (`pid` + `format_id`). Upon successful storage of metadata, the method returns a string representing the file's permanent address. Metadata objects are stored in parallel to objects in the @@ -161,7 +160,7 @@ def delete_object(self, ab_id, id_type): 'cid', only the object will be deleted if it is not referenced by other pids. :param str ab_id: Authority-based identifier. - :param str id_type: "pid" or "Cid + :param str id_type: "pid" or "cid" """ raise NotImplementedError() From a19d617ef074ab567dfbb99b139a3f8dd977860d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 14 May 2024 15:41:59 -0700 Subject: [PATCH 170/420] Revise docstrings in 'test_filehashstore' module, and added new TODO items --- tests/test_filehashstore.py | 43 +++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 32b4d029..cd1e9fac 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -7,7 +7,6 @@ from hashstore.filehashstore import FileHashStore # pylint: disable=W0212 -# TODO: To Review def test_init_directories_created(store): @@ -154,7 +153,7 @@ def test_load_properties_hashstore_yaml_missing(store): def test_validate_properties(store): - """Confirm properties validated when all key/values are supplied.""" + """Confirm no exceptions are thrown when all key/values are supplied.""" properties = { "store_path": 
"/etc/test", "store_depth": 3, @@ -180,7 +179,7 @@ def test_validate_properties_missing_key(store): def test_validate_properties_key_value_is_none(store): - """Confirm exception raised when value from key is 'None'.""" + """Confirm exception raised when a value from a key is 'None'.""" properties = { "store_path": "/etc/test", "store_depth": 3, @@ -218,7 +217,7 @@ def test_set_default_algorithms_missing_yaml(store, pids): def test_store_and_validate_data_files_path(pids, store): - """Test _store_and_validate_data objects with path object for the path arg.""" + """Test _store_and_validate_data with path object for the path arg.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -229,7 +228,7 @@ def test_store_and_validate_data_files_path(pids, store): def test_store_and_validate_data_files_string(pids, store): - """Test _store_and_validate_data objects with string for the path arg.""" + """Test _store_and_validate_data with string for the path arg.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -240,7 +239,7 @@ def test_store_and_validate_data_files_string(pids, store): def test_store_and_validate_data_files_stream(pids, store): - """Test _store_and_validate_data objects with stream for the path arg.""" + """Test _store_and_validate_data with stream for the path arg.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -288,7 +287,8 @@ def test_store_and_validate_data_hex_digests(pids, store): def test_store_and_validate_data_additional_algorithm(pids, store): - """Check _store_and_validate_data returns additional algorithm in hex digests.""" + """Check _store_and_validate_data returns additional algorithm in hex digests + when provided an additional algo value.""" test_dir = "tests/testdata/" for pid in pids.keys(): algo = "sha224" @@ -364,7 +364,7 @@ def test_store_data_only_hex_digests(pids, store): def test_move_and_get_checksums_id(pids, store): - """Test move returns correct 
id.""" + """Test _move_and_get_checksums returns correct id.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -380,7 +380,7 @@ def test_move_and_get_checksums_id(pids, store): def test_move_and_get_checksums_file_size(pids, store): - """Test move returns correct file size.""" + """Test _move_and_get_checksums returns correct file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -396,7 +396,7 @@ def test_move_and_get_checksums_file_size(pids, store): def test_move_and_get_checksums_hex_digests(pids, store): - """Test move returns correct hex digests.""" + """Test _move_and_get_checksums returns correct hex digests.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -415,8 +415,8 @@ def test_move_and_get_checksums_hex_digests(pids, store): assert hex_digests.get("sha512") == pids[pid]["sha512"] -def test_move_and_get_checksums_duplicates_raises_error(pids, store): - """Test move does not store duplicate objects and raises error.""" +def test_move_and_get_checksums_does_not_store_duplicate(pids, store): + """Test _move_and_get_checksums does not store duplicate objects.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -425,6 +425,19 @@ def test_move_and_get_checksums_duplicates_raises_error(pids, store): # pylint: disable=W0212 store._move_and_get_checksums(pid, input_stream) input_stream.close() + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + input_stream = io.open(path, "rb") + # pylint: disable=W0212 + store._move_and_get_checksums(pid, input_stream) + input_stream.close() + assert store._count(entity) == 3 + + +def test_move_and_get_checksums_raises_error_with_nonmatching_checksum(pids, store): + """Test _move_and_get_checksums raises error when incorrect checksum supplied.""" + test_dir = "tests/testdata/" + entity = "objects" for pid in pids.keys(): path = 
test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") @@ -437,7 +450,7 @@ def test_move_and_get_checksums_duplicates_raises_error(pids, store): checksum_algorithm="sha256", ) input_stream.close() - assert store._count(entity) == 3 + assert store._count(entity) == 0 def test_move_and_get_checksums_incorrect_file_size(pids, store): @@ -480,7 +493,7 @@ def test_write_to_tmp_file_and_get_hex_digests_additional_algo(store): def test_write_to_tmp_file_and_get_hex_digests_checksum_algo(store): """Test _write...hex_digests returns correct hex digests when given a checksum_algorithm - is provided.""" + and checksum.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -641,6 +654,7 @@ def test_put_metadata_with_string(pids, store): def test_put_metadata_cid(pids, store): """Test put metadata returns correct id.""" + # TODO: Review after fixing put_metadata's permanent address (pid+format_id) test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): @@ -1031,6 +1045,7 @@ def test_get_real_path_with_object_id_sharded(pids, store): def test_get_real_path_with_metadata_id(store, pids): """Test get_real_path returns absolute path given a metadata id.""" + # TODO: Review after fixing put_metadata's permanent address (pid+format_id) entity = "metadata" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" From 6f860d98da31469f44926412378b3a327f094c0f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 14 May 2024 15:51:38 -0700 Subject: [PATCH 171/420] Refactor 'tag_object' to use '_is_string_in_refs_file' to confirm whether a cid is in a pid refs file for consistency --- src/hashstore/filehashstore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 960b4583..373c752b 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -583,7 +583,7 @@ def 
tag_object(self, pid, cid): with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: pid_refs_cid = pid_ref_file.read() - if pid_refs_cid == cid: + if self._is_string_in_refs_file(cid, pid_ref_abs_path): # The pid correctly references the given cid, but the cid refs file is missing cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") self._create_path(os.path.dirname(cid_ref_abs_path)) From 3537e096c43d4b98f4769553911230e1021d4c06 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 14 May 2024 15:52:26 -0700 Subject: [PATCH 172/420] Add missing docstring item in 'ObjectMetadata' class --- src/hashstore/hashstore.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index cdb0b2bf..4e249018 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -248,6 +248,7 @@ class ObjectMetadata( the size of the object in bytes (`obj_size`), and an optional list of hex digests (`hex_digests`) to assist with validating objects. + :param str pid: An authority-based or persistent identifier :param str cid: A unique identifier for the object (Hash ID, hex digest). :param bytes obj_size: The size of the object in bytes. :param list hex_digests: A list of hex digests to validate objects From 66771d2376c27b1a5f636376752329e99f07a1bb Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 14 May 2024 16:15:42 -0700 Subject: [PATCH 173/420] Update README.md --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3f5681ec..cfb4cf25 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ ## HashStore: hash-based object storage for DataONE data packages -- **Author**: Matthew B. Jones, Dou Mok, Jing Tao, Matthew Brooke +- **Author**: Dou Mok, Matthew Brooke, Jing Tao, Matthew B. 
Jones - **License**: [Apache 2](http://opensource.org/licenses/Apache-2.0) - [Package source code on GitHub](https://github.com/DataONEorg/hashstore) - [**Submit Bugs and feature requests**](https://github.com/DataONEorg/hashstore/issues) @@ -113,8 +113,7 @@ tag_object(pid, cid) **How do I delete an object if I have the pid?** - To delete an object and all its associated reference files, call the Public API method `delete_object` with `id_type` 'pid'. -- To delete only an object, call `delete_object` with `id_type` 'cid' which will remove the object if it it is not referenced by any pids. -- To delete an object and all its related data (reference files and system metadata), call the Public API method `delete_object` with `id_type` 'clear'. +- To delete only an object, call `delete_object` with `id_type` 'cid' which will remove the object if it is not referenced by any pids. - Note, `delete_object` and `tag_object` calls are synchronized on their content identifier values so that the shared reference files are not unintentionally modified concurrently. An object that is in the process of being deleted should not be tagged, and vice versa. These calls have been implemented to occur sequentially to improve clarity in the event of an unexpected conflict or issue. 
From 1ccab7865672b5af726d8d026ab366ac019f0f8d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 20 May 2024 10:58:20 -0700 Subject: [PATCH 174/420] Add custom exception for when pid refs file is not found and update affected methods and tests --- src/hashstore/filehashstore.py | 22 ++++++++++++++++++---- tests/test_filehashstore_interface.py | 8 +++++--- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 373c752b..e8c97530 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -707,7 +707,7 @@ def find_object(self, pid): f"FileHashStore - find_object: pid refs file not found for pid ({pid}): " + pid_ref_abs_path ) - raise FileNotFoundError(err_msg) + raise PidRefsDoesNotExist(err_msg) def store_metadata(self, pid, metadata, format_id=None): logging.debug( @@ -842,11 +842,17 @@ def delete_object(self, ab_id, id_type=None): try: cid = self.find_object(pid) + except PidRefsDoesNotExist: + warn_msg = ( + "FileHashStore - delete_object: pid refs file does not exist for pid: " + + ab_id + + ". Skipping deletion request." 
+ ) + logging.warning(warn_msg) + # Nothing to delete + return except FileNotFoundError as fnfe: fnfe_string = str(fnfe) - if "pid refs file not found" in fnfe_string: - # Nothing to delete - return if "cid refs file not found" in fnfe_string: # Delete pid refs file objects_to_delete.append( @@ -2248,3 +2254,11 @@ def close(self): self._obj.close() else: self._obj.seek(self._pos) + + +class PidRefsDoesNotExist(Exception): + """Custom exception thrown when a pid refs file does not exist.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 7dd20324..7bea57cf 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -9,6 +9,8 @@ import time import pytest +from hashstore.filehashstore import PidRefsDoesNotExist + # pylint: disable=W0212 @@ -612,7 +614,7 @@ def test_find_object_pid_refs_cid_not_found(pids, store): def test_find_object_pid_object_does_not_exist(store): """Test find object throws exception when object doesn't exist.""" - with pytest.raises(FileNotFoundError): + with pytest.raises(PidRefsDoesNotExist): store.find_object("dou.test.1") @@ -839,7 +841,7 @@ def test_retrieve_object_pid_invalid(store): """Test retrieve_object raises error when supplied with bad pid.""" pid = "jtao.1700.1" pid_does_not_exist = pid + "test" - with pytest.raises(FileNotFoundError): + with pytest.raises(PidRefsDoesNotExist): store.retrieve_object(pid_does_not_exist) @@ -1142,7 +1144,7 @@ def test_get_hex_digest_pid_not_found(store): pid = "jtao.1700.1" pid_does_not_exist = pid + "test" algorithm = "sha256" - with pytest.raises(FileNotFoundError): + with pytest.raises(PidRefsDoesNotExist): store.get_hex_digest(pid_does_not_exist, algorithm) From 513f86db88ebb6f98d5fb385188cb1e806e254f4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 20 May 2024 11:01:35 -0700 Subject: [PATCH 175/420] Add custom 
exception for when cid refs file is not found and update affected methods and tests --- src/hashstore/filehashstore.py | 28 +++++++++++++++++---------- tests/test_filehashstore_interface.py | 4 ++-- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index e8c97530..3482c162 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -701,7 +701,7 @@ def find_object(self, pid): + f", but cid refs file not found: {cid_ref_abs_path}" ) logging.error(err_msg) - raise FileNotFoundError(err_msg) + raise CidRefsDoesNotExist(err_msg) else: err_msg = ( f"FileHashStore - find_object: pid refs file not found for pid ({pid}): " @@ -851,17 +851,17 @@ def delete_object(self, ab_id, id_type=None): logging.warning(warn_msg) # Nothing to delete return + except CidRefsDoesNotExist: + # Delete pid refs file + objects_to_delete.append( + self._rename_path_for_deletion(self._resolve_path("pid", pid)) + ) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) + return except FileNotFoundError as fnfe: fnfe_string = str(fnfe) - if "cid refs file not found" in fnfe_string: - # Delete pid refs file - objects_to_delete.append( - self._rename_path_for_deletion(self._resolve_path("pid", pid)) - ) - # Remove all files confirmed for deletion - for obj in objects_to_delete: - os.remove(obj) - return if "object referenced does not exist" in fnfe_string: # Add pid refs file to be permanently deleted pid_ref_abs_path = self._resolve_path("pid", pid) @@ -2262,3 +2262,11 @@ class PidRefsDoesNotExist(Exception): def __init__(self, message, errors=None): super().__init__(message) self.errors = errors + + +class CidRefsDoesNotExist(Exception): + """Custom exception thrown when a cid refs file does not exist.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors diff --git a/tests/test_filehashstore_interface.py 
b/tests/test_filehashstore_interface.py index 7bea57cf..2002da56 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -9,7 +9,7 @@ import time import pytest -from hashstore.filehashstore import PidRefsDoesNotExist +from hashstore.filehashstore import CidRefsDoesNotExist, PidRefsDoesNotExist # pylint: disable=W0212 @@ -608,7 +608,7 @@ def test_find_object_pid_refs_cid_not_found(pids, store): pid_ref_file.write("intentionally.wrong.pid") pid_ref_file.truncate() - with pytest.raises(FileNotFoundError): + with pytest.raises(CidRefsDoesNotExist): store.find_object(pid) From 4a602761b2de0bbffdc3f7a14151bc9c39256ef4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 20 May 2024 11:12:30 -0700 Subject: [PATCH 176/420] Add custom exception for when refs file exists but object is not found, update affected methods and add new pytest --- src/hashstore/filehashstore.py | 49 +++++++++++++++------------ tests/test_filehashstore_interface.py | 25 ++++++++++++-- 2 files changed, 50 insertions(+), 24 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 3482c162..d5ddb44e 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -683,7 +683,7 @@ def find_object(self, pid): + pid_ref_abs_path + f", but object referenced does not exist, cid: {pid_refs_cid}" ) - raise FileNotFoundError(err_msg) + raise RefsFileExistsButCidObjMissing(err_msg) else: return pid_refs_cid else: @@ -860,26 +860,24 @@ def delete_object(self, ab_id, id_type=None): for obj in objects_to_delete: os.remove(obj) return - except FileNotFoundError as fnfe: - fnfe_string = str(fnfe) - if "object referenced does not exist" in fnfe_string: - # Add pid refs file to be permanently deleted - pid_ref_abs_path = self._resolve_path("pid", pid) - objects_to_delete.append( - self._rename_path_for_deletion(pid_ref_abs_path) - ) - # Remove pid from cid refs file - with open(pid_ref_abs_path, "r", encoding="utf8") as 
pid_ref_file: - # Retrieve the cid - pid_refs_cid = pid_ref_file.read() - cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) - # Remove if the pid refs is found - if self._is_string_in_refs_file(pid, cid_ref_abs_path): - self._update_refs_file(cid_ref_abs_path, pid, "remove") - # Remove all files confirmed for deletion - for obj in objects_to_delete: - os.remove(obj) - return + except RefsFileExistsButCidObjMissing: + # Add pid refs file to be permanently deleted + pid_ref_abs_path = self._resolve_path("pid", pid) + objects_to_delete.append( + self._rename_path_for_deletion(pid_ref_abs_path) + ) + # Remove pid from cid refs file + with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: + # Retrieve the cid + pid_refs_cid = pid_ref_file.read() + cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) + # Remove if the pid refs is found + if self._is_string_in_refs_file(pid, cid_ref_abs_path): + self._update_refs_file(cid_ref_abs_path, pid, "remove") + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) + return except ValueError as ve: ve_string = str(ve) if "is missing from cid refs file" in ve_string: @@ -2270,3 +2268,12 @@ class CidRefsDoesNotExist(Exception): def __init__(self, message, errors=None): super().__init__(message) self.errors = errors + + +class RefsFileExistsButCidObjMissing(Exception): + """Custom exception thrown when pid and cid refs file exists, but the + cid object does not.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 2002da56..98425957 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -9,7 +9,11 @@ import time import pytest -from hashstore.filehashstore import CidRefsDoesNotExist, PidRefsDoesNotExist +from hashstore.filehashstore import ( + CidRefsDoesNotExist, + PidRefsDoesNotExist, + 
RefsFileExistsButCidObjMissing, +) # pylint: disable=W0212 @@ -593,7 +597,22 @@ def test_find_object(pids, store): assert cid == object_metadata.hex_digests.get("sha256") -def test_find_object_pid_refs_cid_not_found(pids, store): +def test_find_object_refs_exist_but_obj_not_found(pids, store): + """Test find_object throws exception when refs file exist but the object does not.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + store.store_object(pid, path) + + cid = store.find_object(pid) + obj_path = store._resolve_path("objects", cid) + os.remove(obj_path) + + with pytest.raises(RefsFileExistsButCidObjMissing): + store.find_object(pid) + + +def test_find_object_cid_refs_not_found(pids, store): """Test find_object throws exception when pid refs file is found with a cid but the cid does not exist.""" test_dir = "tests/testdata/" @@ -612,7 +631,7 @@ def test_find_object_pid_refs_cid_not_found(pids, store): store.find_object(pid) -def test_find_object_pid_object_does_not_exist(store): +def test_find_object_pid_refs_not_found(store): """Test find object throws exception when object doesn't exist.""" with pytest.raises(PidRefsDoesNotExist): store.find_object("dou.test.1") From 4ebe6faa24a105f4d26ba815668f70a68758c81e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 20 May 2024 11:19:52 -0700 Subject: [PATCH 177/420] Add custom exception for when pid refs file exists but the pid is not found in its expected cid refs file, update affected methods and add new pytest --- src/hashstore/filehashstore.py | 33 ++++++++++++++++----------- tests/test_filehashstore_interface.py | 19 +++++++++++++++ 2 files changed, 39 insertions(+), 13 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index d5ddb44e..b63ca9b6 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -694,7 +694,7 @@ def find_object(self, pid): + f", but is missing from cid refs file: 
{cid_ref_abs_path}" ) logging.error(err_msg) - raise ValueError(err_msg) + raise PidNotFoundInCidRefsFile(err_msg) else: err_msg = ( f"FileHashStore - find_object: pid refs file exists with cid: {pid_refs_cid}" @@ -878,18 +878,16 @@ def delete_object(self, ab_id, id_type=None): for obj in objects_to_delete: os.remove(obj) return - except ValueError as ve: - ve_string = str(ve) - if "is missing from cid refs file" in ve_string: - # Add pid refs file to be permanently deleted - pid_ref_abs_path = self._resolve_path("pid", pid) - objects_to_delete.append( - self._rename_path_for_deletion(pid_ref_abs_path) - ) - # Remove all files confirmed for deletion - for obj in objects_to_delete: - os.remove(obj) - return + except PidNotFoundInCidRefsFile: + # Add pid refs file to be permanently deleted + pid_ref_abs_path = self._resolve_path("pid", pid) + objects_to_delete.append( + self._rename_path_for_deletion(pid_ref_abs_path) + ) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) + return # Proceed with next steps - cid has been retrieved without any issues while cid in self.reference_locked_cids: @@ -2277,3 +2275,12 @@ class RefsFileExistsButCidObjMissing(Exception): def __init__(self, message, errors=None): super().__init__(message) self.errors = errors + + +class PidNotFoundInCidRefsFile(Exception): + """Custom exception thrown when pid reference file exists with a cid, but + the respective cid reference file does not contain the pid.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 98425957..53e40e55 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -11,6 +11,7 @@ from hashstore.filehashstore import ( CidRefsDoesNotExist, + PidNotFoundInCidRefsFile, PidRefsDoesNotExist, RefsFileExistsButCidObjMissing, ) @@ -631,6 +632,24 @@ def 
test_find_object_cid_refs_not_found(pids, store): store.find_object(pid) +def test_find_object_cid_refs_does_not_contain_pid(pids, store): + """Test find_object throws exception when pid refs file is found with a cid + but the cid refs file does not contain the pid.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + + # Remove the pid from the cid refs file + cid_ref_abs_path = store._resolve_path( + "cid", object_metadata.hex_digests.get("sha256") + ) + store._update_refs_file(cid_ref_abs_path, pid, "remove") + + with pytest.raises(PidNotFoundInCidRefsFile): + store.find_object(pid) + + def test_find_object_pid_refs_not_found(store): """Test find object throws exception when object doesn't exist.""" with pytest.raises(PidRefsDoesNotExist): From 854c99fbb99f19bda68216f1199ebfc3a0e55e0f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 20 May 2024 12:47:17 -0700 Subject: [PATCH 178/420] Refactor 'delete_object' to improve clarity and include missing file locks/synchronization --- src/hashstore/filehashstore.py | 287 ++++++++++++++++++++++----------- 1 file changed, 190 insertions(+), 97 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index b63ca9b6..c4ea3385 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -824,123 +824,216 @@ def delete_object(self, ab_id, id_type=None): cid_refs_abs_path = self._resolve_path("cid", ab_id) # If the refs file still exists, do not delete the object if not os.path.exists(cid_refs_abs_path): - self._delete("objects", ab_id) + cid = ab_id + # Synchronize the cid + while cid in self.reference_locked_cids: + logging.debug( + "FileHashStore - delete_object: (cid) %s is currently locked. 
Waiting", + cid, + ) + time.sleep(self.time_out_sec) + # Modify reference_locked_cids consecutively + with self.reference_lock: + logging.debug( + "FileHashStore - delete_object: Add cid: %s to reference_locked_cids.", + cid, + ) + self.reference_locked_cids.append(ab_id) + + try: + self._delete("objects", ab_id) + finally: + # Release cid + with self.reference_lock: + logging.debug( + "FileHashStore - delete_object: Removing cid: %s from" + + "reference_locked_cids.", + cid, + ) + self.reference_locked_cids.remove(cid) else: # id_type is "pid" pid = ab_id - # Create a list of objects to delete to minimize delay objects_to_delete = [] - # Get the metadata documents to delete + # Get the metadata documents to minimize time spent in synchronization rel_path = "/".join(self._shard(self._computehash(pid))) metadata_rel_path = self._get_store_path("metadata") / rel_path metadata_file_paths = self._get_file_paths(metadata_rel_path) - # Add these files to be permanently deleted - if metadata_file_paths is not None: - for path in metadata_file_paths: - # Rename files by appending _delete to the file name - objects_to_delete.append(self._rename_path_for_deletion(path)) - - try: - cid = self.find_object(pid) - except PidRefsDoesNotExist: - warn_msg = ( - "FileHashStore - delete_object: pid refs file does not exist for pid: " - + ab_id - + ". Skipping deletion request." 
- ) - logging.warning(warn_msg) - # Nothing to delete - return - except CidRefsDoesNotExist: - # Delete pid refs file - objects_to_delete.append( - self._rename_path_for_deletion(self._resolve_path("pid", pid)) - ) - # Remove all files confirmed for deletion - for obj in objects_to_delete: - os.remove(obj) - return - except RefsFileExistsButCidObjMissing: - # Add pid refs file to be permanently deleted - pid_ref_abs_path = self._resolve_path("pid", pid) - objects_to_delete.append( - self._rename_path_for_deletion(pid_ref_abs_path) - ) - # Remove pid from cid refs file - with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: - # Retrieve the cid - pid_refs_cid = pid_ref_file.read() - cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) - # Remove if the pid refs is found - if self._is_string_in_refs_file(pid, cid_ref_abs_path): - self._update_refs_file(cid_ref_abs_path, pid, "remove") - # Remove all files confirmed for deletion - for obj in objects_to_delete: - os.remove(obj) - return - except PidNotFoundInCidRefsFile: - # Add pid refs file to be permanently deleted - pid_ref_abs_path = self._resolve_path("pid", pid) - objects_to_delete.append( - self._rename_path_for_deletion(pid_ref_abs_path) - ) - # Remove all files confirmed for deletion - for obj in objects_to_delete: - os.remove(obj) - return - # Proceed with next steps - cid has been retrieved without any issues - while cid in self.reference_locked_cids: + # Storing and deleting objects are synchronized together + # Duplicate store object requests for a pid are rejected, but deleting an object + # will wait for a pid to be released if it's found to be in use before proceeding. + while pid in self.object_locked_pids: logging.debug( - "FileHashStore - delete_object: (cid) %s is currently locked. Waiting", - cid, + "FileHashStore - delete_object: pid (%s) is currently locked. 
Waiting.", + pid, ) time.sleep(self.time_out_sec) - # Modify reference_locked_cids consecutively - with self.reference_lock: + # Modify object_locked_pids consecutively + with self.object_lock: logging.debug( - "FileHashStore - delete_object: Adding cid: %s to reference_locked_cids.", - cid, + "FileHashStore - store_object: Adding pid: %s to object_locked_pids.", + pid, ) - self.reference_locked_cids.append(cid) + self.object_locked_pids.append(pid) + try: - cid_ref_abs_path = self._resolve_path("cid", cid) - pid_ref_abs_path = self._resolve_path("pid", pid) - # Add pid refs file to be permanently deleted - objects_to_delete.append( - self._rename_path_for_deletion(pid_ref_abs_path) - ) - # Remove pid from cid reference file - self._update_refs_file(cid_ref_abs_path, pid, "remove") - # Delete cid reference file and object only if the cid refs file is empty - if os.path.getsize(cid_ref_abs_path) == 0: + # Before we begin deletion process, we look for the `cid` by calling + # `find_object` which will throw custom exceptions if there is an issue with + # the reference files, which help us determine the path to proceed with. + try: + cid = self.find_object(pid) + + # Proceed with next steps - cid has been retrieved without any issues + # We must synchronized here based on the `cid` because multiple threads may + # try to access the `cid_reference_file` + while cid in self.reference_locked_cids: + logging.debug( + "FileHashStore - delete_object: (cid) %s is currently locked. 
Waiting", + cid, + ) + time.sleep(self.time_out_sec) + # Modify reference_locked_cids consecutively + with self.reference_lock: + logging.debug( + "FileHashStore - delete_object: Add cid: %s to reference_locked_cids.", + cid, + ) + self.reference_locked_cids.append(cid) + + try: + cid_ref_abs_path = self._resolve_path("cid", cid) + pid_ref_abs_path = self._resolve_path("pid", pid) + # Add pid refs file to be permanently deleted + objects_to_delete.append( + self._rename_path_for_deletion(pid_ref_abs_path) + ) + # Remove pid from cid reference file + self._update_refs_file(cid_ref_abs_path, pid, "remove") + # Delete cid reference file and object only if the cid refs file is empty + if os.path.getsize(cid_ref_abs_path) == 0: + objects_to_delete.append( + self._rename_path_for_deletion(cid_ref_abs_path) + ) + obj_real_path = self._resolve_path("objects", cid) + objects_to_delete.append( + self._rename_path_for_deletion(obj_real_path) + ) + # Remove metadata files if they exist + if metadata_file_paths is not None: + for path in metadata_file_paths: + # Rename files by appending _delete to the file name + objects_to_delete.append( + self._rename_path_for_deletion(path) + ) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) + + info_string = ( + "FileHashStore - delete_object: Successfully deleted references," + + f" metadata and object associated with pid: {pid}" + ) + logging.info(info_string) + return + + finally: + # Release cid + with self.reference_lock: + debug_msg = ( + "FileHashStore - delete_object:" + + f" Removing cid: {cid} from reference_locked_cids." + ) + logging.debug(debug_msg) + self.reference_locked_cids.remove(cid) + + except PidRefsDoesNotExist: + warn_msg = ( + "FileHashStore - delete_object: pid refs file does not exist for pid: " + + ab_id + + ". Skipping deletion request." 
+ ) + logging.warning(warn_msg) + + # Remove metadata files if they exist + if metadata_file_paths is not None: + for path in metadata_file_paths: + # Rename files by appending _delete to the file name + objects_to_delete.append( + self._rename_path_for_deletion(path) + ) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) + return + + except CidRefsDoesNotExist: + # Delete pid refs file objects_to_delete.append( - self._rename_path_for_deletion(cid_ref_abs_path) + self._rename_path_for_deletion(self._resolve_path("pid", pid)) ) - obj_real_path = self._resolve_path("objects", cid) + # Remove metadata files if they exist + if metadata_file_paths is not None: + for path in metadata_file_paths: + # Rename files by appending _delete to the file name + objects_to_delete.append( + self._rename_path_for_deletion(path) + ) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) + return + + except RefsFileExistsButCidObjMissing: + # Add pid refs file to be permanently deleted + pid_ref_abs_path = self._resolve_path("pid", pid) objects_to_delete.append( - self._rename_path_for_deletion(obj_real_path) + self._rename_path_for_deletion(pid_ref_abs_path) ) - # Remove all files confirmed for deletion - for obj in objects_to_delete: - os.remove(obj) - - info_string = ( - "FileHashStore - delete_object: Successfully deleted references, metadata and" - + f" object associated with pid: {pid}" - ) - logging.info(info_string) - return - + # Remove pid from cid refs file + with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: + # Retrieve the cid + pid_refs_cid = pid_ref_file.read() + cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) + # Remove if the pid refs is found + if self._is_string_in_refs_file(pid, cid_ref_abs_path): + self._update_refs_file(cid_ref_abs_path, pid, "remove") + # Remove metadata files if they exist + if metadata_file_paths is not None: + for path in 
metadata_file_paths: + # Rename files by appending _delete to the file name + objects_to_delete.append( + self._rename_path_for_deletion(path) + ) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) + return + + except PidNotFoundInCidRefsFile: + # Add pid refs file to be permanently deleted + pid_ref_abs_path = self._resolve_path("pid", pid) + objects_to_delete.append( + self._rename_path_for_deletion(pid_ref_abs_path) + ) + if metadata_file_paths is not None: + for path in metadata_file_paths: + # Rename files by appending _delete to the file name + objects_to_delete.append( + self._rename_path_for_deletion(path) + ) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) + return finally: - # Release cid - with self.reference_lock: - debug_msg = ( - "FileHashStore - delete_object:" - + f" Removing cid: {cid} from reference_locked_cids." + # Release pid + with self.object_lock: + logging.debug( + "FileHashStore - delete_object: Removing pid: %s from object_locked_pids.", + pid, ) - logging.debug(debug_msg) - self.reference_locked_cids.remove(cid) + self.object_locked_pids.remove(pid) def delete_metadata(self, pid, format_id=None): logging.debug( From 781e6ce05bc59deec825dd6e7e50370362863165 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 20 May 2024 12:57:16 -0700 Subject: [PATCH 179/420] Remove redundant methods 'has_subdir' and 'remove_empty' and update pytests --- src/hashstore/filehashstore.py | 32 ------------- tests/test_filehashstore.py | 85 ---------------------------------- 2 files changed, 117 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index c4ea3385..1801b2f3 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2056,38 +2056,6 @@ def _rename_path_for_deletion(path): shutil.move(path, delete_path) return delete_path - def _remove_empty(self, subpath): - """Successively remove all empty folders 
starting with `subpath` and - proceeding "up" through directory tree until reaching the `root` - folder. - - :param str subpath: Name of directory. - """ - # Don't attempt to remove any folders if subpath is not a - # subdirectory of the root directory. - if not self._has_subdir(subpath): - return - - while subpath != self.root: - if len(os.listdir(subpath)) > 0 or os.path.islink(subpath): - break - os.rmdir(subpath) - subpath = os.path.dirname(subpath) - - def _has_subdir(self, path): - """Return whether `path` is a subdirectory of the `root` directory. - - :param str path: Name of path. - - :return: `True` if subdirectory. - :rtype: bool - """ - # Append os.sep so that paths like /usr/var2/log doesn't match /usr/var. - root_path = os.path.realpath(self.root) + os.sep - subpath = os.path.realpath(path) - is_subdir = subpath.startswith(root_path) - return is_subdir - def _create_path(self, path): """Physically create the folder path (and all intermediate ones) on disk. diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index cd1e9fac..50ae0ee7 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -916,91 +916,6 @@ def test_delete_with_object_metadata_id(pids, store): assert store._count(entity) == 0 -def test_remove_empty_removes_empty_folders_string(store): - """Test empty folders (via string) are removed.""" - three_dirs = "dir1/dir2/dir3" - two_dirs = "dir1/dir4" - one_dir = "dir5" - os.makedirs(os.path.join(store.root, three_dirs)) - os.makedirs(os.path.join(store.root, two_dirs)) - os.makedirs(os.path.join(store.root, one_dir)) - assert os.path.exists(os.path.join(store.root, three_dirs)) - assert os.path.exists(os.path.join(store.root, two_dirs)) - assert os.path.exists(os.path.join(store.root, one_dir)) - # pylint: disable=W0212 - store._remove_empty(os.path.join(store.root, three_dirs)) - store._remove_empty(os.path.join(store.root, two_dirs)) - store._remove_empty(os.path.join(store.root, one_dir)) - assert not 
os.path.exists(os.path.join(store.root, three_dirs)) - assert not os.path.exists(os.path.join(store.root, two_dirs)) - assert not os.path.exists(os.path.join(store.root, one_dir)) - - -def test_remove_empty_removes_empty_folders_path(store): - """Test empty folders (via Path object) are removed.""" - three_dirs = Path("dir1/dir2/dir3") - two_dirs = Path("dir1/dir4") - one_dir = Path("dir5") - (store.root / three_dirs).mkdir(parents=True) - (store.root / two_dirs).mkdir(parents=True) - (store.root / one_dir).mkdir(parents=True) - assert (store.root / three_dirs).exists() - assert (store.root / two_dirs).exists() - assert (store.root / one_dir).exists() - # pylint: disable=W0212 - store._remove_empty(store.root / three_dirs) - store._remove_empty(store.root / two_dirs) - store._remove_empty(store.root / one_dir) - assert not (store.root / three_dirs).exists() - assert not (store.root / two_dirs).exists() - assert not (store.root / one_dir).exists() - - -def test_remove_empty_does_not_remove_nonempty_folders(pids, store): - """Test non-empty folders are not removed.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store._store_and_validate_data(pid, path) - object_metadata_shard = store._shard(object_metadata.cid) - object_metadata_shard_path = "/".join(object_metadata_shard) - # Get parent directory of the relative path - parent_dir = os.path.dirname(object_metadata_shard_path) - # Attempt to remove the parent directory - # pylint: disable=W0212 - store._remove_empty(parent_dir) - abs_parent_dir = store.objects + "/" + parent_dir - assert os.path.exists(abs_parent_dir) - - -def test_has_subdir_subdirectory_string(store): - """Test that subdirectory is recognized.""" - sub_dir = store.root + "/filehashstore/test" - os.makedirs(sub_dir) - # pylint: disable=W0212 - is_sub_dir = store._has_subdir(sub_dir) - assert is_sub_dir - - -def test_has_subdir_subdirectory_path(store): - """Test that 
subdirectory is recognized.""" - sub_dir = Path(store.root) / "filehashstore" / "test" - sub_dir.mkdir(parents=True) - # pylint: disable=W0212 - is_sub_dir = store._has_subdir(sub_dir) - assert is_sub_dir - - -def test_has_subdir_non_subdirectory(store): - """Test that non-subdirectory is not recognized.""" - parent_dir = os.path.dirname(store.root) - non_sub_dir = parent_dir + "/filehashstore/test" - os.makedirs(non_sub_dir) - # pylint: disable=W0212 - is_sub_dir = store._has_subdir(non_sub_dir) - assert not is_sub_dir - - def test_create_path(pids, store): """Test makepath creates folder successfully.""" for pid in pids: From be1e27b7e5632d7a200aff978c3c63fdc3ea2846 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 21 May 2024 11:58:33 -0700 Subject: [PATCH 180/420] Fix bug with incorrect stored metadata document name and update all pytests --- src/hashstore/filehashstore.py | 8 ++++---- tests/test_filehashstore.py | 2 +- tests/test_filehashstore_interface.py | 10 +++++----- tests/test_hashstore_client.py | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 1801b2f3..baec42cc 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -794,9 +794,9 @@ def retrieve_metadata(self, pid, format_id=None): entity = "metadata" metadata_directory = self._computehash(pid) if format_id is None: - metadata_document_name = self._computehash(self.sysmeta_ns) + metadata_document_name = self._computehash(pid + self.sysmeta_ns) else: - metadata_document_name = self._computehash(checked_format_id) + metadata_document_name = self._computehash(pid + checked_format_id) rel_path = "/".join(self._shard(metadata_directory)) full_path_without_directory = rel_path + "/" + metadata_document_name metadata_exists = self._exists(entity, full_path_without_directory) @@ -1065,7 +1065,7 @@ def delete_metadata(self, pid, format_id=None): return else: # Delete a specific metadata file - 
metadata_document_name = self._computehash(checked_format_id) + metadata_document_name = self._computehash(pid + checked_format_id) full_path_without_directory = rel_path + "/" + metadata_document_name metadata_exists = self._exists(entity, full_path_without_directory) if metadata_exists: @@ -1598,7 +1598,7 @@ def _put_metadata(self, metadata, pid, format_id): # Get target and related paths (permanent location) metadata_directory = self._computehash(pid) - metadata_document_name = self._computehash(format_id) + metadata_document_name = self._computehash(pid + format_id) rel_path = "/".join(self._shard(metadata_directory)) full_path = self._get_store_path("metadata") / rel_path / metadata_document_name diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 50ae0ee7..5c7acdfe 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -664,7 +664,7 @@ def test_put_metadata_cid(pids, store): # Manually calculate expected path metadata_directory = store._computehash(pid) - metadata_document_name = store._computehash(format_id) + metadata_document_name = store._computehash(pid + format_id) rel_path = "/".join(store._shard(metadata_directory)) full_path = ( store._get_store_path("metadata") / rel_path / metadata_document_name diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 53e40e55..af76998a 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -679,7 +679,7 @@ def test_store_metadata(pids, store): metadata_cid = store.store_metadata(pid, syspath, format_id) # Manually calculate expected path metadata_directory = store._computehash(pid) - metadata_document_name = store._computehash(format_id) + metadata_document_name = store._computehash(pid + format_id) rel_path = "/".join(store._shard(metadata_directory)) full_path = ( store._get_store_path("metadata") / rel_path / metadata_document_name @@ -703,9 +703,9 @@ def 
test_store_metadata_one_pid_multiple_docs_correct_location(store): metadata_cid = store.store_metadata(pid, syspath, format_id) metadata_cid3 = store.store_metadata(pid, syspath, format_id3) metadata_cid4 = store.store_metadata(pid, syspath, format_id4) - metadata_document_name = store._computehash(format_id) - metadata_document_name3 = store._computehash(format_id3) - metadata_document_name4 = store._computehash(format_id4) + metadata_document_name = store._computehash(pid + format_id) + metadata_document_name3 = store._computehash(pid + format_id3) + metadata_document_name4 = store._computehash(pid + format_id4) full_path = store._get_store_path("metadata") / rel_path / metadata_document_name full_path3 = store._get_store_path("metadata") / rel_path / metadata_document_name3 full_path4 = store._get_store_path("metadata") / rel_path / metadata_document_name4 @@ -725,7 +725,7 @@ def test_store_metadata_default_format_id(pids, store): metadata_cid = store.store_metadata(pid, syspath) # Manually calculate expected path metadata_directory = store._computehash(pid) - metadata_document_name = store._computehash(format_id) + metadata_document_name = store._computehash(pid + format_id) rel_path = "/".join(store._shard(metadata_directory)) full_path = ( store._get_store_path("metadata") / rel_path / metadata_document_name diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index af9997ea..f6db7ed5 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -169,7 +169,7 @@ def test_store_metadata(capsys, store, pids): hashstoreclient.main() metadata_directory = store._computehash(pid) - metadata_document_name = store._computehash(namespace) + metadata_document_name = store._computehash(pid + namespace) rel_path = "/".join(store._shard(metadata_directory)) full_path = ( store._get_store_path("metadata") / rel_path / metadata_document_name From 57af22fb1882c5bfae7615ea92d651b1e33ea5b0 Mon Sep 17 00:00:00 2001 From: Dou Mok 
Date: Tue, 21 May 2024 12:05:39 -0700 Subject: [PATCH 181/420] Remove resolved todo items from 'test_filehashstore' --- tests/test_filehashstore.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 5c7acdfe..4150d800 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -654,7 +654,6 @@ def test_put_metadata_with_string(pids, store): def test_put_metadata_cid(pids, store): """Test put metadata returns correct id.""" - # TODO: Review after fixing put_metadata's permanent address (pid+format_id) test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): @@ -960,7 +959,6 @@ def test_get_real_path_with_object_id_sharded(pids, store): def test_get_real_path_with_metadata_id(store, pids): """Test get_real_path returns absolute path given a metadata id.""" - # TODO: Review after fixing put_metadata's permanent address (pid+format_id) entity = "metadata" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" From 58d5e0f34637dcfdba2e8d0e8acb33f87a8ee058 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 21 May 2024 12:09:12 -0700 Subject: [PATCH 182/420] Update README hashstore layout example --- README.md | 76 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 64 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index cfb4cf25..ae14f610 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ properties = { # Get HashStore from factory module_name = "hashstore.filehashstore.filehashstore" class_name = "FileHashStore" -my_store = factory.get_hashstore(module_name, class_name, properties) +my_store = hashstore_factory.get_hashstore(module_name, class_name, properties) # Store objects (.../[hashstore_path]/objects/) pid = "j.tao.1700.1" @@ -140,21 +140,21 @@ These reference files are implemented in HashStore underneath the hood with no e **'pid' Reference Files** - Pid 
(persistent identifier) reference files are created when storing an object with an identifier. -- Pid reference files are located in HashStores '/refs/pid' directory +- Pid reference files are located in HashStores '/refs/pids' directory - If an identifier is not available at the time of storing an object, the calling app/client must create this association between a pid and the object it represents by calling `tag_object` separately. - Each pid reference file contains a string that represents the content identifier of the object it references - Like how objects are stored once and only once, there is also only one pid reference file for each object. **'cid' Reference Files** - Cid (content identifier) reference files are created at the same time as pid reference files when storing an object with an identifier. -- Cid reference files are located in HashStore's '/refs/cid' directory +- Cid reference files are located in HashStore's '/refs/cids' directory - A cid reference file is a list of all the pids that reference a cid, delimited by a new line ("\n") character ###### What does HashStore look like? ```shell -# Example layout in HashStore with a single file stored along with its metadata and reference files. +# Example layout in HashStore with three files stored along with its metadata and reference files. 
# This uses a store depth of 3, with a width of 2 and "SHA-256" as its default store algorithm ## Notes: ## - Objects are stored using their content identifier as the file address @@ -162,14 +162,66 @@ These reference files are implemented in HashStore underneath the hood with no e ## - The reference file for each cid contains multiple pids each on its own line .../metacat/hashstore/ -└─ objects - └─ /d5/95/3b/d802fa74edea72eb941...00d154a727ed7c2 -└─ metadata - └─ /15/8d/7e/55c36a810d7c14479c9...b20d7df66768b04 -└─ refs - └─ pid/0d/55/5e/d77052d7e166017f779...7230bcf7abcef65e - └─ cid/d5/95/3b/d802fa74edea72eb941...00d154a727ed7c2 -hashstore.yaml + ├── hashstore.yaml + ├── objects + | ├── 4d + | │ └── 19 + | │ └── 81 + | | └── 71eef969d553d4c9537b1811a7b078f9a3804fc978a761bc014c05972c + | ├── 94 + | │ └── f9 + | │ └── b6 + | | └── c88f1f458e410c30c351c6384ea42ac1b5ee1f8430d3e365e43b78a38a + | └── 44 + | └── 73 + | └── 51 + | └── 6a592209cbcd3a7ba4edeebbdb374ee8e4a49d19896fafb8f278dc25fa + └── metadata + | ├── 0d + | │ └── 55 + | │ └── 55 + | | └── 5ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e + | | └── 323e0799524cec4c7e14d31289cefd884b563b5c052f154a066de5ec1e477da7 + | | └── sha256(pid+formatId_annotations) + | ├── a8 + | │ └── 24 + | │ └── 19 + | | └── 25740d5dcd719596639e780e0a090c9d55a5d0372b0eaf55ed711d4edf + | | └── ddf07952ef28efc099d10d8b682480f7d2da60015f5d8873b6e1ea75b4baf689 + | | └── sha256(pid+formatId_annotations) + | └── 7f + | └── 5c + | └── c1 + | └── 8f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6 + | └── 9a2e08c666b728e6cbd04d247b9e556df3de5b2ca49f7c5a24868eb27cddbff2 + | └── sha256(pid+formatId_annotations) + └── refs + ├── cids + | ├── 4d + | | └── 19 + | | └── 81 + | | └── 71eef969d553d4c9537b1811a7b078f9a3804fc978a761bc014c05972c + | ├── 94 + | │ └── f9 + | │ └── b6 + | | └── c88f1f458e410c30c351c6384ea42ac1b5ee1f8430d3e365e43b78a38a + | └── 44 + | └── 73 + | └── 51 + | └── 
6a592209cbcd3a7ba4edeebbdb374ee8e4a49d19896fafb8f278dc25fa + └── pids + ├── 0d + | └── 55 + | └── 55 + | └── 5ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e + ├── a8 + │ └── 24 + │ └── 19 + | └── 25740d5dcd719596639e780e0a090c9d55a5d0372b0eaf55ed711d4edf + └── 7f + └── 5c + └── c1 + └── 8f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6 ``` ## Development build From 523024e54a1f3c12b8cab742ecd03936481738ff Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 21 May 2024 12:51:21 -0700 Subject: [PATCH 183/420] Add new pytests for '_resolve_path' method --- src/hashstore/filehashstore.py | 2 ++ tests/test_filehashstore.py | 66 ++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index baec42cc..5342f415 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2109,6 +2109,8 @@ def _resolve_path(self, entity, file): if entity == "objects": rel_root = self.objects if entity == "metadata": + # TODO: The resolve_path method is not consistent in its usage regarding metadata + # Review and refactor when time permitting. 
rel_root = self.metadata relpath = os.path.join(rel_root, file) if os.path.isfile(relpath): diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 4150d800..ffd012c1 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1009,3 +1009,69 @@ def test_cast_to_bytes(store): # pylint: disable=W0212 string_bytes = store._cast_to_bytes(string) assert isinstance(string_bytes, bytes) + + +def test_resolve_path_objects(pids, store): + """Confirm resolve path returns correct object path""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = Path(test_dir + pid.replace("/", "_")) + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + obj_resolved_path = store._resolve_path("objects", cid) + calculated_obj_path = store.objects + "/" + "/".join(store._shard(cid)) + + assert calculated_obj_path == obj_resolved_path + + +def test_resolve_path_metadata(pids, store): + """Confirm resolve path returns correct metadata path.""" + test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + _metadata_cid = store.store_metadata(pid, syspath, format_id) + + metadata_directory = store._computehash(pid) + metadata_document_name = store._computehash(pid + format_id) + rel_path = "/".join(store._shard(metadata_directory)) + full_path_without_dir = rel_path + "/" + metadata_document_name + + metadata_resolved_path = store._resolve_path("metadata", full_path_without_dir) + calculated_metadata_path = ( + store.metadata + "/" + rel_path + "/" + metadata_document_name + ) + + assert calculated_metadata_path == metadata_resolved_path + + +def test_resolve_path_refs_pid(pids, store): + """Confirm resolve path returns correct object pid refs path""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = Path(test_dir + pid.replace("/", "_")) + _object_metadata = 
store.store_object(pid, path) + + resolved_pid_ref_abs_path = store._resolve_path("pid", pid) + pid_refs_metadata_hashid = store._computehash(pid) + calculated_pid_ref_path = ( + store.refs + "/pid/" + "/".join(store._shard(pid_refs_metadata_hashid)) + ) + + assert resolved_pid_ref_abs_path == calculated_pid_ref_path + + +def test_resolve_path_refs_cid(pids, store): + """Confirm resolve path returns correct object pid refs path""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = Path(test_dir + pid.replace("/", "_")) + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + resolved_cid_ref_abs_path = store._resolve_path("cid", cid) + calculated_cid_ref_path = store.refs + "/cid/" + "/".join(store._shard(cid)) + + assert resolved_cid_ref_abs_path == calculated_cid_ref_path From 84e00d4e3b3c40e9c3160d32e2507e9fc61623b5 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 21 May 2024 15:34:55 -0700 Subject: [PATCH 184/420] Revise '_resolve_path' method to improve clarity --- src/hashstore/filehashstore.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 5342f415..745fbca9 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2100,24 +2100,27 @@ def _resolve_path(self, entity, file): :return: Path to file :rtype: str """ - # Check for absolute path. - if os.path.isfile(file): - return file - # Check for relative path. - rel_root = "" if entity == "objects": rel_root = self.objects - if entity == "metadata": + relpath = os.path.join(rel_root, file) + if os.path.isfile(relpath): + return relpath + else: + abspath = self._build_path(entity, file) + if os.path.isfile(abspath): + return abspath + elif entity == "metadata": # TODO: The resolve_path method is not consistent in its usage regarding metadata # Review and refactor when time permitting. 
+ if os.path.isfile(file): + return file rel_root = self.metadata - relpath = os.path.join(rel_root, file) - if os.path.isfile(relpath): - return relpath - + relpath = os.path.join(rel_root, file) + if os.path.isfile(relpath): + return relpath # Check for sharded path. - if entity == "cid": + elif entity == "cid": # Note, we skip checking whether the file exists for refs cid_ref_file_abs_path = self._build_path(entity, file) return cid_ref_file_abs_path @@ -2127,9 +2130,11 @@ def _resolve_path(self, entity, file): pid_ref_file_abs_path = self._build_path(entity, hash_id) return pid_ref_file_abs_path else: - abspath = self._build_path(entity, file) - if os.path.isfile(abspath): - return abspath + exception_string = ( + "FileHashStore - _resolve_path: entity must be" + + " 'object', 'metadata', 'cid' or 'pid" + ) + raise ValueError(exception_string) def _get_store_path(self, entity): """Return a path object of the root directory of the store. From 4922c247af37e0f52a26e7dd5333f371c8ef7a9e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 21 May 2024 16:02:02 -0700 Subject: [PATCH 185/420] Fix typo in exception string for 'resolve_path' and rename variables in 'retrieve_metadata' to improve clarity --- src/hashstore/filehashstore.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 745fbca9..f2065f30 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -798,11 +798,15 @@ def retrieve_metadata(self, pid, format_id=None): else: metadata_document_name = self._computehash(pid + checked_format_id) rel_path = "/".join(self._shard(metadata_directory)) - full_path_without_directory = rel_path + "/" + metadata_document_name - metadata_exists = self._exists(entity, full_path_without_directory) + metadata_rel_path = rel_path + "/" + metadata_document_name + metadata_exists = self._exists(entity, metadata_rel_path) if metadata_exists: - 
metadata_stream = self._open(entity, full_path_without_directory) + metadata_stream = self._open(entity, metadata_rel_path) + logging.info( + "FileHashStore - retrieve_metadata: Retrieved metadata for pid: %s", pid + ) + return metadata_stream else: exception_string = ( f"FileHashStore - retrieve_metadata: No metadata found for pid: {pid}" @@ -810,11 +814,6 @@ def retrieve_metadata(self, pid, format_id=None): logging.error(exception_string) raise ValueError(exception_string) - logging.info( - "FileHashStore - retrieve_metadata: Retrieved metadata for pid: %s", pid - ) - return metadata_stream - def delete_object(self, ab_id, id_type=None): logging.debug( "FileHashStore - delete_object: Request to delete object for id: %s", ab_id @@ -2111,8 +2110,6 @@ def _resolve_path(self, entity, file): if os.path.isfile(abspath): return abspath elif entity == "metadata": - # TODO: The resolve_path method is not consistent in its usage regarding metadata - # Review and refactor when time permitting. if os.path.isfile(file): return file rel_root = self.metadata @@ -2132,7 +2129,7 @@ def _resolve_path(self, entity, file): else: exception_string = ( "FileHashStore - _resolve_path: entity must be" - + " 'object', 'metadata', 'cid' or 'pid" + + " 'objects', 'metadata', 'cid' or 'pid" ) raise ValueError(exception_string) From fa29b53c8f72fb36f13717f1740e13c29fa7ed0c Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 21 May 2024 16:54:44 -0700 Subject: [PATCH 186/420] Fix inaccurate exception text in '_verify_object_information' --- src/hashstore/filehashstore.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index f2065f30..fe1ea8e5 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1689,7 +1689,7 @@ def _verify_object_information( if file_size_to_validate is not None and file_size_to_validate > 0: if file_size_to_validate != tmp_file_size: exception_string = ( - 
"FileHashStore - _validate_arg_object: Object file size calculated: " + "FileHashStore - _verify_object_information: Object file size calculated: " + f" {tmp_file_size} does not match with expected size:" + f" {file_size_to_validate}." ) @@ -1707,7 +1707,7 @@ def _verify_object_information( if checksum_algorithm is not None and checksum is not None: if checksum_algorithm not in hex_digests: exception_string = ( - "FileHashStore - _validate_arg_object: checksum_algorithm" + "FileHashStore - _verify_object_information: checksum_algorithm" + f" ({checksum_algorithm}) cannot be found in the hex digests dictionary." ) logging.error(exception_string) @@ -1716,7 +1716,7 @@ def _verify_object_information( hex_digest_stored = hex_digests[checksum_algorithm] if hex_digest_stored != checksum.lower(): exception_string = ( - "FileHashStore - _validate_arg_object: Hex digest and checksum" + "FileHashStore - _verify_object_information: Hex digest and checksum" + f" do not match - file not stored for pid: {pid}. Algorithm:" + f" {checksum_algorithm}. Checksum provided: {checksum} !=" + f" HexDigest: {hex_digest_stored}." 
From b3a612ca88063756e77401e86fdd32e13e9799f0 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 21 May 2024 16:57:40 -0700 Subject: [PATCH 187/420] Rename 'refs/pid' and 'refs/cid' directories to 'refs/pids' and 'refs/cids' --- src/hashstore/filehashstore.py | 4 ++-- tests/test_filehashstore.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index fe1ea8e5..9ec884b5 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -111,8 +111,8 @@ def __init__(self, properties=None): self._create_path(self.metadata + "/tmp") if not os.path.exists(self.refs): self._create_path(self.refs + "/tmp") - self._create_path(self.refs + "/pid") - self._create_path(self.refs + "/cid") + self._create_path(self.refs + "/pids") + self._create_path(self.refs + "/cids") logging.debug( "FileHashStore - Initialization success. Store root: %s", self.root ) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index ffd012c1..1404832a 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -18,8 +18,8 @@ def test_init_directories_created(store): assert os.path.exists(store.metadata + "/tmp") assert os.path.exists(store.refs) assert os.path.exists(store.refs + "/tmp") - assert os.path.exists(store.refs + "/pid") - assert os.path.exists(store.refs + "/cid") + assert os.path.exists(store.refs + "/pids") + assert os.path.exists(store.refs + "/cids") def test_init_existing_store_incorrect_algorithm_format(store): From f238df56b5afb92bd046e92f862dad93b8c6b4e2 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 21 May 2024 17:07:31 -0700 Subject: [PATCH 188/420] Improve logging statements for 'find_object' by including pid value --- src/hashstore/filehashstore.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 9ec884b5..63ad485c 100644 --- 
a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -691,6 +691,8 @@ def find_object(self, pid): err_msg = ( "FileHashStore - find_object: pid refs file exists with cid: " + pid_refs_cid + + " for pid: " + + pid + f", but is missing from cid refs file: {cid_ref_abs_path}" ) logging.error(err_msg) @@ -698,7 +700,7 @@ def find_object(self, pid): else: err_msg = ( f"FileHashStore - find_object: pid refs file exists with cid: {pid_refs_cid}" - + f", but cid refs file not found: {cid_ref_abs_path}" + + f", but cid refs file not found: {cid_ref_abs_path} for pid: {pid}" ) logging.error(err_msg) raise CidRefsDoesNotExist(err_msg) From 8418000596d71c87b2570d73d6bb7b887dcc018b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 22 May 2024 09:42:25 -0700 Subject: [PATCH 189/420] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ae14f610..da7aa829 100644 --- a/README.md +++ b/README.md @@ -114,7 +114,7 @@ tag_object(pid, cid) **How do I delete an object if I have the pid?** - To delete an object and all its associated reference files, call the Public API method `delete_object` with `id_type` 'pid'. - To delete only an object, call `delete_object` with `id_type` 'cid' which will remove the object if it is not referenced by any pids. -- Note, `delete_object` and `tag_object` calls are synchronized on their content identifier values so that the shared reference files are not unintentionally modified concurrently. An object that is in the process of being deleted should not be tagged, and vice versa. These calls have been implemented to occur sequentially to improve clarity in the event of an unexpected conflict or issue. +- Note, `delete_object` and `store_object` are synchronized based on a given 'pid'. An object that is in the process of being stored based on a pid should not be deleted at the same time. 
Additionally, `delete_object` further synchronizes with `tag_object` based on a `cid`. Every object is stored once, is unique and shares one cid reference file. The API calls to access this cid reference file must be coordinated to prevent file system locking exceptions. ###### Working with metadata (store, retrieve, delete) @@ -126,8 +126,8 @@ HashStore's '/metadata' directory holds all metadata for objects stored in HashS - If there are multiple metadata objects, a 'format_id' must be specified when calling `retrieve_metadata` (ex. `retrieve_metadata(pid, format_id)`) **How do I delete a metadata file?** -- Like `retrieve_metadata`, call the Public API method `delete_metadata` which will delete the metadata object associated with the given pid. -- If there are multiple metadata objects, a 'format_id' must be specified when calling `delete_metadata` to ensure the expected metadata object is deleted. +- Like `retrieve_metadata`, call the Public API method `delete_metadata` to delete all metadata documents associated with the given pid. +- If there are multiple metadata objects, and you wish to only delete one type, a 'format_id' must be specified when calling `delete_metadata(pid, format_id)` to ensure the expected metadata object is deleted. ###### What are HashStore reference files? 
From 71c81ecb01d7e8d2ed74a3dc8fafcd732ce0c06f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 22 May 2024 12:31:44 -0700 Subject: [PATCH 190/420] Adjust logging level from 'error' to 'warning' in '_verify_object_information' --- src/hashstore/filehashstore.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 63ad485c..b52257a4 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1701,10 +1701,10 @@ def _verify_object_information( exception_string + f" Tmp file deleted and file not stored for pid: {pid}" ) - logging.error(exception_string_for_pid) + logging.warning(exception_string_for_pid) raise ValueError(exception_string_for_pid) else: - logging.error(exception_string) + logging.warning(exception_string) raise ValueError(exception_string) if checksum_algorithm is not None and checksum is not None: if checksum_algorithm not in hex_digests: @@ -1712,7 +1712,7 @@ def _verify_object_information( "FileHashStore - _verify_object_information: checksum_algorithm" + f" ({checksum_algorithm}) cannot be found in the hex digests dictionary." ) - logging.error(exception_string) + logging.warning(exception_string) raise KeyError(exception_string) else: hex_digest_stored = hex_digests[checksum_algorithm] @@ -1729,14 +1729,14 @@ def _verify_object_information( exception_string_for_pid = ( exception_string + f" Tmp file ({tmp_file_name}) deleted." 
) - logging.error(exception_string_for_pid) + logging.warning(exception_string_for_pid) raise ValueError(exception_string_for_pid) else: # Delete the object cid = hex_digests[self.algorithm] cid_abs_path = self._resolve_path("cid", cid) self._delete(entity, cid_abs_path) - logging.error(exception_string) + logging.warning(exception_string) raise ValueError(exception_string) def _verify_hashstore_references(self, pid, cid, additional_log_string): From d6b40792fc6a6bb8d8865891e4bc59a9cc483d66 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 22 May 2024 13:57:14 -0700 Subject: [PATCH 191/420] Add new custom exception 'PidObjectMetadataError', revise logging levels, update 'store_object' to accurately relay cause of error, and update pytest --- src/hashstore/filehashstore.py | 37 +++++++++++++++++++-------- tests/test_filehashstore_interface.py | 3 ++- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index b52257a4..89974d47 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -477,10 +477,18 @@ def store_object( "FileHashStore - store_object: Successfully stored object for pid: %s", pid, ) + except ObjectMetadataError as ome: + # Note, using '.__cause__' allows the original exception msg to be displayed + exception_string = ( + f"FileHashStore - store_object: failed to store object for pid: {pid}." + + f" Reference files will not be created or tagged. {ome.__cause__}" + ) + logging.error(exception_string) + raise ome except Exception as err: exception_string = ( f"FileHashStore - store_object: failed to store object for pid: {pid}." 
- + f" Unexpected {err=}, {type(err)=}" + + f" Unexpected error: {err.__cause__}" ) logging.error(exception_string) raise err @@ -1343,11 +1351,11 @@ def _move_and_get_checksums( except ValueError as ve: # If any exception is thrown during validation, exception_string = ( - "FileHashStore - _move_and_get_checksums: Object exists but cannot be verified" - + f" (validation error): {abs_file_path}, deleting temporary file. Error: {ve}" + f"FileHashStore - _move_and_get_checksums: Object already exists for pid: {pid}" + + " , deleting temp file. Ref files will not be created/tagged." ) - logging.error(exception_string) - raise ValueError from ve + logging.warning(exception_string) + raise PidObjectMetadataError(exception_string) from ve finally: # Delete the temporary file, it already exists so it is redundant self._delete(entity, tmp_file_name) @@ -1701,10 +1709,10 @@ def _verify_object_information( exception_string + f" Tmp file deleted and file not stored for pid: {pid}" ) - logging.warning(exception_string_for_pid) + logging.debug(exception_string_for_pid) raise ValueError(exception_string_for_pid) else: - logging.warning(exception_string) + logging.debug(exception_string) raise ValueError(exception_string) if checksum_algorithm is not None and checksum is not None: if checksum_algorithm not in hex_digests: @@ -1712,7 +1720,7 @@ def _verify_object_information( "FileHashStore - _verify_object_information: checksum_algorithm" + f" ({checksum_algorithm}) cannot be found in the hex digests dictionary." ) - logging.warning(exception_string) + logging.debug(exception_string) raise KeyError(exception_string) else: hex_digest_stored = hex_digests[checksum_algorithm] @@ -1729,14 +1737,14 @@ def _verify_object_information( exception_string_for_pid = ( exception_string + f" Tmp file ({tmp_file_name}) deleted." 
) - logging.warning(exception_string_for_pid) + logging.debug(exception_string_for_pid) raise ValueError(exception_string_for_pid) else: # Delete the object cid = hex_digests[self.algorithm] cid_abs_path = self._resolve_path("cid", cid) self._delete(entity, cid_abs_path) - logging.warning(exception_string) + logging.debug(exception_string) raise ValueError(exception_string) def _verify_hashstore_references(self, pid, cid, additional_log_string): @@ -2319,6 +2327,15 @@ def close(self): self._obj.seek(self._pos) +class PidObjectMetadataError(Exception): + """Custom exception thrown when an object cannot be verified due + to an error with the metadata provided to validate against.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors + + class PidRefsDoesNotExist(Exception): """Custom exception thrown when a pid refs file does not exist.""" diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index af76998a..54b24788 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -11,6 +11,7 @@ from hashstore.filehashstore import ( CidRefsDoesNotExist, + PidObjectMetadataError, PidNotFoundInCidRefsFile, PidRefsDoesNotExist, RefsFileExistsButCidObjMissing, @@ -420,7 +421,7 @@ def test_store_object_duplicate_raises_error_with_bad_validation_data(pids, stor # Store first blob _object_metadata_one = store.store_object(pid, path) # Store second blob - with pytest.raises(ValueError): + with pytest.raises(PidObjectMetadataError): _object_metadata_two = store.store_object( pid, path, checksum="nonmatchingchecksum", checksum_algorithm="sha256" ) From bf87fc6d3222dd64b6a7fe5ca36d5f78f5704548 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 22 May 2024 14:00:39 -0700 Subject: [PATCH 192/420] Fix typo in exception caught with correct name after revision --- src/hashstore/filehashstore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 89974d47..dff0a70d 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -477,7 +477,7 @@ def store_object( "FileHashStore - store_object: Successfully stored object for pid: %s", pid, ) - except ObjectMetadataError as ome: + except PidObjectMetadataError as ome: # Note, using '.__cause__' allows the original exception msg to be displayed exception_string = ( f"FileHashStore - store_object: failed to store object for pid: {pid}." From a9d17d0e6673e48b6709a76415bdfaaada4b23f5 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 22 May 2024 14:22:21 -0700 Subject: [PATCH 193/420] Improve logging messaging and revise logging levels in '_move_and_get_checksums' --- src/hashstore/filehashstore.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index dff0a70d..7da86b57 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1298,11 +1298,11 @@ def _move_and_get_checksums( # Revert storage process exception_string = ( "FileHashStore - _move_and_get_checksums:" - + f" Unexpected {err=}, {type(err)=}" + + f" Unexpected Error: {err}" ) - logging.error(exception_string) + logging.warning(exception_string) if os.path.isfile(abs_file_path): - # Check to see if object has moved successfully before deleting + # Check to see if object exists before determining whether to delete debug_msg = ( "FileHashStore - _move_and_get_checksums: Permanent file" + f" found during exception, checking hex digest for pid: {pid}" @@ -1312,28 +1312,34 @@ def _move_and_get_checksums( if pid_checksum == hex_digests.get(self.algorithm): # If the checksums match, return and log warning exception_string = ( - "FileHashStore - _move_and_get_checksums: File moved" - + f" successfully but unexpected issue encountered: {exception_string}", + "FileHashStore - 
_move_and_get_checksums: Object exists at:" + + f" {abs_file_path} but an unexpected issue has been encountered." + + " Reference files will not be created and/or tagged." ) - logging.error(exception_string) + logging.warning(exception_string) raise err else: debug_msg = ( - "FileHashStore - _move_and_get_checksums: Permanent file" - + f" found but with incomplete state, deleting file: {abs_file_path}", + "FileHashStore - _move_and_get_checksums: Object exists at" + + f"{abs_file_path} but the pid object checksum provided does not" + + " match what has been calculated. Deleting object. References will" + + " not be created and/or tagged.", ) logging.debug(debug_msg) self._delete(entity, abs_file_path) + raise err + logging.debug( "FileHashStore - _move_and_get_checksums: Deleting temporary file: %s", tmp_file_name, ) self._delete(entity, tmp_file_name) err_msg = ( - "Aborting store_object upload - an unexpected error has occurred when moving" - + f" file to: {object_cid} - Error: {err}" + f"Object has not been stored for pid: {pid} - an unexpected error has occurred" + + f" when moving tmp file to: {object_cid}. Reference files will not be" + + f" created and/or tagged. Error: {err}" ) - logging.error("FileHashStore - _move_and_get_checksums: %s", err_msg) + logging.warning("FileHashStore - _move_and_get_checksums: %s", err_msg) raise else: # If the file exists, determine if the object is what the client states it to be @@ -1352,12 +1358,14 @@ def _move_and_get_checksums( # If any exception is thrown during validation, exception_string = ( f"FileHashStore - _move_and_get_checksums: Object already exists for pid: {pid}" - + " , deleting temp file. Ref files will not be created/tagged." + + " , deleting temp file. Reference files will not be created and/or tagged" + + " due to an issue with the supplied pid object metadata." 
) logging.warning(exception_string) raise PidObjectMetadataError(exception_string) from ve finally: # Delete the temporary file, it already exists so it is redundant + # No exception is thrown so 'store_object' can proceed to tag object self._delete(entity, tmp_file_name) return object_cid, tmp_file_size, hex_digests From 7231791edc88386d762b905be17794e43fe4d4eb Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 22 May 2024 14:27:12 -0700 Subject: [PATCH 194/420] Revise 'store_object' exception messaging --- src/hashstore/filehashstore.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 7da86b57..e9f437b0 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -481,14 +481,16 @@ def store_object( # Note, using '.__cause__' allows the original exception msg to be displayed exception_string = ( f"FileHashStore - store_object: failed to store object for pid: {pid}." - + f" Reference files will not be created or tagged. {ome.__cause__}" + + " Reference files will not be created or tagged. PidObjectMetadataError:" + + ome.__cause__ ) logging.error(exception_string) raise ome except Exception as err: exception_string = ( f"FileHashStore - store_object: failed to store object for pid: {pid}." - + f" Unexpected error: {err.__cause__}" + + " Reference files will not be created or tagged. 
Unexpected error: " + + err.__cause__ ) logging.error(exception_string) raise err From 1231cbc4f66de48aeaeb1415b213eb59140daf6e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 22 May 2024 14:56:43 -0700 Subject: [PATCH 195/420] Further revise logging levels and update 'store_object' exception messaging format --- src/hashstore/filehashstore.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index e9f437b0..47aa548d 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -478,11 +478,10 @@ def store_object( pid, ) except PidObjectMetadataError as ome: - # Note, using '.__cause__' allows the original exception msg to be displayed exception_string = ( f"FileHashStore - store_object: failed to store object for pid: {pid}." - + " Reference files will not be created or tagged. PidObjectMetadataError:" - + ome.__cause__ + + " Reference files will not be created or tagged. PidObjectMetadataError: " + + str(ome) ) logging.error(exception_string) raise ome @@ -490,7 +489,7 @@ def store_object( exception_string = ( f"FileHashStore - store_object: failed to store object for pid: {pid}." + " Reference files will not be created or tagged. Unexpected error: " - + err.__cause__ + + str(err) ) logging.error(exception_string) raise err @@ -1361,9 +1360,9 @@ def _move_and_get_checksums( exception_string = ( f"FileHashStore - _move_and_get_checksums: Object already exists for pid: {pid}" + " , deleting temp file. Reference files will not be created and/or tagged" - + " due to an issue with the supplied pid object metadata." + + f" due to an issue with the supplied pid object metadata. 
{ve}" ) - logging.warning(exception_string) + logging.debug(exception_string) raise PidObjectMetadataError(exception_string) from ve finally: # Delete the temporary file, it already exists so it is redundant From aa4069adc271f9dffc772a6d8a2c2c0158dcf83a Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 23 May 2024 11:43:18 -0700 Subject: [PATCH 196/420] Fix bug with incorrectly storing refs files by resolving hardcoded values with class variable --- src/hashstore/filehashstore.py | 10 ++++++---- tests/test_filehashstore.py | 4 ++-- tests/test_filehashstore_interface.py | 10 ++++++---- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 47aa548d..00bd1af3 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -105,6 +105,8 @@ def __init__(self, properties=None): self.objects = self.root + "/objects" self.metadata = self.root + "/metadata" self.refs = self.root + "/refs" + self.cids = self.refs + "/cids" + self.pids = self.refs + "/pids" if not os.path.exists(self.objects): self._create_path(self.objects + "/tmp") if not os.path.exists(self.metadata): @@ -2167,9 +2169,9 @@ def _get_store_path(self, entity): elif entity == "refs": return Path(self.refs) elif entity == "cid": - return Path(self.refs) / "cid" + return Path(self.cids) elif entity == "pid": - return Path(self.refs) / "pid" + return Path(self.pids) else: raise ValueError( f"entity: {entity} does not exist. Do you mean 'objects', 'metadata' or 'refs'?" @@ -2210,9 +2212,9 @@ def _count(self, entity): elif entity == "metadata": directory_to_count = self.metadata elif entity == "pid": - directory_to_count = self.refs + "/pid" + directory_to_count = self.pids elif entity == "cid": - directory_to_count = self.refs + "/cid" + directory_to_count = self.cids else: raise ValueError( f"entity: {entity} does not exist. Do you mean 'objects' or 'metadata'?" 
diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 1404832a..88dec6a7 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1057,7 +1057,7 @@ def test_resolve_path_refs_pid(pids, store): resolved_pid_ref_abs_path = store._resolve_path("pid", pid) pid_refs_metadata_hashid = store._computehash(pid) calculated_pid_ref_path = ( - store.refs + "/pid/" + "/".join(store._shard(pid_refs_metadata_hashid)) + store.pids + "/" + "/".join(store._shard(pid_refs_metadata_hashid)) ) assert resolved_pid_ref_abs_path == calculated_pid_ref_path @@ -1072,6 +1072,6 @@ def test_resolve_path_refs_cid(pids, store): cid = object_metadata.cid resolved_cid_ref_abs_path = store._resolve_path("cid", cid) - calculated_cid_ref_path = store.refs + "/cid/" + "/".join(store._shard(cid)) + calculated_cid_ref_path = store.cids + "/" + "/".join(store._shard(cid)) assert resolved_cid_ref_abs_path == calculated_cid_ref_path diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 54b24788..7d86fae5 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -1243,15 +1243,17 @@ def test_store_and_delete_objects_100_pids_1_cid(store): for i in range(1, upper_limit): pid_modified = f"dou.test.{str(i)}" store.store_object(pid_modified, path) - assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/pid")]) == 100 - assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/cid")]) == 1 + assert ( + sum([len(files) for _, _, files in os.walk(store.root + "/refs/pids")]) == 100 + ) + assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/cids")]) == 1 assert store._count("objects") == 1 # Delete for i in range(1, upper_limit): pid_modified = f"dou.test.{str(i)}" store.delete_object(pid_modified) - assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/pid")]) == 0 - assert sum([len(files) for _, _, files in os.walk(store.root + 
"/refs/cid")]) == 0 + assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/pids")]) == 0 + assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/cids")]) == 0 assert store._count("objects") == 0 From 7d3ecd6a374414cddc1f0f8e1bf0afbcb50ffe8e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 23 May 2024 12:57:58 -0700 Subject: [PATCH 197/420] Fix typo in '_load_properties' signature arg --- src/hashstore/filehashstore.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 00bd1af3..9e5bdfdf 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -130,7 +130,7 @@ def __init__(self, properties=None): # Configuration and Related Methods @staticmethod - def _load_properties(hahstore_yaml_path, hashstore_required_prop_keys): + def _load_properties(hashstore_yaml_path, hashstore_required_prop_keys): """Get and return the contents of the current HashStore configuration. :return: HashStore properties with the following keys (and values): @@ -140,7 +140,7 @@ def _load_properties(hahstore_yaml_path, hashstore_required_prop_keys): - ``store_metadata_namespace`` (str): Namespace for the HashStore's system metadata. :rtype: dict """ - if not os.path.exists(hahstore_yaml_path): + if not os.path.exists(hashstore_yaml_path): exception_string = ( "FileHashStore - load_properties: hashstore.yaml not found" + " in store root path." 
@@ -149,7 +149,7 @@ def _load_properties(hahstore_yaml_path, hashstore_required_prop_keys): raise FileNotFoundError(exception_string) # Open file - with open(hahstore_yaml_path, "r", encoding="utf-8") as hs_yaml_file: + with open(hashstore_yaml_path, "r", encoding="utf-8") as hs_yaml_file: yaml_data = yaml.safe_load(hs_yaml_file) # Get hashstore properties From 63ad53c34b950b84b3b5dcf0a0d9bc6dcb65273e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 23 May 2024 13:05:03 -0700 Subject: [PATCH 198/420] Fix bug in 'hashstoreclient' where formatid is always 'None' if no formatid is passed, causing metadata document names created via the client to be incorrect --- src/hashstore/hashstoreclient.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/hashstore/hashstoreclient.py b/src/hashstore/hashstoreclient.py index c4e26474..535c84f3 100644 --- a/src/hashstore/hashstoreclient.py +++ b/src/hashstore/hashstoreclient.py @@ -1,4 +1,5 @@ """HashStore Command Line App""" + import logging import os from argparse import ArgumentParser @@ -741,6 +742,13 @@ def main(): f"Missing config file (hashstore.yaml) at store path: {store_path}." + " HashStore must first be initialized, use `--help` for more information." 
) + else: + # Get the default format_id for sysmeta + with open(store_path_config_yaml, "r", encoding="utf-8") as hs_yaml_file: + yaml_data = yaml.safe_load(hs_yaml_file) + + default_formatid = yaml_data["store_metadata_namespace"] + # Setup logging, create log file if it doesn't already exist hashstore_py_log = store_path + "/python_client.log" python_log_file_path = Path(hashstore_py_log) @@ -768,6 +776,8 @@ def main(): checksum_algorithm = getattr(args, "object_checksum_algorithm") size = getattr(args, "object_size") formatid = getattr(args, "object_formatid") + if formatid is None: + formatid = default_formatid knbvm_test = getattr(args, "knbvm_flag") # Instantiate HashStore Client props = parser.load_store_properties(store_path_config_yaml) From fcec590ef789c0ffcf211711b199fb9cb17d0c16 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 23 May 2024 13:42:12 -0700 Subject: [PATCH 199/420] Add addition logging statements to 'tag_object', 'find_object' and '_update_refs_file' --- src/hashstore/filehashstore.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 9e5bdfdf..988cf22e 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -602,6 +602,12 @@ def tag_object(self, pid, cid): self._verify_hashstore_references( pid, cid, "Created missing cid refs file" ) + info_msg = ( + f"FileHashStore - tag_object: pid refs file exists for pid: {pid}" + + f", with the expected cid: {cid} - but cid refs file is missing." + + " Cid refs file created, tagged and verified." 
+ ) + logging.info(info_msg) return True else: # Check if the retrieved cid refs file exists and pid is referenced @@ -694,6 +700,7 @@ def find_object(self, pid): + pid_ref_abs_path + f", but object referenced does not exist, cid: {pid_refs_cid}" ) + logging.error(err_msg) raise RefsFileExistsButCidObjMissing(err_msg) else: return pid_refs_cid @@ -720,6 +727,7 @@ def find_object(self, pid): f"FileHashStore - find_object: pid refs file not found for pid ({pid}): " + pid_ref_abs_path ) + logging.error(err_msg) raise PidRefsDoesNotExist(err_msg) def store_metadata(self, pid, metadata, format_id=None): @@ -1571,6 +1579,11 @@ def _update_refs_file(self, refs_file_path, ref_id, update_type): ref_file.seek(0) ref_file.writelines(new_pid_lines) ref_file.truncate() + debug_msg = ( + f"FileHashStore - _update_refs_file: Update ({update_type}) for ref_id: {ref_id}" + + f" completed on refs file: {refs_file_path}." + ) + logging.debug(debug_msg) except Exception as err: exception_string = ( f"FileHashStore - _update_refs_file: failed to {update_type} for ref_id: {ref_id}" From a520a5f16c5d9c97861a20fad8d68a3b28174a1b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 24 May 2024 10:16:51 -0700 Subject: [PATCH 200/420] Fix inaccurate logging msg in 'delete_object' and typos --- src/hashstore/filehashstore.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 988cf22e..f2c6ec84 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -554,7 +554,7 @@ def verify_object( def tag_object(self, pid, cid): logging.debug( - "FileHashStore - tag_object: Tagging object cid: {%s} with pid: {%s}.", + "FileHashStore - tag_object: Tagging object cid: %s with pid: %s.", cid, pid, ) @@ -892,7 +892,7 @@ def delete_object(self, ab_id, id_type=None): # Modify object_locked_pids consecutively with self.object_lock: logging.debug( - "FileHashStore - store_object: Adding pid: %s to 
object_locked_pids.", + "FileHashStore - delete_object: Adding pid: %s to object_locked_pids.", pid, ) self.object_locked_pids.append(pid) From 3368bec0f4dd95fa46a8bf94a9ee4d98500f1d95 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 24 May 2024 13:54:11 -0700 Subject: [PATCH 201/420] Further revise and add logging statements to 'store_object' and 'tag_object' process and related methods --- src/hashstore/filehashstore.py | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index f2c6ec84..1708d860 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -474,6 +474,10 @@ def store_object( checksum_algorithm=checksum_algorithm_checked, file_size_to_validate=expected_object_size, ) + logging.debug( + "FileHashStore - store_object: Attempting to tag object for pid: %s", + pid, + ) self.tag_object(pid, object_metadata.cid) logging.info( "FileHashStore - store_object: Successfully stored object for pid: %s", @@ -589,6 +593,12 @@ def tag_object(self, pid, cid): ) return True elif pid_ref_abs_path_exists and not cid_ref_abs_path_exists: + debug_msg = ( + f"FileHashStore - tag_object: pid refs file exists ({pid_ref_abs_path})" + + f" for pid: {pid}, but cid refs file doesn't at: {cid_ref_abs_path}" + + f" for cid: {cid}" + ) + logging.debug(debug_msg) # A pid reference file can only contain and reference one cid # First, confirm that the expected cid refs file exists by getting the cid with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: @@ -629,6 +639,11 @@ def tag_object(self, pid, cid): # but doesn't contain the cid. Proceed to overwrite the pid refs file. # There is no return statement, so we move out of this if block. 
elif not pid_ref_abs_path_exists and cid_ref_abs_path_exists: + debug_msg = ( + f"FileHashStore - tag_object: pid refs file does not exists for pid {pid}" + + f" but cid refs file exists at: {cid_ref_abs_path} for cid: {cid}" + ) + logging.debug(debug_msg) # Create the pid refs file pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") self._create_path(os.path.dirname(pid_ref_abs_path)) @@ -660,7 +675,8 @@ def tag_object(self, pid, cid): shutil.move(cid_tmp_file_path, cid_ref_abs_path) # Ensure that the reference files have been written as expected # If there is an issue, client or user will have to manually review - self._verify_hashstore_references(pid, cid, "Created all refs files") + log_msg = "Reference files have been moved to their permanent location." + self._verify_hashstore_references(pid, cid, log_msg) logging.info( "FileHashStore - tag_object: Successfully tagged cid: %s with pid %s", cid, @@ -932,6 +948,11 @@ def delete_object(self, ab_id, id_type=None): self._update_refs_file(cid_ref_abs_path, pid, "remove") # Delete cid reference file and object only if the cid refs file is empty if os.path.getsize(cid_ref_abs_path) == 0: + debug_msg = ( + "FileHashStore - delete_object: cid_refs_file is empty (size == 0):" + + f" {cid_ref_abs_path} - deleting cid refs file and data object." + ) + logging.debug(debug_msg) objects_to_delete.append( self._rename_path_for_deletion(cid_ref_abs_path) ) @@ -1507,7 +1528,7 @@ def _write_refs_file(self, path, ref_id, ref_type): difference being that a cid reference file can potentially contain multiple lines of `pid`s that reference the `cid`. 
- :param str path: Directory to write the temporary file + :param str path: Directory to write a temporary file into :param str ref_id: Authority-based, persistent or content identifier :param str ref_type: 'cid' or 'pid' @@ -1515,7 +1536,7 @@ def _write_refs_file(self, path, ref_id, ref_type): :rtype: string """ logging.debug( - "FileHashStore - write_cid_refs_file: Writing id (%s) into file: %s", + "FileHashStore - _write_refs_file: Writing id (%s) into a tmp file in: %s", ref_id, path, ) @@ -1779,6 +1800,11 @@ def _verify_hashstore_references(self, pid, cid, additional_log_string): :param str cid: Content identifier. :param str additional_log_string: String to append to exception statement """ + debug_msg = ( + f"FileHashStore - _verify_hashstore_references: verifying pid ({pid})" + + f" and cid ({cid}) refs files. Additional Note: {additional_log_string}" + ) + logging.debug(debug_msg) # Check that reference files were created pid_ref_abs_path = self._resolve_path("pid", pid) cid_ref_abs_path = self._resolve_path("cid", cid) From 2db9a8c1a2373d536b56a5e53612558ab394090b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 24 May 2024 16:07:42 -0700 Subject: [PATCH 202/420] Revise debug statement for 'tag_object' --- src/hashstore/filehashstore.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 1708d860..d422de3a 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -574,8 +574,8 @@ def tag_object(self, pid, cid): # Modify reference_locked_cids consecutively with self.reference_lock: logging.debug( - "FileHashStore - tag_object: Adding cid: %s to reference_locked_cids.", - cid, + "FileHashStore - tag_object: Locking cid: %s to to tag pid: %s.", + cid, pid ) self.reference_locked_cids.append(cid) try: From af4e7b713e1bd88edd37bb8c415632602a168807 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 24 May 2024 16:10:03 -0700 Subject: [PATCH 203/420] 
Rename variables in 'tag_object' and update debugging message --- src/hashstore/filehashstore.py | 41 +++++++++++++++++----------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index d422de3a..64ca52ec 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -575,15 +575,16 @@ def tag_object(self, pid, cid): with self.reference_lock: logging.debug( "FileHashStore - tag_object: Locking cid: %s to to tag pid: %s.", - cid, pid + cid, + pid, ) self.reference_locked_cids.append(cid) try: tmp_root_path = self._get_store_path("refs") / "tmp" - pid_ref_abs_path = self._resolve_path("pid", pid) - cid_ref_abs_path = self._resolve_path("cid", cid) - pid_ref_abs_path_exists = os.path.exists(pid_ref_abs_path) - cid_ref_abs_path_exists = os.path.exists(cid_ref_abs_path) + pid_refs_path = self._resolve_path("pid", pid) + cid_refs_path = self._resolve_path("cid", cid) + pid_ref_abs_path_exists = os.path.exists(pid_refs_path) + cid_ref_abs_path_exists = os.path.exists(cid_refs_path) if pid_ref_abs_path_exists and cid_ref_abs_path_exists: self._verify_hashstore_references( @@ -594,21 +595,21 @@ def tag_object(self, pid, cid): return True elif pid_ref_abs_path_exists and not cid_ref_abs_path_exists: debug_msg = ( - f"FileHashStore - tag_object: pid refs file exists ({pid_ref_abs_path})" - + f" for pid: {pid}, but cid refs file doesn't at: {cid_ref_abs_path}" + f"FileHashStore - tag_object: pid refs file exists ({pid_refs_path})" + + f" for pid: {pid}, but cid refs file doesn't at: {cid_refs_path}" + f" for cid: {cid}" ) logging.debug(debug_msg) # A pid reference file can only contain and reference one cid # First, confirm that the expected cid refs file exists by getting the cid - with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: + with open(pid_refs_path, "r", encoding="utf8") as pid_ref_file: pid_refs_cid = pid_ref_file.read() - if 
self._is_string_in_refs_file(cid, pid_ref_abs_path): + if self._is_string_in_refs_file(cid, pid_refs_path): # The pid correctly references the given cid, but the cid refs file is missing cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") - self._create_path(os.path.dirname(cid_ref_abs_path)) - shutil.move(cid_tmp_file_path, cid_ref_abs_path) + self._create_path(os.path.dirname(cid_refs_path)) + shutil.move(cid_tmp_file_path, cid_refs_path) self._verify_hashstore_references( pid, cid, "Created missing cid refs file" ) @@ -641,16 +642,16 @@ def tag_object(self, pid, cid): elif not pid_ref_abs_path_exists and cid_ref_abs_path_exists: debug_msg = ( f"FileHashStore - tag_object: pid refs file does not exists for pid {pid}" - + f" but cid refs file exists at: {cid_ref_abs_path} for cid: {cid}" + + f" but cid refs file exists at: {cid_refs_path} for cid: {cid}" ) logging.debug(debug_msg) # Create the pid refs file pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") - self._create_path(os.path.dirname(pid_ref_abs_path)) - shutil.move(pid_tmp_file_path, pid_ref_abs_path) + self._create_path(os.path.dirname(pid_refs_path)) + shutil.move(pid_tmp_file_path, pid_refs_path) # Update cid ref files as it already exists - if not self._is_string_in_refs_file(pid, cid_ref_abs_path): - self._update_refs_file(cid_ref_abs_path, pid, "add") + if not self._is_string_in_refs_file(pid, cid_refs_path): + self._update_refs_file(cid_refs_path, pid, "add") self._verify_hashstore_references( pid, cid, @@ -668,11 +669,11 @@ def tag_object(self, pid, cid): pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' - self._create_path(os.path.dirname(pid_ref_abs_path)) - self._create_path(os.path.dirname(cid_ref_abs_path)) + self._create_path(os.path.dirname(pid_refs_path)) + 
self._create_path(os.path.dirname(cid_refs_path)) # Move both files - shutil.move(pid_tmp_file_path, pid_ref_abs_path) - shutil.move(cid_tmp_file_path, cid_ref_abs_path) + shutil.move(pid_tmp_file_path, pid_refs_path) + shutil.move(cid_tmp_file_path, cid_refs_path) # Ensure that the reference files have been written as expected # If there is an issue, client or user will have to manually review log_msg = "Reference files have been moved to their permanent location." From fffa0e52f03d11f05687194419bdf5ae90169028 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 24 May 2024 16:43:24 -0700 Subject: [PATCH 204/420] Revise '_verify_hashstore_references' method and update pytests --- src/hashstore/filehashstore.py | 43 ++++++++++++++++++-------- tests/test_filehashstore_references.py | 12 ++++--- 2 files changed, 37 insertions(+), 18 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 64ca52ec..7c78cbc9 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -590,6 +590,8 @@ def tag_object(self, pid, cid): self._verify_hashstore_references( pid, cid, + pid_refs_path, + cid_refs_path, "Refs file already exists, verifying.", ) return True @@ -611,7 +613,11 @@ def tag_object(self, pid, cid): self._create_path(os.path.dirname(cid_refs_path)) shutil.move(cid_tmp_file_path, cid_refs_path) self._verify_hashstore_references( - pid, cid, "Created missing cid refs file" + pid, + cid, + pid_refs_path, + cid_refs_path, + "Created missing cid refs file", ) info_msg = ( f"FileHashStore - tag_object: pid refs file exists for pid: {pid}" @@ -655,6 +661,8 @@ def tag_object(self, pid, cid): self._verify_hashstore_references( pid, cid, + pid_refs_path, + cid_refs_path, "Pid refs file doesn't exist, but cid refs exists.", ) logging.info( @@ -677,7 +685,9 @@ def tag_object(self, pid, cid): # Ensure that the reference files have been written as expected # If there is an issue, client or user will have to manually review 
log_msg = "Reference files have been moved to their permanent location." - self._verify_hashstore_references(pid, cid, log_msg) + self._verify_hashstore_references( + pid, cid, pid_refs_path, cid_refs_path, log_msg + ) logging.info( "FileHashStore - tag_object: Successfully tagged cid: %s with pid %s", cid, @@ -1793,12 +1803,16 @@ def _verify_object_information( logging.debug(exception_string) raise ValueError(exception_string) - def _verify_hashstore_references(self, pid, cid, additional_log_string): + def _verify_hashstore_references( + self, pid, cid, pid_refs_path=None, cid_refs_path=None, additional_log_string=None + ): """Verifies that the supplied pid and pid reference file and content have been written successfully. :param str pid: Authority-based or persistent identifier. :param str cid: Content identifier. + :param str pid_refs_path: Path to pid refs file + :param str cid_refs_path: Path to cid refs file :param str additional_log_string: String to append to exception statement """ debug_msg = ( @@ -1806,43 +1820,46 @@ def _verify_hashstore_references(self, pid, cid, additional_log_string): + f" and cid ({cid}) refs files. Additional Note: {additional_log_string}" ) logging.debug(debug_msg) + if pid_refs_path is None: + pid_refs_path = self._resolve_path("pid", pid) + if cid_refs_path is None: + cid_refs_path = self._resolve_path("cid", cid) + # Check that reference files were created - pid_ref_abs_path = self._resolve_path("pid", pid) - cid_ref_abs_path = self._resolve_path("cid", cid) - if not os.path.exists(pid_ref_abs_path): + if not os.path.exists(pid_refs_path): exception_string = ( "FileHashStore - _verify_hashstore_references: Pid refs file missing: " - + pid_ref_abs_path + + pid_refs_path + f" . 
Additional Context: {additional_log_string}" ) logging.error(exception_string) raise FileNotFoundError(exception_string) - if not os.path.exists(cid_ref_abs_path): + if not os.path.exists(cid_refs_path): exception_string = ( "FileHashStore - _verify_hashstore_references: Cid refs file missing: " - + cid_ref_abs_path + + cid_refs_path + f" . Additional Context: {additional_log_string}" ) logging.error(exception_string) raise FileNotFoundError(exception_string) # Check the content of the reference files # Start with the cid - with open(pid_ref_abs_path, "r", encoding="utf8") as f: + with open(pid_refs_path, "r", encoding="utf8") as f: retrieved_cid = f.read() if retrieved_cid != cid: exception_string = ( "FileHashStore - _verify_hashstore_references: Pid refs file exists" - + f" ({pid_ref_abs_path}) but cid ({cid}) does not match." + + f" ({pid_refs_path}) but cid ({cid}) does not match." + f" Additional Context: {additional_log_string}" ) logging.error(exception_string) raise ValueError(exception_string) # Then the pid - pid_found = self._is_string_in_refs_file(pid, cid_ref_abs_path) + pid_found = self._is_string_in_refs_file(pid, cid_refs_path) if not pid_found: exception_string = ( "FileHashStore - _verify_hashstore_references: Cid refs file exists" - + f" ({cid_ref_abs_path}) but pid ({pid}) not found." + + f" ({cid_refs_path}) but pid ({pid}) not found." 
+ f" Additional Context: {additional_log_string}" ) logging.error(exception_string) diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 87a30f26..545a537f 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -360,7 +360,7 @@ def test_verify_hashstore_references_pid_refs_file_missing(pids, store): for pid in pids.keys(): cid = pids[pid]["sha256"] with pytest.raises(FileNotFoundError): - store._verify_hashstore_references(pid, cid, "create") + store._verify_hashstore_references(pid, cid) def test_verify_hashstore_references_pid_refs_incorrect_cid(pids, store): @@ -371,17 +371,19 @@ def test_verify_hashstore_references_pid_refs_incorrect_cid(pids, store): tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") cid_ref_abs_path = store._resolve_path("cid", cid) + print(cid_ref_abs_path) store._create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) # Write the pid refs file and move it where it needs to be with a bad cid pid_ref_abs_path = store._resolve_path("pid", pid) + print(pid_ref_abs_path) store._create_path(os.path.dirname(pid_ref_abs_path)) tmp_root_path = store._get_store_path("refs") / "tmp" tmp_pid_refs_file = store._write_refs_file(tmp_root_path, "bad_cid", "pid") shutil.move(tmp_pid_refs_file, pid_ref_abs_path) with pytest.raises(ValueError): - store._verify_hashstore_references(pid, cid, "create") + store._verify_hashstore_references(pid, cid) def test_verify_hashstore_references_cid_refs_file_missing(pids, store): @@ -395,7 +397,7 @@ def test_verify_hashstore_references_cid_refs_file_missing(pids, store): shutil.move(tmp_pid_refs_file, pid_ref_abs_path) with pytest.raises(FileNotFoundError): - store._verify_hashstore_references(pid, cid, "create") + store._verify_hashstore_references(pid, cid) def 
test_verify_hashstore_references_cid_refs_file_missing_pid(pids, store): @@ -417,7 +419,7 @@ def test_verify_hashstore_references_cid_refs_file_missing_pid(pids, store): shutil.move(tmp_pid_refs_file, pid_ref_abs_path) with pytest.raises(ValueError): - store._verify_hashstore_references(pid, cid, "create") + store._verify_hashstore_references(pid, cid) def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pid( @@ -446,4 +448,4 @@ def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pi cid_reference_list.append(f"dou.test.{i}") with pytest.raises(ValueError): - store._verify_hashstore_references(pid, cid, "create") + store._verify_hashstore_references(pid, cid) From 6521b1034dfafff32ab06918fe80795d374b08fb Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 24 May 2024 16:51:08 -0700 Subject: [PATCH 205/420] Test: add while block to 'store_object' to wait before calling 'tag_object' if the cid is currently in use --- src/hashstore/filehashstore.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 7c78cbc9..53a77e88 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -478,7 +478,15 @@ def store_object( "FileHashStore - store_object: Attempting to tag object for pid: %s", pid, ) - self.tag_object(pid, object_metadata.cid) + cid = object_metadata.cid + while cid in self.reference_locked_cids: + logging.debug( + "FileHashStore - store_object: Waiting to tag pid (%s) with cid (%s)", + pid, + cid, + ) + time.sleep(self.time_out_sec) + self.tag_object(pid, cid) logging.info( "FileHashStore - store_object: Successfully stored object for pid: %s", pid, @@ -1804,7 +1812,12 @@ def _verify_object_information( raise ValueError(exception_string) def _verify_hashstore_references( - self, pid, cid, pid_refs_path=None, cid_refs_path=None, additional_log_string=None + self, + pid, + cid, + 
pid_refs_path=None, + cid_refs_path=None, + additional_log_string=None, ): """Verifies that the supplied pid and pid reference file and content have been written successfully. From a68a2fac0f559f756713e90d4b03456b0aabae62 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 27 May 2024 08:49:58 -0700 Subject: [PATCH 206/420] Revert 'tag_object' changes --- src/hashstore/filehashstore.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 53a77e88..e9d1ac7c 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -479,13 +479,6 @@ def store_object( pid, ) cid = object_metadata.cid - while cid in self.reference_locked_cids: - logging.debug( - "FileHashStore - store_object: Waiting to tag pid (%s) with cid (%s)", - pid, - cid, - ) - time.sleep(self.time_out_sec) self.tag_object(pid, cid) logging.info( "FileHashStore - store_object: Successfully stored object for pid: %s", From 13a4b45dc699963cf339c4dca56cb06dffa408f3 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 27 May 2024 11:45:15 -0700 Subject: [PATCH 207/420] Add new pytest in 'filehashstore_interface' test module to check python threads with multiple pids for one cid --- tests/test_filehashstore_interface.py | 47 +++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 7d86fae5..5511d11b 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -497,6 +497,53 @@ def store_object_wrapper(obj_pid, obj_path): assert store._exists(entity, pids[pid][store.algorithm]) +def test_store_object_threads_multiple_pids_one_cid(pids, store): + """Test store object thread lock.""" + entity = "objects" + test_dir = "tests/testdata/" + path = test_dir + "jtao.1700.1" + pid_list = ["jtao.1700.1"] + for n in range(0, 5): + pid_list.append(f"dou.test.{n}") + + def store_object_wrapper(obj_pid, 
obj_path): + store.store_object(obj_pid, obj_path) # Call store_object inside the thread + + thread1 = Thread(target=store_object_wrapper, args=(pid_list[0], path)) + thread2 = Thread(target=store_object_wrapper, args=(pid_list[1], path)) + thread3 = Thread(target=store_object_wrapper, args=(pid_list[2], path)) + thread4 = Thread(target=store_object_wrapper, args=(pid_list[3], path)) + thread5 = Thread(target=store_object_wrapper, args=(pid_list[4], path)) + thread6 = Thread(target=store_object_wrapper, args=(pid_list[5], path)) + thread1.start() + thread2.start() + thread3.start() + thread4.start() + thread5.start() + thread6.start() + thread1.join() + thread2.join() + thread3.join() + thread4.join() + thread5.join() + thread6.join() + # One thread will succeed, file count must still be 1 + assert store._count(entity) == 1 + assert store._exists(entity, pids["jtao.1700.1"][store.algorithm]) + + cid_refs_path = store._resolve_path( + "cid", "94f9b6c88f1f458e410c30c351c6384ea42ac1b5ee1f8430d3e365e43b78a38a" + ) + number_of_pids_reffed = 0 + with open(cid_refs_path, "r", encoding="utf8") as ref_file: + # Confirm that pid is not currently already tagged + for pid in ref_file: + number_of_pids_reffed += 1 + assert pid.strip() in pid_list + + assert number_of_pids_reffed == 6 + + @slow_test def test_store_object_interrupt_process(store): """Test that tmp file created when storing a large object (2GB) and From ba76b51cd5c75754d4597a7d800e9969282242fb Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 27 May 2024 11:58:37 -0700 Subject: [PATCH 208/420] Test: Implement check to see if multiprocessing is being used to select the appropriate thread or multiprocessing 'tag_object' --- src/hashstore/filehashstore.py | 59 ++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 13 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index e9d1ac7c..75f75d7a 100644 --- a/src/hashstore/filehashstore.py +++ 
b/src/hashstore/filehashstore.py @@ -2,6 +2,7 @@ import atexit import io +import multiprocessing import shutil import threading import time @@ -62,6 +63,7 @@ class FileHashStore(HashStore): object_lock = threading.Lock() metadata_lock = threading.Lock() reference_lock = threading.Lock() + reference_lock_mp = multiprocessing.Lock() object_locked_pids = [] metadata_locked_pids = [] reference_locked_cids = [] @@ -573,13 +575,30 @@ def tag_object(self, pid, cid): ) time.sleep(self.time_out_sec) # Modify reference_locked_cids consecutively - with self.reference_lock: - logging.debug( - "FileHashStore - tag_object: Locking cid: %s to to tag pid: %s.", - cid, - pid, - ) - self.reference_locked_cids.append(cid) + use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" + if use_multiprocessing: + with self.reference_lock_mp: + logging.debug( + "FileHashStore - tag_object: (mp) Locking cid: %s to to tag pid: %s.", + cid, + pid, + ) + self.reference_locked_cids.append(cid) + else: + with self.reference_lock: + logging.debug( + "FileHashStore - tag_object: Locking cid: %s to to tag pid: %s.", + cid, + pid, + ) + self.reference_locked_cids.append(cid) + # with self.reference_lock: + # logging.debug( + # "FileHashStore - tag_object: Locking cid: %s to to tag pid: %s.", + # cid, + # pid, + # ) + # self.reference_locked_cids.append(cid) try: tmp_root_path = self._get_store_path("refs") / "tmp" pid_refs_path = self._resolve_path("pid", pid) @@ -697,12 +716,26 @@ def tag_object(self, pid, cid): return True finally: # Release cid - with self.reference_lock: - logging.debug( - "FileHashStore - tag_object: Removing cid: %s from reference_locked_cids.", - cid, - ) - self.reference_locked_cids.remove(cid) + if use_multiprocessing: + with self.reference_lock_mp: + logging.debug( + "FileHashStore - tag_object: (mp) Removing cid: %s from reference_locked_cids.", + cid, + ) + self.reference_locked_cids.remove(cid) + else: + with self.reference_lock: + logging.debug( + 
"FileHashStore - tag_object: Removing cid: %s from reference_locked_cids.", + cid, + ) + self.reference_locked_cids.remove(cid) + # with self.reference_lock: + # logging.debug( + # "FileHashStore - tag_object: Removing cid: %s from reference_locked_cids.", + # cid, + # ) + # self.reference_locked_cids.remove(cid) def find_object(self, pid): logging.debug( From 02599723a405a775785cf80c50cadfd555f9bf14 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 27 May 2024 12:37:59 -0700 Subject: [PATCH 209/420] Test: Revise 'tag_object' synchronization by adding while statement for a shared list if multiprocessing is being used --- src/hashstore/filehashstore.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 75f75d7a..3bc7e3ef 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -64,6 +64,7 @@ class FileHashStore(HashStore): metadata_lock = threading.Lock() reference_lock = threading.Lock() reference_lock_mp = multiprocessing.Lock() + reference_locked_cids_mp = multiprocessing.Manager().list() # Create a shared list object_locked_pids = [] metadata_locked_pids = [] reference_locked_cids = [] @@ -568,23 +569,29 @@ def tag_object(self, pid, cid): self._check_string(pid, "pid", "tag_object") self._check_string(cid, "cid", "tag_object") # Wait for the cid to release if it's being tagged - while cid in self.reference_locked_cids: - logging.debug( - "FileHashStore - tag_object: (cid) %s is currently locked. Waiting.", - cid, - ) - time.sleep(self.time_out_sec) # Modify reference_locked_cids consecutively use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" if use_multiprocessing: + while cid in self.reference_locked_cids_mp: + logging.debug( + "FileHashStore - tag_object (mp): (cid) %s is currently locked. 
Waiting.", + cid, + ) + time.sleep(self.time_out_sec) with self.reference_lock_mp: logging.debug( - "FileHashStore - tag_object: (mp) Locking cid: %s to to tag pid: %s.", + "FileHashStore - tag_object (mp): Locking cid: %s to to tag pid: %s.", cid, pid, ) - self.reference_locked_cids.append(cid) + self.reference_locked_cids_mp.append(cid) else: + while cid in self.reference_locked_cids: + logging.debug( + "FileHashStore - tag_object: (cid) %s is currently locked. Waiting.", + cid, + ) + time.sleep(self.time_out_sec) with self.reference_lock: logging.debug( "FileHashStore - tag_object: Locking cid: %s to to tag pid: %s.", @@ -719,10 +726,10 @@ def tag_object(self, pid, cid): if use_multiprocessing: with self.reference_lock_mp: logging.debug( - "FileHashStore - tag_object: (mp) Removing cid: %s from reference_locked_cids.", + "FileHashStore - tag_object (mp): Removing cid: %s from reference_locked_cids.", cid, ) - self.reference_locked_cids.remove(cid) + self.reference_locked_cids_mp.remove(cid) else: with self.reference_lock: logging.debug( From 9a14af55504b4333eb7f7dc322c8c9539cd7823c Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 27 May 2024 12:51:48 -0700 Subject: [PATCH 210/420] Test: Set 'USE_MULTIPROCESSING' global variable in HashStoreClient --- src/hashstore/hashstoreclient.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/hashstore/hashstoreclient.py b/src/hashstore/hashstoreclient.py index 535c84f3..decb83fb 100644 --- a/src/hashstore/hashstoreclient.py +++ b/src/hashstore/hashstoreclient.py @@ -285,6 +285,15 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num, skip_obj_size) ) logging.info(info_msg) + # Test Begin + # Set multiprocessing to true + os.environ["USE_MULTIPROCESSING"] = "True" + use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" + logging.info( + "HashStoreClient - use_multiprocessing (bool): %s", use_multiprocessing + ) + # Test End + # Get list of objects to store from metacat db if 
obj_type == self.OBJ_TYPE: checked_obj_list = self.metacatdb.refine_list_for_objects( From 57b37bc0c44dceeaf51f237369db646191ca7984 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 27 May 2024 13:21:43 -0700 Subject: [PATCH 211/420] Remove commented out code, retaining test changes for 'tag_object' to use multiprocessing lock and list when global variable set appropriately --- src/hashstore/filehashstore.py | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 3bc7e3ef..366706df 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -574,8 +574,10 @@ def tag_object(self, pid, cid): if use_multiprocessing: while cid in self.reference_locked_cids_mp: logging.debug( - "FileHashStore - tag_object (mp): (cid) %s is currently locked. Waiting.", + "FileHashStore - tag_object (mp): (cid) %s is currently locked. Waiting" + + " to tag pid: %s", cid, + pid, ) time.sleep(self.time_out_sec) with self.reference_lock_mp: @@ -599,13 +601,7 @@ def tag_object(self, pid, cid): pid, ) self.reference_locked_cids.append(cid) - # with self.reference_lock: - # logging.debug( - # "FileHashStore - tag_object: Locking cid: %s to to tag pid: %s.", - # cid, - # pid, - # ) - # self.reference_locked_cids.append(cid) + try: tmp_root_path = self._get_store_path("refs") / "tmp" pid_refs_path = self._resolve_path("pid", pid) @@ -726,8 +722,8 @@ def tag_object(self, pid, cid): if use_multiprocessing: with self.reference_lock_mp: logging.debug( - "FileHashStore - tag_object (mp): Removing cid: %s from reference_locked_cids.", - cid, + "FileHashStore - tag_object (mp): Removing cid: %s from" + + " reference_locked_cids.", cid, ) self.reference_locked_cids_mp.remove(cid) else: @@ -737,12 +733,6 @@ def tag_object(self, pid, cid): cid, ) self.reference_locked_cids.remove(cid) - # with self.reference_lock: - # logging.debug( - # "FileHashStore - tag_object: Removing cid: 
%s from reference_locked_cids.", - # cid, - # ) - # self.reference_locked_cids.remove(cid) def find_object(self, pid): logging.debug( From c55dae1db8ac52166943ddcc29b1fef859fdb821 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 27 May 2024 13:23:21 -0700 Subject: [PATCH 212/420] Add todo items for implementing multiprocessing locks --- src/hashstore/filehashstore.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 366706df..7717247b 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -450,6 +450,7 @@ def store_object( additional_algorithm, checksum, checksum_algorithm ) + # TODO: Implement multiprocessing lock check like 'tag_object' # Wait for the pid to release if it's in use while pid in self.object_locked_pids: logging.debug( @@ -723,7 +724,8 @@ def tag_object(self, pid, cid): with self.reference_lock_mp: logging.debug( "FileHashStore - tag_object (mp): Removing cid: %s from" - + " reference_locked_cids.", cid, + + " reference_locked_cids.", + cid, ) self.reference_locked_cids_mp.remove(cid) else: @@ -797,6 +799,7 @@ def store_metadata(self, pid, metadata, format_id=None): checked_format_id = self._check_arg_format_id(format_id, "store_metadata") self._check_arg_data(metadata) + # TODO: Implement multiprocessing lock check like 'tag_object' # Wait for the pid to release if it's in use while pid in self.metadata_locked_pids: logging.debug( @@ -903,6 +906,7 @@ def delete_object(self, ab_id, id_type=None): # If the refs file still exists, do not delete the object if not os.path.exists(cid_refs_abs_path): cid = ab_id + # TODO: Implement multiprocessing lock check like 'tag_object' # Synchronize the cid while cid in self.reference_locked_cids: logging.debug( @@ -1123,6 +1127,7 @@ def delete_metadata(self, pid, format_id=None): "FileHashStore - delete_metadata: Request to delete metadata for pid: %s", pid, ) + # TODO: Implement 
multiprocessing lock check like 'tag_object' self._check_string(pid, "pid", "delete_metadata") checked_format_id = self._check_arg_format_id(format_id, "delete_metadata") # Get the metadata directory path for the given pid From c054fe7ba57fd816a64aeadac4e5c41b6b17a667 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 27 May 2024 13:50:46 -0700 Subject: [PATCH 213/420] Test: Refactor 'tag_object' to check existence for files within if-elif syntax instead of pre-generating exist booleans --- src/hashstore/filehashstore.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 7717247b..10f4e5b6 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -607,10 +607,10 @@ def tag_object(self, pid, cid): tmp_root_path = self._get_store_path("refs") / "tmp" pid_refs_path = self._resolve_path("pid", pid) cid_refs_path = self._resolve_path("cid", cid) - pid_ref_abs_path_exists = os.path.exists(pid_refs_path) - cid_ref_abs_path_exists = os.path.exists(cid_refs_path) + # pid_ref_abs_path_exists = os.path.exists(pid_refs_path) + # cid_ref_abs_path_exists = os.path.exists(cid_refs_path) - if pid_ref_abs_path_exists and cid_ref_abs_path_exists: + if os.path.exists(pid_refs_path) and os.path.exists(cid_refs_path): self._verify_hashstore_references( pid, cid, @@ -619,7 +619,7 @@ def tag_object(self, pid, cid): "Refs file already exists, verifying.", ) return True - elif pid_ref_abs_path_exists and not cid_ref_abs_path_exists: + elif os.path.exists(pid_refs_path) and not os.path.exists(cid_refs_path): debug_msg = ( f"FileHashStore - tag_object: pid refs file exists ({pid_refs_path})" + f" for pid: {pid}, but cid refs file doesn't at: {cid_refs_path}" @@ -669,7 +669,7 @@ def tag_object(self, pid, cid): # Orphaned pid refs file found, the retrieved cid refs file exists # but doesn't contain the cid. Proceed to overwrite the pid refs file. 
# There is no return statement, so we move out of this if block. - elif not pid_ref_abs_path_exists and cid_ref_abs_path_exists: + elif not os.path.exists(pid_refs_path) and os.path.exists(cid_refs_path): debug_msg = ( f"FileHashStore - tag_object: pid refs file does not exists for pid {pid}" + f" but cid refs file exists at: {cid_refs_path} for cid: {cid}" From ecf0517b0eef528ae474044bee62dcc880784879 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 28 May 2024 10:32:12 -0700 Subject: [PATCH 214/420] Improve clarity in 'tag_object' else block by throwing new custom exception and adding a new debug statement, and update pytest --- src/hashstore/filehashstore.py | 34 +++++++++++++++++--------- tests/test_filehashstore_references.py | 4 ++- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 10f4e5b6..caf55340 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -653,22 +653,24 @@ def tag_object(self, pid, cid): else: # Check if the retrieved cid refs file exists and pid is referenced retrieved_cid_refs_path = self._resolve_path("cid", pid_refs_cid) - retrieved_cid_refs_path_exists = os.path.exists( + if os.path.exists( retrieved_cid_refs_path - ) - if retrieved_cid_refs_path_exists and self._is_string_in_refs_file( - pid, retrieved_cid_refs_path - ): + ) and self._is_string_in_refs_file(pid, retrieved_cid_refs_path): # Throw exception, this pid is accounted for - exception_string = ( + error_msg = ( "FileHashStore - tag_object: Pid refs file exists with valid pid" + f" and cid reference files for pid: {pid} with cid: {cid}." ) - logging.error(exception_string) - raise FileExistsError(exception_string) - # Orphaned pid refs file found, the retrieved cid refs file exists - # but doesn't contain the cid. Proceed to overwrite the pid refs file. - # There is no return statement, so we move out of this if block. 
+ logging.error(error_msg) + raise PidRefsExistsError(error_msg) + else: + debug_msg = ( + f"FileHashStore - tag_object: Orphan pid refs file found for {pid}." + + f" Cid ({cid}) reference file does not contain the pid. Proceeding." + ) + logging.debug(debug_msg) + # Orphaned pid refs file found, the retrieved cid refs file exists + # but doesn't contain the cid. Proceed to overwrite the pid refs file. elif not os.path.exists(pid_refs_path) and os.path.exists(cid_refs_path): debug_msg = ( f"FileHashStore - tag_object: pid refs file does not exists for pid {pid}" @@ -2436,6 +2438,16 @@ def close(self): self._obj.seek(self._pos) +class PidRefsExistsError(Exception): + """Custom exception thrown when a client calls 'tag_object' and the pid + that is being tagged is already accounted for (has a pid refs file and + is found in the cid refs file).""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors + + class PidObjectMetadataError(Exception): """Custom exception thrown when an object cannot be verified due to an error with the metadata provided to validate against.""" diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 545a537f..8b8fe95c 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -4,6 +4,8 @@ import shutil import pytest +from hashstore.filehashstore import PidRefsExistsError + # pylint: disable=W0212 @@ -112,7 +114,7 @@ def test_tag_object_pid_refs_found_cid_refs_not_found_different_cid_retrieved(st path = test_dir + pid.replace("/", "_") _object_metadata = store.store_object(pid, path) - with pytest.raises(FileExistsError): + with pytest.raises(PidRefsExistsError): store.tag_object(pid, "another_cid_value_that_is_not_found") From 1de704b220e0cb742a2b99d1f546f45914ee9676 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 28 May 2024 10:59:41 -0700 Subject: [PATCH 215/420] Reduce time spent in 'tag_object' try-finally block 
by moving code to setup tmp refs files and paths outside of synchronization code --- src/hashstore/filehashstore.py | 41 +++++++++++++--------------------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index caf55340..bbf5743a 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -569,8 +569,20 @@ def tag_object(self, pid, cid): ) self._check_string(pid, "pid", "tag_object") self._check_string(cid, "cid", "tag_object") - # Wait for the cid to release if it's being tagged + # Prepare files and paths + tmp_root_path = self._get_store_path("refs") / "tmp" + pid_refs_path = self._resolve_path("pid", pid) + cid_refs_path = self._resolve_path("cid", cid) + # All ref files begin as tmp files and get moved sequentially at once + # Get tmp files with the expected cid and pid refs content + pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") + cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") + # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' + self._create_path(os.path.dirname(pid_refs_path)) + self._create_path(os.path.dirname(cid_refs_path)) + # Modify reference_locked_cids consecutively + # Wait for the cid to release if it's being tagged use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" if use_multiprocessing: while cid in self.reference_locked_cids_mp: @@ -604,12 +616,6 @@ def tag_object(self, pid, cid): self.reference_locked_cids.append(cid) try: - tmp_root_path = self._get_store_path("refs") / "tmp" - pid_refs_path = self._resolve_path("pid", pid) - cid_refs_path = self._resolve_path("cid", cid) - # pid_ref_abs_path_exists = os.path.exists(pid_refs_path) - # cid_ref_abs_path_exists = os.path.exists(cid_refs_path) - if os.path.exists(pid_refs_path) and os.path.exists(cid_refs_path): self._verify_hashstore_references( pid, @@ -633,8 +639,6 @@ def tag_object(self, 
pid, cid): if self._is_string_in_refs_file(cid, pid_refs_path): # The pid correctly references the given cid, but the cid refs file is missing - cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") - self._create_path(os.path.dirname(cid_refs_path)) shutil.move(cid_tmp_file_path, cid_refs_path) self._verify_hashstore_references( pid, @@ -669,17 +673,13 @@ def tag_object(self, pid, cid): + f" Cid ({cid}) reference file does not contain the pid. Proceeding." ) logging.debug(debug_msg) - # Orphaned pid refs file found, the retrieved cid refs file exists - # but doesn't contain the cid. Proceed to overwrite the pid refs file. elif not os.path.exists(pid_refs_path) and os.path.exists(cid_refs_path): debug_msg = ( f"FileHashStore - tag_object: pid refs file does not exists for pid {pid}" + f" but cid refs file exists at: {cid_refs_path} for cid: {cid}" ) logging.debug(debug_msg) - # Create the pid refs file - pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") - self._create_path(os.path.dirname(pid_refs_path)) + # Move the pid refs file shutil.move(pid_tmp_file_path, pid_refs_path) # Update cid ref files as it already exists if not self._is_string_in_refs_file(pid, cid_refs_path): @@ -698,19 +698,10 @@ def tag_object(self, pid, cid): ) return True - # All ref files begin as tmp files and get moved sequentially at once - # Get tmp files with the expected cid and pid refs content - pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") - cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") - # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' - self._create_path(os.path.dirname(pid_refs_path)) - self._create_path(os.path.dirname(cid_refs_path)) - # Move both files + # Move both files after checking the existing status of refs files shutil.move(pid_tmp_file_path, pid_refs_path) shutil.move(cid_tmp_file_path, cid_refs_path) - # Ensure that the reference files have been written as 
expected - # If there is an issue, client or user will have to manually review - log_msg = "Reference files have been moved to their permanent location." + log_msg = "Reference files have been moved to their permanent location. Verifying refs." self._verify_hashstore_references( pid, cid, pid_refs_path, cid_refs_path, log_msg ) From 93f00fefd2281ca96dde1c2ab931369ea9d0a29e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 29 May 2024 13:06:21 -0700 Subject: [PATCH 216/420] Move preparation of refs file back into try statement in 'tag_object' --- src/hashstore/filehashstore.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index bbf5743a..56e35b57 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -569,17 +569,6 @@ def tag_object(self, pid, cid): ) self._check_string(pid, "pid", "tag_object") self._check_string(cid, "cid", "tag_object") - # Prepare files and paths - tmp_root_path = self._get_store_path("refs") / "tmp" - pid_refs_path = self._resolve_path("pid", pid) - cid_refs_path = self._resolve_path("cid", cid) - # All ref files begin as tmp files and get moved sequentially at once - # Get tmp files with the expected cid and pid refs content - pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") - cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") - # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' - self._create_path(os.path.dirname(pid_refs_path)) - self._create_path(os.path.dirname(cid_refs_path)) # Modify reference_locked_cids consecutively # Wait for the cid to release if it's being tagged @@ -614,8 +603,19 @@ def tag_object(self, pid, cid): pid, ) self.reference_locked_cids.append(cid) - try: + # Prepare files and paths + tmp_root_path = self._get_store_path("refs") / "tmp" + pid_refs_path = self._resolve_path("pid", pid) + cid_refs_path = 
self._resolve_path("cid", cid) + # All ref files begin as tmp files and get moved sequentially at once + # Get tmp files with the expected cid and pid refs content + pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") + cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") + # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' + self._create_path(os.path.dirname(pid_refs_path)) + self._create_path(os.path.dirname(cid_refs_path)) + if os.path.exists(pid_refs_path) and os.path.exists(cid_refs_path): self._verify_hashstore_references( pid, From a68da2083f7a42d66d18b6180c834649acbccaa7 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 29 May 2024 17:07:11 -0700 Subject: [PATCH 217/420] Revise 'tag_object' to synchronize with threading.Condition --- src/hashstore/filehashstore.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 56e35b57..975bfdfb 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -58,16 +58,21 @@ class FileHashStore(HashStore): "blake2b", "blake2s", ] - # Variables to orchestrate thread locking and object store synchronization + # Variables to orchestrate parallelization time_out_sec = 1 + # Thread Synchronization object_lock = threading.Lock() metadata_lock = threading.Lock() reference_lock = threading.Lock() + thread_condition = threading.Condition(reference_lock) + reference_locked_cids = [] + # Multiprocessing Synchronization reference_lock_mp = multiprocessing.Lock() reference_locked_cids_mp = multiprocessing.Manager().list() # Create a shared list + + # TODO: To organize object_locked_pids = [] metadata_locked_pids = [] - reference_locked_cids = [] def __init__(self, properties=None): if properties: @@ -590,18 +595,14 @@ def tag_object(self, pid, cid): ) self.reference_locked_cids_mp.append(cid) else: - while cid in 
self.reference_locked_cids: - logging.debug( - "FileHashStore - tag_object: (cid) %s is currently locked. Waiting.", - cid, - ) - time.sleep(self.time_out_sec) - with self.reference_lock: - logging.debug( - "FileHashStore - tag_object: Locking cid: %s to to tag pid: %s.", - cid, - pid, - ) + with self.thread_condition: + while cid in self.reference_locked_cids: + logging.debug( + "FileHashStore - tag_object: (cid) %s is currently locked. Waiting.", + cid, + ) + self.thread_condition.wait() + # with self.cid_lock: self.reference_locked_cids.append(cid) try: # Prepare files and paths @@ -722,12 +723,13 @@ def tag_object(self, pid, cid): ) self.reference_locked_cids_mp.remove(cid) else: - with self.reference_lock: + with self.thread_condition: logging.debug( "FileHashStore - tag_object: Removing cid: %s from reference_locked_cids.", cid, ) self.reference_locked_cids.remove(cid) + self.thread_condition.notify() def find_object(self, pid): logging.debug( From 36b140cc17b716a20e1aec05e065f82ad27a1df2 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 29 May 2024 17:10:57 -0700 Subject: [PATCH 218/420] Revise 'tag_object' to also synchronize with multiprocessing.Condition --- src/hashstore/filehashstore.py | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 975bfdfb..54337af6 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -68,6 +68,7 @@ class FileHashStore(HashStore): reference_locked_cids = [] # Multiprocessing Synchronization reference_lock_mp = multiprocessing.Lock() + multiprocessing_condition = multiprocessing.Condition(reference_lock_mp) reference_locked_cids_mp = multiprocessing.Manager().list() # Create a shared list # TODO: To organize @@ -579,21 +580,15 @@ def tag_object(self, pid, cid): # Wait for the cid to release if it's being tagged use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" 
if use_multiprocessing: - while cid in self.reference_locked_cids_mp: - logging.debug( - "FileHashStore - tag_object (mp): (cid) %s is currently locked. Waiting" - + " to tag pid: %s", - cid, - pid, - ) - time.sleep(self.time_out_sec) - with self.reference_lock_mp: - logging.debug( - "FileHashStore - tag_object (mp): Locking cid: %s to to tag pid: %s.", - cid, - pid, - ) - self.reference_locked_cids_mp.append(cid) + with self.multiprocessing_condition: + while cid in self.reference_locked_cids_mp: + logging.debug( + "FileHashStore - tag_object: (cid) %s is currently locked. Waiting.", + cid, + ) + self.multiprocessing_condition.wait() + # Add cid to tracking array + self.reference_locked_cids.append(cid) else: with self.thread_condition: while cid in self.reference_locked_cids: @@ -602,7 +597,7 @@ def tag_object(self, pid, cid): cid, ) self.thread_condition.wait() - # with self.cid_lock: + # Add cid to tracking array self.reference_locked_cids.append(cid) try: # Prepare files and paths @@ -715,13 +710,14 @@ def tag_object(self, pid, cid): finally: # Release cid if use_multiprocessing: - with self.reference_lock_mp: + with self.multiprocessing_condition: logging.debug( "FileHashStore - tag_object (mp): Removing cid: %s from" + " reference_locked_cids.", cid, ) self.reference_locked_cids_mp.remove(cid) + self.multiprocessing_condition.notify() else: with self.thread_condition: logging.debug( From 9ddc79d71fd6e1c318f3f5b568c4ad2b008cf7ee Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 29 May 2024 17:13:38 -0700 Subject: [PATCH 219/420] Fix bug with 'tag_object' multiprocessing sync adding 'cid' to the incorrect list --- src/hashstore/filehashstore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 54337af6..e9b9b5e9 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -588,7 +588,7 @@ def tag_object(self, pid, cid): ) 
self.multiprocessing_condition.wait() # Add cid to tracking array - self.reference_locked_cids.append(cid) + self.reference_locked_cids_mp.append(cid) else: with self.thread_condition: while cid in self.reference_locked_cids: From d8df5825a4199563a350ff9b6e7b1e5c9f2826f7 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 30 May 2024 09:42:32 -0700 Subject: [PATCH 220/420] Rename custom exception from 'PidRefsExistsError' to 'PidAlreadyExistsError' and update pytest --- src/hashstore/filehashstore.py | 4 ++-- tests/test_filehashstore_references.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index e9b9b5e9..6b77c9e7 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -662,7 +662,7 @@ def tag_object(self, pid, cid): + f" and cid reference files for pid: {pid} with cid: {cid}." ) logging.error(error_msg) - raise PidRefsExistsError(error_msg) + raise PidAlreadyExistsError(error_msg) else: debug_msg = ( f"FileHashStore - tag_object: Orphan pid refs file found for {pid}." 
@@ -2427,7 +2427,7 @@ def close(self): self._obj.seek(self._pos) -class PidRefsExistsError(Exception): +class PidAlreadyExistsError(Exception): """Custom exception thrown when a client calls 'tag_object' and the pid that is being tagged is already accounted for (has a pid refs file and is found in the cid refs file).""" diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 8b8fe95c..22a4c7ba 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -4,7 +4,7 @@ import shutil import pytest -from hashstore.filehashstore import PidRefsExistsError +from hashstore.filehashstore import PidAlreadyExistsError # pylint: disable=W0212 @@ -114,7 +114,7 @@ def test_tag_object_pid_refs_found_cid_refs_not_found_different_cid_retrieved(st path = test_dir + pid.replace("/", "_") _object_metadata = store.store_object(pid, path) - with pytest.raises(PidRefsExistsError): + with pytest.raises(PidAlreadyExistsError): store.tag_object(pid, "another_cid_value_that_is_not_found") From b3da71214d9078cf04589befacae39248883bfc1 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 30 May 2024 12:02:22 -0700 Subject: [PATCH 221/420] Revise comment and add note in 'test_filehashstore_interface' module --- tests/test_filehashstore_interface.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 5511d11b..a36ba0cc 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -1,6 +1,7 @@ """Test module for FileHashStore HashStore interface methods.""" import io +import multiprocessing import os from pathlib import Path from threading import Thread @@ -497,6 +498,12 @@ def store_object_wrapper(obj_pid, obj_path): assert store._exists(entity, pids[pid][store.algorithm]) +# Note: +# Multiprocessing has been tested through the HashStore client using +# metacat db data from 
'test.arcticdata.io'. When time-permitting, +# implement a multiprocessing test + + def test_store_object_threads_multiple_pids_one_cid(pids, store): """Test store object thread lock.""" entity = "objects" @@ -527,7 +534,7 @@ def store_object_wrapper(obj_pid, obj_path): thread4.join() thread5.join() thread6.join() - # One thread will succeed, file count must still be 1 + # All threads will succeed, file count must still be 1 assert store._count(entity) == 1 assert store._exists(entity, pids["jtao.1700.1"][store.algorithm]) From fccc792d01f7135f903ddfbfb42d95fac66a34eb Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 30 May 2024 12:03:49 -0700 Subject: [PATCH 222/420] Remove unused import --- tests/test_filehashstore_interface.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index a36ba0cc..1ae3d2a0 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -1,7 +1,6 @@ """Test module for FileHashStore HashStore interface methods.""" import io -import multiprocessing import os from pathlib import Path from threading import Thread From 78ddad7e3cc2e71767321d5d4494460806ed810e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 30 May 2024 12:49:23 -0700 Subject: [PATCH 223/420] Add new pytest (failing) to confirm issue of tmp files remaining --- tests/test_filehashstore_interface.py | 55 ++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 1ae3d2a0..58fd69e8 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -503,7 +503,7 @@ def store_object_wrapper(obj_pid, obj_path): # implement a multiprocessing test -def test_store_object_threads_multiple_pids_one_cid(pids, store): +def test_store_object_threads_multiple_pids_one_cid_content(pids, store): """Test store object thread lock.""" entity 
= "objects" test_dir = "tests/testdata/" @@ -550,6 +550,59 @@ def store_object_wrapper(obj_pid, obj_path): assert number_of_pids_reffed == 6 +def test_store_object_threads_multiple_pids_one_cid_files(store): + """Test store object with threads produces the right amount of files""" + test_dir = "tests/testdata/" + path = test_dir + "jtao.1700.1" + pid_list = ["jtao.1700.1"] + for n in range(0, 5): + pid_list.append(f"dou.test.{n}") + + def store_object_wrapper(obj_pid, obj_path): + store.store_object(obj_pid, obj_path) # Call store_object inside the thread + + thread1 = Thread(target=store_object_wrapper, args=(pid_list[0], path)) + thread2 = Thread(target=store_object_wrapper, args=(pid_list[1], path)) + thread3 = Thread(target=store_object_wrapper, args=(pid_list[2], path)) + thread4 = Thread(target=store_object_wrapper, args=(pid_list[3], path)) + thread5 = Thread(target=store_object_wrapper, args=(pid_list[4], path)) + thread6 = Thread(target=store_object_wrapper, args=(pid_list[5], path)) + thread1.start() + thread2.start() + thread3.start() + thread4.start() + thread5.start() + thread6.start() + thread1.join() + thread2.join() + thread3.join() + thread4.join() + thread5.join() + thread6.join() + + # Confirm that tmp files do not remain in refs + def folder_has_files(folder_path): + # Iterate over directory contents + for _, _, files in os.walk(folder_path): + if files: # If there are any files in the folder + print(files) + return True + return False + + # Confirm that tmp files do not remain in refs + def get_number_of_files(folder_path): + # Iterate over directory contents + file_count = 0 + for _, _, files in os.walk(folder_path): + if files: # If there are any files in the folder + file_count += len(files) + return file_count + + assert get_number_of_files(store.refs + "/pids") == 6 + assert get_number_of_files(store.refs + "/cids") == 1 + assert folder_has_files(store.refs + "/tmp") is False + + @slow_test def test_store_object_interrupt_process(store): 
"""Test that tmp file created when storing a large object (2GB) and From 5ac31ce538d8af405e98e98c17b256d951ca50b6 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 30 May 2024 13:19:40 -0700 Subject: [PATCH 224/420] Update docstrings in 'test_filehashstore_interface' for new pytests --- tests/test_filehashstore_interface.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 58fd69e8..83586ff5 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -504,7 +504,7 @@ def store_object_wrapper(obj_pid, obj_path): def test_store_object_threads_multiple_pids_one_cid_content(pids, store): - """Test store object thread lock.""" + """Test store object thread lock and refs files content""" entity = "objects" test_dir = "tests/testdata/" path = test_dir + "jtao.1700.1" @@ -551,7 +551,7 @@ def store_object_wrapper(obj_pid, obj_path): def test_store_object_threads_multiple_pids_one_cid_files(store): - """Test store object with threads produces the right amount of files""" + """Test store object with threads produces the expected amount of files""" test_dir = "tests/testdata/" path = test_dir + "jtao.1700.1" pid_list = ["jtao.1700.1"] From cf21ee232a48f22d8e151408e6e04e619ffb1f85 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 30 May 2024 13:35:43 -0700 Subject: [PATCH 225/420] Revise 'tag_object' to create tmp refs files when required --- src/hashstore/filehashstore.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 6b77c9e7..5821c6a4 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -604,10 +604,6 @@ def tag_object(self, pid, cid): tmp_root_path = self._get_store_path("refs") / "tmp" pid_refs_path = self._resolve_path("pid", pid) cid_refs_path = self._resolve_path("cid", cid) - # All ref files begin as tmp 
files and get moved sequentially at once - # Get tmp files with the expected cid and pid refs content - pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") - cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' self._create_path(os.path.dirname(pid_refs_path)) self._create_path(os.path.dirname(cid_refs_path)) @@ -635,6 +631,7 @@ def tag_object(self, pid, cid): if self._is_string_in_refs_file(cid, pid_refs_path): # The pid correctly references the given cid, but the cid refs file is missing + cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") shutil.move(cid_tmp_file_path, cid_refs_path) self._verify_hashstore_references( pid, @@ -676,6 +673,7 @@ def tag_object(self, pid, cid): ) logging.debug(debug_msg) # Move the pid refs file + pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") shutil.move(pid_tmp_file_path, pid_refs_path) # Update cid ref files as it already exists if not self._is_string_in_refs_file(pid, cid_refs_path): @@ -695,6 +693,8 @@ def tag_object(self, pid, cid): return True # Move both files after checking the existing status of refs files + pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") + cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") shutil.move(pid_tmp_file_path, pid_refs_path) shutil.move(cid_tmp_file_path, cid_refs_path) log_msg = "Reference files have been moved to their permanent location. Verifying refs." 
From 5aab6816f10eeeee44c47ce4cf51219a57371775 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 31 May 2024 10:00:13 -0700 Subject: [PATCH 226/420] Remove organizing lock TODO item, add new threading 'Condition' for objects --- src/hashstore/filehashstore.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 5821c6a4..4c7d01bf 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -62,7 +62,11 @@ class FileHashStore(HashStore): time_out_sec = 1 # Thread Synchronization object_lock = threading.Lock() + object_condition = threading.Condition(object_lock) + object_locked_pids = [] metadata_lock = threading.Lock() + # metadata_condition = threading.Condition(object_lock) + metadata_locked_pids = [] reference_lock = threading.Lock() thread_condition = threading.Condition(reference_lock) reference_locked_cids = [] @@ -71,10 +75,6 @@ class FileHashStore(HashStore): multiprocessing_condition = multiprocessing.Condition(reference_lock_mp) reference_locked_cids_mp = multiprocessing.Manager().list() # Create a shared list - # TODO: To organize - object_locked_pids = [] - metadata_locked_pids = [] - def __init__(self, properties=None): if properties: # Validate properties against existing configuration if present From bf569810abbc27dbd1e0a951e0b68db179aecfa6 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 31 May 2024 10:09:03 -0700 Subject: [PATCH 227/420] Refactor 'store_object' to use condition synchronization --- src/hashstore/filehashstore.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 4c7d01bf..6ede5e7b 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -458,18 +458,14 @@ def store_object( # TODO: Implement multiprocessing lock check like 'tag_object' # Wait for the pid to release if it's in use - 
while pid in self.object_locked_pids: - logging.debug( - "FileHashStore - store_object: %s is currently being stored. Waiting.", - pid, - ) - time.sleep(self.time_out_sec) - # Modify object_locked_pids consecutively - with self.object_lock: - logging.debug( - "FileHashStore - store_object: Adding pid: %s to object_locked_pids.", - pid, - ) + with self.object_condition: + while pid in self.object_locked_pids: + logging.debug( + "FileHashStore - store_object: %s is currently being stored. Waiting.", + pid, + ) + self.object_condition.wait() + # Modify object_locked_pids consecutively self.object_locked_pids.append(pid) try: logging.debug( @@ -512,12 +508,13 @@ def store_object( raise err finally: # Release pid - with self.object_lock: + with self.object_condition: logging.debug( "FileHashStore - store_object: Removing pid: %s from object_locked_pids.", pid, ) self.object_locked_pids.remove(pid) + self.object_condition.notify() return object_metadata From 9c483f23eaeb3606121bdf8ea8a75f5280fa6e4b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 31 May 2024 10:24:16 -0700 Subject: [PATCH 228/420] Implement 'multiprocessing' lock for 'store_object' --- src/hashstore/filehashstore.py | 69 ++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 24 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 6ede5e7b..c85066cd 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -71,9 +71,12 @@ class FileHashStore(HashStore): thread_condition = threading.Condition(reference_lock) reference_locked_cids = [] # Multiprocessing Synchronization + object_lock_mp = multiprocessing.Lock() + object_condition_mp = multiprocessing.Condition(object_lock_mp) + object_locked_pids_mp = multiprocessing.Manager().list() reference_lock_mp = multiprocessing.Lock() - multiprocessing_condition = multiprocessing.Condition(reference_lock_mp) - reference_locked_cids_mp = multiprocessing.Manager().list() # Create a 
shared list + reference_condition_mp = multiprocessing.Condition(reference_lock_mp) + reference_locked_cids_mp = multiprocessing.Manager().list() def __init__(self, properties=None): if properties: @@ -456,17 +459,26 @@ def store_object( additional_algorithm, checksum, checksum_algorithm ) - # TODO: Implement multiprocessing lock check like 'tag_object' # Wait for the pid to release if it's in use - with self.object_condition: - while pid in self.object_locked_pids: - logging.debug( - "FileHashStore - store_object: %s is currently being stored. Waiting.", - pid, - ) - self.object_condition.wait() - # Modify object_locked_pids consecutively - self.object_locked_pids.append(pid) + use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" + if use_multiprocessing: + with self.object_condition_mp: + while pid in self.object_locked_pids_mp: + logging.debug( + "FileHashStore - store_object (mp): pid (%s) is locked. Waiting.", + pid, + ) + self.object_condition_mp.wait() + self.object_locked_pids_mp.append(pid) + else: + with self.object_condition: + while pid in self.object_locked_pids: + logging.debug( + "FileHashStore - store_object: pid (%s) is locked. 
Waiting.", + pid, + ) + self.object_condition.wait() + self.object_locked_pids.append(pid) try: logging.debug( "FileHashStore - store_object: Attempting to store object for pid: %s", @@ -507,14 +519,23 @@ def store_object( logging.error(exception_string) raise err finally: - # Release pid - with self.object_condition: - logging.debug( - "FileHashStore - store_object: Removing pid: %s from object_locked_pids.", - pid, - ) - self.object_locked_pids.remove(pid) - self.object_condition.notify() + if use_multiprocessing: + with self.object_condition_mp: + logging.debug( + "FileHashStore - store_object (mp): Removing pid: %s from lock array", + pid, + ) + self.object_locked_pids_mp.remove(pid) + self.object_condition_mp.notify() + else: + # Release pid + with self.object_condition: + logging.debug( + "FileHashStore - store_object: Removing pid: %s from lock array", + pid, + ) + self.object_locked_pids.remove(pid) + self.object_condition.notify() return object_metadata @@ -577,13 +598,13 @@ def tag_object(self, pid, cid): # Wait for the cid to release if it's being tagged use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" if use_multiprocessing: - with self.multiprocessing_condition: + with self.reference_condition_mp: while cid in self.reference_locked_cids_mp: logging.debug( "FileHashStore - tag_object: (cid) %s is currently locked. 
Waiting.", cid, ) - self.multiprocessing_condition.wait() + self.reference_condition_mp.wait() # Add cid to tracking array self.reference_locked_cids_mp.append(cid) else: @@ -707,14 +728,14 @@ def tag_object(self, pid, cid): finally: # Release cid if use_multiprocessing: - with self.multiprocessing_condition: + with self.reference_condition_mp: logging.debug( "FileHashStore - tag_object (mp): Removing cid: %s from" + " reference_locked_cids.", cid, ) self.reference_locked_cids_mp.remove(cid) - self.multiprocessing_condition.notify() + self.reference_condition_mp.notify() else: with self.thread_condition: logging.debug( From c340268efdef0dea8986ff6ff8969297d444e9da Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 31 May 2024 11:58:59 -0700 Subject: [PATCH 229/420] Refactor 'store_metadata' to use condition synchronization --- src/hashstore/filehashstore.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index c85066cd..4b2cf898 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -65,7 +65,7 @@ class FileHashStore(HashStore): object_condition = threading.Condition(object_lock) object_locked_pids = [] metadata_lock = threading.Lock() - # metadata_condition = threading.Condition(object_lock) + metadata_condition = threading.Condition(metadata_lock) metadata_locked_pids = [] reference_lock = threading.Lock() thread_condition = threading.Condition(reference_lock) @@ -809,20 +809,13 @@ def store_metadata(self, pid, metadata, format_id=None): self._check_arg_data(metadata) # TODO: Implement multiprocessing lock check like 'tag_object' - # Wait for the pid to release if it's in use - while pid in self.metadata_locked_pids: - logging.debug( - "FileHashStore - store_metadata: %s is currently being stored. 
Waiting.", - pid, - ) - time.sleep(self.time_out_sec) - - with self.metadata_lock: - logging.debug( - "FileHashStore - store_metadata: Adding pid: %s to metadata_locked_pids.", - pid, - ) - # Modify metadata_locked_pids consecutively + with self.metadata_condition: + while pid in self.metadata_locked_pids: + logging.debug( + "FileHashStore - store_metadata: %s is currently being stored. Waiting.", + pid, + ) + self.metadata_condition.wait() self.metadata_locked_pids.append(pid) try: @@ -839,12 +832,13 @@ def store_metadata(self, pid, metadata, format_id=None): return metadata_cid finally: # Release pid - with self.metadata_lock: + with self.metadata_condition: logging.debug( "FileHashStore - store_metadata: Removing pid: %s from metadata_locked_pids.", pid, ) self.metadata_locked_pids.remove(pid) + self.metadata_condition.notify() def retrieve_object(self, pid): logging.debug( From 763b601ded781d3058749cbef6b9127dd56710af Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 31 May 2024 12:06:17 -0700 Subject: [PATCH 230/420] Implement 'multiprocessing' lock for 'store_metadata' --- src/hashstore/filehashstore.py | 55 ++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 4b2cf898..e7e9a65c 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -74,6 +74,9 @@ class FileHashStore(HashStore): object_lock_mp = multiprocessing.Lock() object_condition_mp = multiprocessing.Condition(object_lock_mp) object_locked_pids_mp = multiprocessing.Manager().list() + metadata_lock_mp = multiprocessing.Lock() + metadata_condition_mp = multiprocessing.Condition(metadata_lock_mp) + metadata_locked_pids_mp = multiprocessing.Manager().list() reference_lock_mp = multiprocessing.Lock() reference_condition_mp = multiprocessing.Condition(reference_lock_mp) reference_locked_cids_mp = multiprocessing.Manager().list() @@ -808,15 +811,26 @@ def 
store_metadata(self, pid, metadata, format_id=None): checked_format_id = self._check_arg_format_id(format_id, "store_metadata") self._check_arg_data(metadata) - # TODO: Implement multiprocessing lock check like 'tag_object' - with self.metadata_condition: - while pid in self.metadata_locked_pids: - logging.debug( - "FileHashStore - store_metadata: %s is currently being stored. Waiting.", - pid, - ) - self.metadata_condition.wait() - self.metadata_locked_pids.append(pid) + # Wait for the pid to release if it's in use + use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" + if use_multiprocessing: + with self.metadata_condition_mp: + while pid in self.metadata_locked_pids_mp: + logging.debug( + "FileHashStore - store_metadata (mp): %s (pid) is locked. Waiting.", + pid, + ) + self.metadata_condition_mp.wait() + self.metadata_locked_pids_mp.append(pid) + else: + with self.metadata_condition: + while pid in self.metadata_locked_pids: + logging.debug( + "FileHashStore - store_metadata: %s (pid) is locked. 
Waiting.", + pid, + ) + self.metadata_condition.wait() + self.metadata_locked_pids.append(pid) try: logging.debug( @@ -832,13 +846,22 @@ def store_metadata(self, pid, metadata, format_id=None): return metadata_cid finally: # Release pid - with self.metadata_condition: - logging.debug( - "FileHashStore - store_metadata: Removing pid: %s from metadata_locked_pids.", - pid, - ) - self.metadata_locked_pids.remove(pid) - self.metadata_condition.notify() + if use_multiprocessing: + with self.metadata_condition_mp: + logging.debug( + "FileHashStore - store_metadata (mp): Removing pid: %s from lock array", + pid, + ) + self.metadata_locked_pids_mp.remove(pid) + self.metadata_condition_mp.notify() + else: + with self.metadata_condition: + logging.debug( + "FileHashStore - store_metadata: Removing pid: %s from lock array.", + pid, + ) + self.metadata_locked_pids.remove(pid) + self.metadata_condition.notify() def retrieve_object(self, pid): logging.debug( From 73fc7a9cb1e20543892702b1caceeb32410dae6e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 31 May 2024 12:43:07 -0700 Subject: [PATCH 231/420] Refactor 'delete_object' to use condition synchronization --- src/hashstore/filehashstore.py | 87 ++++++++++++++++------------------ 1 file changed, 40 insertions(+), 47 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index e7e9a65c..9d340294 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -68,7 +68,7 @@ class FileHashStore(HashStore): metadata_condition = threading.Condition(metadata_lock) metadata_locked_pids = [] reference_lock = threading.Lock() - thread_condition = threading.Condition(reference_lock) + reference_condition = threading.Condition(reference_lock) reference_locked_cids = [] # Multiprocessing Synchronization object_lock_mp = multiprocessing.Lock() @@ -81,6 +81,8 @@ class FileHashStore(HashStore): reference_condition_mp = multiprocessing.Condition(reference_lock_mp) 
reference_locked_cids_mp = multiprocessing.Manager().list() + # TODO: Review store/delete object/metadata debug messaging for consistency + def __init__(self, properties=None): if properties: # Validate properties against existing configuration if present @@ -611,13 +613,13 @@ def tag_object(self, pid, cid): # Add cid to tracking array self.reference_locked_cids_mp.append(cid) else: - with self.thread_condition: + with self.reference_condition: while cid in self.reference_locked_cids: logging.debug( "FileHashStore - tag_object: (cid) %s is currently locked. Waiting.", cid, ) - self.thread_condition.wait() + self.reference_condition.wait() # Add cid to tracking array self.reference_locked_cids.append(cid) try: @@ -740,13 +742,13 @@ def tag_object(self, pid, cid): self.reference_locked_cids_mp.remove(cid) self.reference_condition_mp.notify() else: - with self.thread_condition: + with self.reference_condition: logging.debug( "FileHashStore - tag_object: Removing cid: %s from reference_locked_cids.", cid, ) self.reference_locked_cids.remove(cid) - self.thread_condition.notify() + self.reference_condition.notify() def find_object(self, pid): logging.debug( @@ -934,31 +936,28 @@ def delete_object(self, ab_id, id_type=None): cid = ab_id # TODO: Implement multiprocessing lock check like 'tag_object' # Synchronize the cid - while cid in self.reference_locked_cids: - logging.debug( - "FileHashStore - delete_object: (cid) %s is currently locked. Waiting", - cid, - ) - time.sleep(self.time_out_sec) - # Modify reference_locked_cids consecutively - with self.reference_lock: - logging.debug( - "FileHashStore - delete_object: Add cid: %s to reference_locked_cids.", - cid, - ) - self.reference_locked_cids.append(ab_id) + with self.reference_condition: + while cid in self.reference_locked_cids: + logging.debug( + "FileHashStore - delete_object: (cid) %s is locked. 
Waiting", + cid, + ) + self.reference_condition.wait() + # Modify reference_locked_cids consecutively + self.reference_locked_cids.append(cid) try: - self._delete("objects", ab_id) + self._delete("objects", cid) finally: # Release cid - with self.reference_lock: + with self.reference_condition: logging.debug( "FileHashStore - delete_object: Removing cid: %s from" - + "reference_locked_cids.", + + "lock array.", cid, ) self.reference_locked_cids.remove(cid) + self.reference_condition.notify() else: # id_type is "pid" pid = ab_id @@ -971,18 +970,14 @@ def delete_object(self, ab_id, id_type=None): # Storing and deleting objects are synchronized together # Duplicate store object requests for a pid are rejected, but deleting an object # will wait for a pid to be released if it's found to be in use before proceeding. - while pid in self.object_locked_pids: - logging.debug( - "FileHashStore - delete_object: pid (%s) is currently locked. Waiting.", - pid, - ) - time.sleep(self.time_out_sec) - # Modify object_locked_pids consecutively - with self.object_lock: - logging.debug( - "FileHashStore - delete_object: Adding pid: %s to object_locked_pids.", - pid, - ) + with self.object_condition: + while pid in self.object_locked_pids: + logging.debug( + "FileHashStore - delete_object: pid (%s) is currently locked. Waiting.", + pid, + ) + self.object_condition.wait() + # Modify object_locked_pids consecutively self.object_locked_pids.append(pid) try: @@ -995,18 +990,14 @@ def delete_object(self, ab_id, id_type=None): # Proceed with next steps - cid has been retrieved without any issues # We must synchronized here based on the `cid` because multiple threads may # try to access the `cid_reference_file` - while cid in self.reference_locked_cids: - logging.debug( - "FileHashStore - delete_object: (cid) %s is currently locked. 
Waiting", - cid, - ) - time.sleep(self.time_out_sec) - # Modify reference_locked_cids consecutively - with self.reference_lock: - logging.debug( - "FileHashStore - delete_object: Add cid: %s to reference_locked_cids.", - cid, - ) + with self.reference_condition: + while cid in self.reference_locked_cids: + logging.debug( + "FileHashStore - delete_object: (cid) %s is locked. Waiting", + cid, + ) + self.reference_condition.wait() + # Modify reference_locked_cids consecutively self.reference_locked_cids.append(cid) try: @@ -1052,13 +1043,14 @@ def delete_object(self, ab_id, id_type=None): finally: # Release cid - with self.reference_lock: + with self.reference_condition: debug_msg = ( "FileHashStore - delete_object:" + f" Removing cid: {cid} from reference_locked_cids." ) logging.debug(debug_msg) self.reference_locked_cids.remove(cid) + self.reference_condition.notify() except PidRefsDoesNotExist: warn_msg = ( @@ -1141,12 +1133,13 @@ def delete_object(self, ab_id, id_type=None): return finally: # Release pid - with self.object_lock: + with self.object_condition: logging.debug( "FileHashStore - delete_object: Removing pid: %s from object_locked_pids.", pid, ) self.object_locked_pids.remove(pid) + self.object_condition.notify() def delete_metadata(self, pid, format_id=None): logging.debug( From d6f19f29f94aed77c050445f69f82d2a79022cb1 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 31 May 2024 12:52:34 -0700 Subject: [PATCH 232/420] Implement 'multiprocessing' lock for 'delete_object' --- src/hashstore/filehashstore.py | 137 +++++++++++++++++++++++---------- 1 file changed, 95 insertions(+), 42 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 9d340294..7bcc819b 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -82,6 +82,7 @@ class FileHashStore(HashStore): reference_locked_cids_mp = multiprocessing.Manager().list() # TODO: Review store/delete object/metadata debug messaging for 
consistency + # TODO: Review if there is a better way to retrieve global env 'use_multiprocessing' def __init__(self, properties=None): if properties: @@ -929,35 +930,58 @@ def delete_object(self, ab_id, id_type=None): "FileHashStore - delete_object: Request to delete object for id: %s", ab_id ) self._check_string(ab_id, "ab_id", "delete_object") + use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" + if id_type == "cid": cid_refs_abs_path = self._resolve_path("cid", ab_id) # If the refs file still exists, do not delete the object if not os.path.exists(cid_refs_abs_path): cid = ab_id - # TODO: Implement multiprocessing lock check like 'tag_object' + # Synchronize the cid - with self.reference_condition: - while cid in self.reference_locked_cids: - logging.debug( - "FileHashStore - delete_object: (cid) %s is locked. Waiting", - cid, - ) - self.reference_condition.wait() - # Modify reference_locked_cids consecutively - self.reference_locked_cids.append(cid) + if use_multiprocessing: + with self.reference_condition_mp: + while cid in self.reference_locked_cids_mp: + logging.debug( + "FileHashStore - delete_object: (cid) %s is locked. Waiting", + cid, + ) + self.reference_condition_mp.wait() + # Add cid to tracking array + self.reference_locked_cids_mp.append(cid) + else: + with self.reference_condition: + while cid in self.reference_locked_cids: + logging.debug( + "FileHashStore - delete_object: (cid) %s is locked. 
Waiting", + cid, + ) + self.reference_condition.wait() + # Modify reference_locked_cids consecutively + self.reference_locked_cids.append(cid) try: self._delete("objects", cid) finally: # Release cid - with self.reference_condition: - logging.debug( - "FileHashStore - delete_object: Removing cid: %s from" - + "lock array.", - cid, - ) - self.reference_locked_cids.remove(cid) - self.reference_condition.notify() + if use_multiprocessing: + with self.reference_condition_mp: + logging.debug( + "FileHashStore - delete_object: Removing cid: %s from" + + "lock array.", + cid, + ) + self.reference_locked_cids_mp.remove(cid) + self.reference_condition_mp.notify() + else: + with self.reference_condition: + logging.debug( + "FileHashStore - delete_object: Removing cid: %s from" + + "lock array.", + cid, + ) + self.reference_locked_cids.remove(cid) + self.reference_condition.notify() else: # id_type is "pid" pid = ab_id @@ -970,15 +994,25 @@ def delete_object(self, ab_id, id_type=None): # Storing and deleting objects are synchronized together # Duplicate store object requests for a pid are rejected, but deleting an object # will wait for a pid to be released if it's found to be in use before proceeding. - with self.object_condition: - while pid in self.object_locked_pids: - logging.debug( - "FileHashStore - delete_object: pid (%s) is currently locked. Waiting.", - pid, - ) - self.object_condition.wait() - # Modify object_locked_pids consecutively - self.object_locked_pids.append(pid) + if use_multiprocessing: + with self.object_condition_mp: + while pid in self.object_locked_pids_mp: + logging.debug( + "FileHashStore - delete_object: pid (%s) is currently locked. Waiting.", + pid, + ) + self.object_condition_mp.wait() + self.object_locked_pids_mp.append(pid) + else: + with self.object_condition: + while pid in self.object_locked_pids: + logging.debug( + "FileHashStore - delete_object: pid (%s) is currently locked. 
Waiting.", + pid, + ) + self.object_condition.wait() + # Modify object_locked_pids consecutively + self.object_locked_pids.append(pid) try: # Before we begin deletion process, we look for the `cid` by calling @@ -1043,14 +1077,24 @@ def delete_object(self, ab_id, id_type=None): finally: # Release cid - with self.reference_condition: - debug_msg = ( - "FileHashStore - delete_object:" - + f" Removing cid: {cid} from reference_locked_cids." - ) - logging.debug(debug_msg) - self.reference_locked_cids.remove(cid) - self.reference_condition.notify() + if use_multiprocessing: + with self.reference_condition_mp: + debug_msg = ( + "FileHashStore - delete_object:" + + f" Removing cid: {cid} from reference_locked_cids." + ) + logging.debug(debug_msg) + self.reference_locked_cids_mp.remove(cid) + self.reference_condition_mp.notify() + else: + with self.reference_condition: + debug_msg = ( + "FileHashStore - delete_object:" + + f" Removing cid: {cid} from reference_locked_cids." + ) + logging.debug(debug_msg) + self.reference_locked_cids.remove(cid) + self.reference_condition.notify() except PidRefsDoesNotExist: warn_msg = ( @@ -1133,13 +1177,22 @@ def delete_object(self, ab_id, id_type=None): return finally: # Release pid - with self.object_condition: - logging.debug( - "FileHashStore - delete_object: Removing pid: %s from object_locked_pids.", - pid, - ) - self.object_locked_pids.remove(pid) - self.object_condition.notify() + if use_multiprocessing: + with self.object_condition_mp: + logging.debug( + "FileHashStore - delete_object: Removing pid: %s from object_locked_pids.", + pid, + ) + self.object_locked_pids_mp.remove(pid) + self.object_condition_mp.notify() + else: + with self.object_condition: + logging.debug( + "FileHashStore - delete_object: Removing pid: %s from object_locked_pids.", + pid, + ) + self.object_locked_pids.remove(pid) + self.object_condition.notify() def delete_metadata(self, pid, format_id=None): logging.debug( From 
d8c6117591cef63e553668f614db825e857c0bef Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 31 May 2024 12:56:37 -0700 Subject: [PATCH 233/420] Implement thread and multiprocessing synchronization for 'delete_metadata' --- src/hashstore/filehashstore.py | 109 ++++++++++++++++++++++----------- 1 file changed, 74 insertions(+), 35 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 7bcc819b..352fc83e 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1199,44 +1199,83 @@ def delete_metadata(self, pid, format_id=None): "FileHashStore - delete_metadata: Request to delete metadata for pid: %s", pid, ) - # TODO: Implement multiprocessing lock check like 'tag_object' self._check_string(pid, "pid", "delete_metadata") checked_format_id = self._check_arg_format_id(format_id, "delete_metadata") - # Get the metadata directory path for the given pid - entity = "metadata" - metadata_directory = self._computehash(pid) - rel_path = "/".join(self._shard(metadata_directory)) - metadata_rel_path = self._get_store_path("metadata") / rel_path - if format_id is None: - # Delete all metadata files - objects_to_delete = [] - metadata_file_paths = self._get_file_paths(metadata_rel_path) - if metadata_file_paths is not None: - for path in metadata_file_paths: - objects_to_delete.append(self._rename_path_for_deletion(path)) - for obj in objects_to_delete: - os.remove(obj) - - info_string = ( - "FileHashStore - delete_metadata: Successfully deleted all metadata for pid: %s", - pid, - ) - logging.info(info_string) - return + + # Wait for the pid to release if it's in use + use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" + if use_multiprocessing: + with self.metadata_condition_mp: + while pid in self.metadata_locked_pids_mp: + logging.debug( + "FileHashStore - store_metadata (mp): %s (pid) is locked. 
Waiting.", + pid, + ) + self.metadata_condition_mp.wait() + self.metadata_locked_pids_mp.append(pid) else: - # Delete a specific metadata file - metadata_document_name = self._computehash(pid + checked_format_id) - full_path_without_directory = rel_path + "/" + metadata_document_name - metadata_exists = self._exists(entity, full_path_without_directory) - if metadata_exists: - self._delete(entity, full_path_without_directory) - - info_string = ( - "FileHashStore - delete_metadata: Successfully deleted metadata for pid:" - + f" {pid} for format_id: {format_id}" - ) - logging.info(info_string) - return + with self.metadata_condition: + while pid in self.metadata_locked_pids: + logging.debug( + "FileHashStore - store_metadata: %s (pid) is locked. Waiting.", + pid, + ) + self.metadata_condition.wait() + self.metadata_locked_pids.append(pid) + try: + # Get the metadata directory path for the given pid + entity = "metadata" + metadata_directory = self._computehash(pid) + rel_path = "/".join(self._shard(metadata_directory)) + metadata_rel_path = self._get_store_path("metadata") / rel_path + if format_id is None: + # Delete all metadata files + objects_to_delete = [] + metadata_file_paths = self._get_file_paths(metadata_rel_path) + if metadata_file_paths is not None: + for path in metadata_file_paths: + objects_to_delete.append(self._rename_path_for_deletion(path)) + for obj in objects_to_delete: + os.remove(obj) + + info_string = ( + "FileHashStore - delete_metadata: Successfully deleted all metadata for pid: %s", + pid, + ) + logging.info(info_string) + return + else: + # Delete a specific metadata file + metadata_document_name = self._computehash(pid + checked_format_id) + full_path_without_directory = rel_path + "/" + metadata_document_name + metadata_exists = self._exists(entity, full_path_without_directory) + if metadata_exists: + self._delete(entity, full_path_without_directory) + + info_string = ( + "FileHashStore - delete_metadata: Successfully deleted metadata for 
pid:" + + f" {pid} for format_id: {format_id}" + ) + logging.info(info_string) + return + finally: + # Release pid + if use_multiprocessing: + with self.metadata_condition_mp: + logging.debug( + "FileHashStore - store_metadata (mp): Removing pid: %s from lock array", + pid, + ) + self.metadata_locked_pids_mp.remove(pid) + self.metadata_condition_mp.notify() + else: + with self.metadata_condition: + logging.debug( + "FileHashStore - store_metadata: Removing pid: %s from lock array.", + pid, + ) + self.metadata_locked_pids.remove(pid) + self.metadata_condition.notify() def get_hex_digest(self, pid, algorithm): logging.debug( From d368d04ecb0bc5b34d8e22404979e1e061e8e95f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 31 May 2024 13:17:59 -0700 Subject: [PATCH 234/420] Clean up 'store_object' method --- src/hashstore/filehashstore.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 352fc83e..2c41fb9d 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -467,23 +467,25 @@ def store_object( # Wait for the pid to release if it's in use use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" + sync_begin_debug_msg = ( + f"FileHashStore - store_object: Adding pid ({pid}) to locked list." + ) + sync_wait_msg = ( + f"FileHashStore - store_object: Pid ({pid}) is locked. Waiting." + ) if use_multiprocessing: with self.object_condition_mp: while pid in self.object_locked_pids_mp: - logging.debug( - "FileHashStore - store_object (mp): pid (%s) is locked. Waiting.", - pid, - ) + logging.debug(sync_wait_msg) self.object_condition_mp.wait() + logging.debug(sync_begin_debug_msg) self.object_locked_pids_mp.append(pid) else: with self.object_condition: while pid in self.object_locked_pids: - logging.debug( - "FileHashStore - store_object: pid (%s) is locked. 
Waiting.", - pid, - ) + logging.debug(sync_wait_msg) self.object_condition.wait() + logging.debug(sync_begin_debug_msg) self.object_locked_pids.append(pid) try: logging.debug( @@ -525,21 +527,18 @@ def store_object( logging.error(exception_string) raise err finally: + end_sync_debug_msg = ( + f"FileHashStore - store_object: Releasing pid ({pid}) from locked list" + ) if use_multiprocessing: with self.object_condition_mp: - logging.debug( - "FileHashStore - store_object (mp): Removing pid: %s from lock array", - pid, - ) + logging.debug(end_sync_debug_msg) self.object_locked_pids_mp.remove(pid) self.object_condition_mp.notify() else: # Release pid with self.object_condition: - logging.debug( - "FileHashStore - store_object: Removing pid: %s from lock array", - pid, - ) + logging.debug(end_sync_debug_msg) self.object_locked_pids.remove(pid) self.object_condition.notify() From 80152625700f770f1e4cdf0ac2ca6ad77f6ec5f3 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 31 May 2024 13:25:58 -0700 Subject: [PATCH 235/420] Clean up 'tag_object' method --- src/hashstore/filehashstore.py | 44 ++++++++++++++++------------------ 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 2c41fb9d..c15d4f92 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -465,7 +465,6 @@ def store_object( additional_algorithm, checksum, checksum_algorithm ) - # Wait for the pid to release if it's in use use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" sync_begin_debug_msg = ( f"FileHashStore - store_object: Adding pid ({pid}) to locked list." 
@@ -475,9 +474,11 @@ def store_object( ) if use_multiprocessing: with self.object_condition_mp: + # Wait for the pid to release if it's in use while pid in self.object_locked_pids_mp: logging.debug(sync_wait_msg) self.object_condition_mp.wait() + # Modify object_locked_pids consecutively logging.debug(sync_begin_debug_msg) self.object_locked_pids_mp.append(pid) else: @@ -527,8 +528,10 @@ def store_object( logging.error(exception_string) raise err finally: + # Release pid end_sync_debug_msg = ( - f"FileHashStore - store_object: Releasing pid ({pid}) from locked list" + f"FileHashStore - store_object: Releasing pid ({pid})" + + " from locked list" ) if use_multiprocessing: with self.object_condition_mp: @@ -599,28 +602,26 @@ def tag_object(self, pid, cid): self._check_string(pid, "pid", "tag_object") self._check_string(cid, "cid", "tag_object") - # Modify reference_locked_cids consecutively - # Wait for the cid to release if it's being tagged use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" + sync_begin_debug_msg = ( + f"FileHashStore - tag_object: Adding cid ({pid}) to locked list." + ) + sync_wait_msg = f"FileHashStore - tag_object: Cid ({cid}) is locked. Waiting." if use_multiprocessing: with self.reference_condition_mp: + # Wait for the cid to release if it's being tagged while cid in self.reference_locked_cids_mp: - logging.debug( - "FileHashStore - tag_object: (cid) %s is currently locked. Waiting.", - cid, - ) + logging.debug(sync_wait_msg) self.reference_condition_mp.wait() - # Add cid to tracking array + # Modify reference_locked_cids consecutively + logging.debug(sync_begin_debug_msg) self.reference_locked_cids_mp.append(cid) else: with self.reference_condition: while cid in self.reference_locked_cids: - logging.debug( - "FileHashStore - tag_object: (cid) %s is currently locked. 
Waiting.", - cid, - ) + logging.debug(sync_wait_msg) self.reference_condition.wait() - # Add cid to tracking array + logging.debug(sync_begin_debug_msg) self.reference_locked_cids.append(cid) try: # Prepare files and paths @@ -732,21 +733,18 @@ def tag_object(self, pid, cid): return True finally: # Release cid + end_sync_debug_msg = ( + f"FileHashStore - tag_object: Releasing cid ({cid}) from" + + " reference_locked_cids." + ) if use_multiprocessing: with self.reference_condition_mp: - logging.debug( - "FileHashStore - tag_object (mp): Removing cid: %s from" - + " reference_locked_cids.", - cid, - ) + logging.debug(end_sync_debug_msg) self.reference_locked_cids_mp.remove(cid) self.reference_condition_mp.notify() else: with self.reference_condition: - logging.debug( - "FileHashStore - tag_object: Removing cid: %s from reference_locked_cids.", - cid, - ) + logging.debug(end_sync_debug_msg) self.reference_locked_cids.remove(cid) self.reference_condition.notify() From 1d0a0762a662fadb9084b3df8c0b6684721785a0 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 31 May 2024 13:29:48 -0700 Subject: [PATCH 236/420] Clean up 'store_metadata' method --- src/hashstore/filehashstore.py | 35 +++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index c15d4f92..889422f8 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -813,23 +813,27 @@ def store_metadata(self, pid, metadata, format_id=None): # Wait for the pid to release if it's in use use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" + sync_begin_debug_msg = ( + f"FileHashStore - store_metadata: Adding pid ({pid}) to locked list." + ) + sync_wait_msg = ( + f"FileHashStore - store_metadata: Pid ({pid}) is locked. Waiting." 
+ ) if use_multiprocessing: with self.metadata_condition_mp: + # Wait for the pid to release if it's in use while pid in self.metadata_locked_pids_mp: - logging.debug( - "FileHashStore - store_metadata (mp): %s (pid) is locked. Waiting.", - pid, - ) + logging.debug(sync_wait_msg) self.metadata_condition_mp.wait() + # Modify metadata_locked_pids consecutively + logging.debug(sync_begin_debug_msg) self.metadata_locked_pids_mp.append(pid) else: with self.metadata_condition: while pid in self.metadata_locked_pids: - logging.debug( - "FileHashStore - store_metadata: %s (pid) is locked. Waiting.", - pid, - ) + logging.debug(sync_wait_msg) self.metadata_condition.wait() + logging.debug(sync_begin_debug_msg) self.metadata_locked_pids.append(pid) try: @@ -846,20 +850,18 @@ def store_metadata(self, pid, metadata, format_id=None): return metadata_cid finally: # Release pid + end_sync_debug_msg = ( + f"FileHashStore - store_metadata: Releasing pid ({pid})" + + " from locked list" + ) if use_multiprocessing: with self.metadata_condition_mp: - logging.debug( - "FileHashStore - store_metadata (mp): Removing pid: %s from lock array", - pid, - ) + logging.debug(end_sync_debug_msg) self.metadata_locked_pids_mp.remove(pid) self.metadata_condition_mp.notify() else: with self.metadata_condition: - logging.debug( - "FileHashStore - store_metadata: Removing pid: %s from lock array.", - pid, - ) + logging.debug(end_sync_debug_msg) self.metadata_locked_pids.remove(pid) self.metadata_condition.notify() @@ -934,7 +936,6 @@ def delete_object(self, ab_id, id_type=None): # If the refs file still exists, do not delete the object if not os.path.exists(cid_refs_abs_path): cid = ab_id - # Synchronize the cid if use_multiprocessing: with self.reference_condition_mp: From 5742c783c02348c5f97d2cffb35213fb639dbd43 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 31 May 2024 13:40:29 -0700 Subject: [PATCH 237/420] Clean up 'delete_object' method --- src/hashstore/filehashstore.py | 126 
+++++++++++++++++---------------- 1 file changed, 66 insertions(+), 60 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 889422f8..3c8d2d25 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -936,48 +936,45 @@ def delete_object(self, ab_id, id_type=None): # If the refs file still exists, do not delete the object if not os.path.exists(cid_refs_abs_path): cid = ab_id - # Synchronize the cid + sync_begin_debug_msg = ( + f"FileHashStore - delete_object: Cid ({cid}) to locked list." + ) + sync_wait_msg = ( + f"FileHashStore - delete_object: Cid ({cid}) is locked. Waiting." + ) if use_multiprocessing: with self.reference_condition_mp: + # Wait for the cid to release if it's in use while cid in self.reference_locked_cids_mp: - logging.debug( - "FileHashStore - delete_object: (cid) %s is locked. Waiting", - cid, - ) + logging.debug(sync_wait_msg) self.reference_condition_mp.wait() - # Add cid to tracking array + # Modify reference_locked_cids consecutively + logging.debug(sync_begin_debug_msg) self.reference_locked_cids_mp.append(cid) else: with self.reference_condition: while cid in self.reference_locked_cids: - logging.debug( - "FileHashStore - delete_object: (cid) %s is locked. 
Waiting", - cid, - ) + logging.debug(sync_wait_msg) self.reference_condition.wait() - # Modify reference_locked_cids consecutively + logging.debug(sync_begin_debug_msg) self.reference_locked_cids.append(cid) try: self._delete("objects", cid) finally: # Release cid + end_sync_debug_msg = ( + f"FileHashStore - delete_object: Releasing cid ({cid})" + + " from locked list" + ) if use_multiprocessing: with self.reference_condition_mp: - logging.debug( - "FileHashStore - delete_object: Removing cid: %s from" - + "lock array.", - cid, - ) + logging.debug(end_sync_debug_msg) self.reference_locked_cids_mp.remove(cid) self.reference_condition_mp.notify() else: with self.reference_condition: - logging.debug( - "FileHashStore - delete_object: Removing cid: %s from" - + "lock array.", - cid, - ) + logging.debug(end_sync_debug_msg) self.reference_locked_cids.remove(cid) self.reference_condition.notify() else: @@ -992,24 +989,27 @@ def delete_object(self, ab_id, id_type=None): # Storing and deleting objects are synchronized together # Duplicate store object requests for a pid are rejected, but deleting an object # will wait for a pid to be released if it's found to be in use before proceeding. + sync_begin_debug_msg = ( + f"FileHashStore - delete_object: Pid ({pid}) to locked list." + ) + sync_wait_msg = ( + f"FileHashStore - delete_object: Pid ({pid}) is locked. Waiting." + ) if use_multiprocessing: with self.object_condition_mp: + # Wait for the pid to release if it's in use while pid in self.object_locked_pids_mp: - logging.debug( - "FileHashStore - delete_object: pid (%s) is currently locked. Waiting.", - pid, - ) + logging.debug(sync_wait_msg) self.object_condition_mp.wait() + # Modify object_locked_pids consecutively + logging.debug(sync_begin_debug_msg) self.object_locked_pids_mp.append(pid) else: with self.object_condition: while pid in self.object_locked_pids: - logging.debug( - "FileHashStore - delete_object: pid (%s) is currently locked. 
Waiting.", - pid, - ) + logging.debug(sync_wait_msg) self.object_condition.wait() - # Modify object_locked_pids consecutively + logging.debug(sync_begin_debug_msg) self.object_locked_pids.append(pid) try: @@ -1022,15 +1022,29 @@ def delete_object(self, ab_id, id_type=None): # Proceed with next steps - cid has been retrieved without any issues # We must synchronized here based on the `cid` because multiple threads may # try to access the `cid_reference_file` - with self.reference_condition: - while cid in self.reference_locked_cids: - logging.debug( - "FileHashStore - delete_object: (cid) %s is locked. Waiting", - cid, - ) - self.reference_condition.wait() - # Modify reference_locked_cids consecutively - self.reference_locked_cids.append(cid) + sync_begin_debug_msg = ( + f"FileHashStore - delete_object: Cid ({cid}) to locked list." + ) + sync_wait_msg = ( + f"FileHashStore - delete_object: Cid ({cid}) is locked." + + " Waiting." + ) + if use_multiprocessing: + with self.reference_condition_mp: + # Wait for the cid to release if it's in use + while cid in self.reference_locked_cids_mp: + logging.debug(sync_wait_msg) + self.reference_condition_mp.wait() + # Modify reference_locked_cids consecutively + logging.debug(sync_begin_debug_msg) + self.reference_locked_cids_mp.append(cid) + else: + with self.reference_condition: + while cid in self.reference_locked_cids: + logging.debug(sync_wait_msg) + self.reference_condition.wait() + logging.debug(sync_begin_debug_msg) + self.reference_locked_cids.append(cid) try: cid_ref_abs_path = self._resolve_path("cid", cid) @@ -1075,22 +1089,18 @@ def delete_object(self, ab_id, id_type=None): finally: # Release cid + end_sync_debug_msg = ( + f"FileHashStore - delete_object: Releasing cid ({cid})" + + " from locked list" + ) if use_multiprocessing: with self.reference_condition_mp: - debug_msg = ( - "FileHashStore - delete_object:" - + f" Removing cid: {cid} from reference_locked_cids." 
- ) - logging.debug(debug_msg) + logging.debug(end_sync_debug_msg) self.reference_locked_cids_mp.remove(cid) self.reference_condition_mp.notify() else: with self.reference_condition: - debug_msg = ( - "FileHashStore - delete_object:" - + f" Removing cid: {cid} from reference_locked_cids." - ) - logging.debug(debug_msg) + logging.debug(end_sync_debug_msg) self.reference_locked_cids.remove(cid) self.reference_condition.notify() @@ -1113,7 +1123,6 @@ def delete_object(self, ab_id, id_type=None): for obj in objects_to_delete: os.remove(obj) return - except CidRefsDoesNotExist: # Delete pid refs file objects_to_delete.append( @@ -1130,7 +1139,6 @@ def delete_object(self, ab_id, id_type=None): for obj in objects_to_delete: os.remove(obj) return - except RefsFileExistsButCidObjMissing: # Add pid refs file to be permanently deleted pid_ref_abs_path = self._resolve_path("pid", pid) @@ -1156,7 +1164,6 @@ def delete_object(self, ab_id, id_type=None): for obj in objects_to_delete: os.remove(obj) return - except PidNotFoundInCidRefsFile: # Add pid refs file to be permanently deleted pid_ref_abs_path = self._resolve_path("pid", pid) @@ -1175,20 +1182,19 @@ def delete_object(self, ab_id, id_type=None): return finally: # Release pid + end_sync_debug_msg = ( + f"FileHashStore - delete_object: Releasing pid ({pid})" + + " from locked list" + ) if use_multiprocessing: with self.object_condition_mp: - logging.debug( - "FileHashStore - delete_object: Removing pid: %s from object_locked_pids.", - pid, - ) + logging.debug(end_sync_debug_msg) self.object_locked_pids_mp.remove(pid) self.object_condition_mp.notify() else: + # Release pid with self.object_condition: - logging.debug( - "FileHashStore - delete_object: Removing pid: %s from object_locked_pids.", - pid, - ) + logging.debug(end_sync_debug_msg) self.object_locked_pids.remove(pid) self.object_condition.notify() From 02d2ef7bda81939e9ddc851a69364dea2f3f1be7 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 31 May 2024 13:46:20 -0700 
Subject: [PATCH 238/420] Clean up 'delete_metadata' method --- src/hashstore/filehashstore.py | 39 +++++++++++++++++----------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 3c8d2d25..79edbf6b 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -81,7 +81,6 @@ class FileHashStore(HashStore): reference_condition_mp = multiprocessing.Condition(reference_lock_mp) reference_locked_cids_mp = multiprocessing.Manager().list() - # TODO: Review store/delete object/metadata debug messaging for consistency # TODO: Review if there is a better way to retrieve global env 'use_multiprocessing' def __init__(self, properties=None): @@ -1208,23 +1207,27 @@ def delete_metadata(self, pid, format_id=None): # Wait for the pid to release if it's in use use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" + sync_begin_debug_msg = ( + f"FileHashStore - delete_metadata: Adding pid ({pid}) to locked list." + ) + sync_wait_msg = ( + f"FileHashStore - delete_metadata: Pid ({pid}) is locked. Waiting." + ) if use_multiprocessing: with self.metadata_condition_mp: + # Wait for the pid to release if it's in use while pid in self.metadata_locked_pids_mp: - logging.debug( - "FileHashStore - store_metadata (mp): %s (pid) is locked. Waiting.", - pid, - ) + logging.debug(sync_wait_msg) self.metadata_condition_mp.wait() + # Modify metadata_locked_pids consecutively + logging.debug(sync_begin_debug_msg) self.metadata_locked_pids_mp.append(pid) else: with self.metadata_condition: while pid in self.metadata_locked_pids: - logging.debug( - "FileHashStore - store_metadata: %s (pid) is locked. 
Waiting.", - pid, - ) + logging.debug(sync_wait_msg) self.metadata_condition.wait() + logging.debug(sync_begin_debug_msg) self.metadata_locked_pids.append(pid) try: # Get the metadata directory path for the given pid @@ -1243,8 +1246,8 @@ def delete_metadata(self, pid, format_id=None): os.remove(obj) info_string = ( - "FileHashStore - delete_metadata: Successfully deleted all metadata for pid: %s", - pid, + "FileHashStore - delete_metadata: Successfully deleted all metadata" + + f"for pid: {pid}", ) logging.info(info_string) return @@ -1264,20 +1267,18 @@ def delete_metadata(self, pid, format_id=None): return finally: # Release pid + end_sync_debug_msg = ( + f"FileHashStore - delete_metadata: Releasing pid ({pid})" + + " from locked list" + ) if use_multiprocessing: with self.metadata_condition_mp: - logging.debug( - "FileHashStore - store_metadata (mp): Removing pid: %s from lock array", - pid, - ) + logging.debug(end_sync_debug_msg) self.metadata_locked_pids_mp.remove(pid) self.metadata_condition_mp.notify() else: with self.metadata_condition: - logging.debug( - "FileHashStore - store_metadata: Removing pid: %s from lock array.", - pid, - ) + logging.debug(end_sync_debug_msg) self.metadata_locked_pids.remove(pid) self.metadata_condition.notify() From 2faeefbb333efa425c04964e6698c042cdc916fd Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 31 May 2024 13:56:31 -0700 Subject: [PATCH 239/420] Revise 'hashstoreclient' module to set global env 'USE_MULTIPROCESSING' in init method of HashStoreClient class --- src/hashstore/hashstoreclient.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/hashstore/hashstoreclient.py b/src/hashstore/hashstoreclient.py index decb83fb..7022a521 100644 --- a/src/hashstore/hashstoreclient.py +++ b/src/hashstore/hashstoreclient.py @@ -258,6 +258,13 @@ def __init__(self, properties, testflag=None): module_name = "filehashstore" class_name = "FileHashStore" + # Set multiprocessing to true + 
os.environ["USE_MULTIPROCESSING"] = "True" + use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" + logging.info( + "HashStoreClient - use_multiprocessing (bool): %s", use_multiprocessing + ) + # Instance attributes self.hashstore = factory.get_hashstore(module_name, class_name, properties) logging.info("HashStoreClient - HashStore initialized.") @@ -285,15 +292,6 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num, skip_obj_size) ) logging.info(info_msg) - # Test Begin - # Set multiprocessing to true - os.environ["USE_MULTIPROCESSING"] = "True" - use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" - logging.info( - "HashStoreClient - use_multiprocessing (bool): %s", use_multiprocessing - ) - # Test End - # Get list of objects to store from metacat db if obj_type == self.OBJ_TYPE: checked_obj_list = self.metacatdb.refine_list_for_objects( From f1711fca952a2d4d875856e00ae160312e8a788c Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 31 May 2024 14:02:11 -0700 Subject: [PATCH 240/420] Refactor 'filehashstore' to check for global env 'USE_MULTIPROCESSING' upon class init --- src/hashstore/filehashstore.py | 39 +++++++++++++++------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 79edbf6b..97cdd250 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -81,9 +81,10 @@ class FileHashStore(HashStore): reference_condition_mp = multiprocessing.Condition(reference_lock_mp) reference_locked_cids_mp = multiprocessing.Manager().list() - # TODO: Review if there is a better way to retrieve global env 'use_multiprocessing' - def __init__(self, properties=None): + # Check to see whether a multiprocessing or threading sync lock should be used + self.use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" + # Now check properties if properties: # Validate properties against 
existing configuration if present checked_properties = self._validate_properties(properties) @@ -464,14 +465,13 @@ def store_object( additional_algorithm, checksum, checksum_algorithm ) - use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" sync_begin_debug_msg = ( f"FileHashStore - store_object: Adding pid ({pid}) to locked list." ) sync_wait_msg = ( f"FileHashStore - store_object: Pid ({pid}) is locked. Waiting." ) - if use_multiprocessing: + if self.use_multiprocessing: with self.object_condition_mp: # Wait for the pid to release if it's in use while pid in self.object_locked_pids_mp: @@ -532,7 +532,7 @@ def store_object( f"FileHashStore - store_object: Releasing pid ({pid})" + " from locked list" ) - if use_multiprocessing: + if self.use_multiprocessing: with self.object_condition_mp: logging.debug(end_sync_debug_msg) self.object_locked_pids_mp.remove(pid) @@ -601,12 +601,11 @@ def tag_object(self, pid, cid): self._check_string(pid, "pid", "tag_object") self._check_string(cid, "cid", "tag_object") - use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" sync_begin_debug_msg = ( f"FileHashStore - tag_object: Adding cid ({pid}) to locked list." ) sync_wait_msg = f"FileHashStore - tag_object: Cid ({cid}) is locked. Waiting." - if use_multiprocessing: + if self.use_multiprocessing: with self.reference_condition_mp: # Wait for the cid to release if it's being tagged while cid in self.reference_locked_cids_mp: @@ -736,7 +735,7 @@ def tag_object(self, pid, cid): f"FileHashStore - tag_object: Releasing cid ({cid}) from" + " reference_locked_cids." 
) - if use_multiprocessing: + if self.use_multiprocessing: with self.reference_condition_mp: logging.debug(end_sync_debug_msg) self.reference_locked_cids_mp.remove(cid) @@ -810,15 +809,13 @@ def store_metadata(self, pid, metadata, format_id=None): checked_format_id = self._check_arg_format_id(format_id, "store_metadata") self._check_arg_data(metadata) - # Wait for the pid to release if it's in use - use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" sync_begin_debug_msg = ( f"FileHashStore - store_metadata: Adding pid ({pid}) to locked list." ) sync_wait_msg = ( f"FileHashStore - store_metadata: Pid ({pid}) is locked. Waiting." ) - if use_multiprocessing: + if self.use_multiprocessing: with self.metadata_condition_mp: # Wait for the pid to release if it's in use while pid in self.metadata_locked_pids_mp: @@ -853,7 +850,7 @@ def store_metadata(self, pid, metadata, format_id=None): f"FileHashStore - store_metadata: Releasing pid ({pid})" + " from locked list" ) - if use_multiprocessing: + if self.use_multiprocessing: with self.metadata_condition_mp: logging.debug(end_sync_debug_msg) self.metadata_locked_pids_mp.remove(pid) @@ -928,7 +925,6 @@ def delete_object(self, ab_id, id_type=None): "FileHashStore - delete_object: Request to delete object for id: %s", ab_id ) self._check_string(ab_id, "ab_id", "delete_object") - use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" if id_type == "cid": cid_refs_abs_path = self._resolve_path("cid", ab_id) @@ -941,7 +937,7 @@ def delete_object(self, ab_id, id_type=None): sync_wait_msg = ( f"FileHashStore - delete_object: Cid ({cid}) is locked. Waiting." 
) - if use_multiprocessing: + if self.use_multiprocessing: with self.reference_condition_mp: # Wait for the cid to release if it's in use while cid in self.reference_locked_cids_mp: @@ -966,7 +962,7 @@ def delete_object(self, ab_id, id_type=None): f"FileHashStore - delete_object: Releasing cid ({cid})" + " from locked list" ) - if use_multiprocessing: + if self.use_multiprocessing: with self.reference_condition_mp: logging.debug(end_sync_debug_msg) self.reference_locked_cids_mp.remove(cid) @@ -994,7 +990,7 @@ def delete_object(self, ab_id, id_type=None): sync_wait_msg = ( f"FileHashStore - delete_object: Pid ({pid}) is locked. Waiting." ) - if use_multiprocessing: + if self.use_multiprocessing: with self.object_condition_mp: # Wait for the pid to release if it's in use while pid in self.object_locked_pids_mp: @@ -1028,7 +1024,7 @@ def delete_object(self, ab_id, id_type=None): f"FileHashStore - delete_object: Cid ({cid}) is locked." + " Waiting." ) - if use_multiprocessing: + if self.use_multiprocessing: with self.reference_condition_mp: # Wait for the cid to release if it's in use while cid in self.reference_locked_cids_mp: @@ -1092,7 +1088,7 @@ def delete_object(self, ab_id, id_type=None): f"FileHashStore - delete_object: Releasing cid ({cid})" + " from locked list" ) - if use_multiprocessing: + if self.use_multiprocessing: with self.reference_condition_mp: logging.debug(end_sync_debug_msg) self.reference_locked_cids_mp.remove(cid) @@ -1185,7 +1181,7 @@ def delete_object(self, ab_id, id_type=None): f"FileHashStore - delete_object: Releasing pid ({pid})" + " from locked list" ) - if use_multiprocessing: + if self.use_multiprocessing: with self.object_condition_mp: logging.debug(end_sync_debug_msg) self.object_locked_pids_mp.remove(pid) @@ -1206,14 +1202,13 @@ def delete_metadata(self, pid, format_id=None): checked_format_id = self._check_arg_format_id(format_id, "delete_metadata") # Wait for the pid to release if it's in use - use_multiprocessing = 
os.getenv("USE_MULTIPROCESSING", "False") == "True" sync_begin_debug_msg = ( f"FileHashStore - delete_metadata: Adding pid ({pid}) to locked list." ) sync_wait_msg = ( f"FileHashStore - delete_metadata: Pid ({pid}) is locked. Waiting." ) - if use_multiprocessing: + if self.use_multiprocessing: with self.metadata_condition_mp: # Wait for the pid to release if it's in use while pid in self.metadata_locked_pids_mp: @@ -1271,7 +1266,7 @@ def delete_metadata(self, pid, format_id=None): f"FileHashStore - delete_metadata: Releasing pid ({pid})" + " from locked list" ) - if use_multiprocessing: + if self.use_multiprocessing: with self.metadata_condition_mp: logging.debug(end_sync_debug_msg) self.metadata_locked_pids_mp.remove(pid) From c9ee2ea75c3fdf4fecf3187f927226b2cd01392a Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 31 May 2024 14:02:58 -0700 Subject: [PATCH 241/420] Remove unused 'time' module and related code --- src/hashstore/filehashstore.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 97cdd250..284913db 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -5,7 +5,6 @@ import multiprocessing import shutil import threading -import time import hashlib import os import logging @@ -59,7 +58,6 @@ class FileHashStore(HashStore): "blake2s", ] # Variables to orchestrate parallelization - time_out_sec = 1 # Thread Synchronization object_lock = threading.Lock() object_condition = threading.Condition(object_lock) From 92cbb2cea8ac6974da8c422d5d319819399005f6 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 3 Jun 2024 09:36:27 -0700 Subject: [PATCH 242/420] Update 'README.md' with instructions on how to switch HashStore from threading synchronization to multiprocessing synchronization --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index da7aa829..1aff3ad4 100644 --- a/README.md +++ b/README.md @@ -224,6 +224,19 
@@ These reference files are implemented in HashStore underneath the hood with no e └── 8f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6 ``` +## Concurrency in HashStore + +HashStore is both thread and process safe, and by default synchronizes calls to store & delete objects/metadata with Python's threading module. If you wish to use multiprocessing to parallelize your application, please declare a global environment variable `USE_MULTIPROCESSING` as `True` before initializing Hashstore. This will direct the relevant Public API calls to synchronize using the Python `multiprocessing` module's locks and conditions. Please see below for example: + +```py +# Set the global environment variable +os.environ["USE_MULTIPROCESSING"] = "True" + +# Check that the global environment variable has been set +use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" +``` + + ## Development build HashStore is a python package, and built using the [Python Poetry](https://python-poetry.org) build tool. 
From f1fd49df9e1884201814b09afa578bfa0b94cc3a Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 3 Jun 2024 09:49:08 -0700 Subject: [PATCH 243/420] Revise debug statements in 'tag_object' when a cid refs file is updated --- src/hashstore/filehashstore.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 284913db..d2ca5cfd 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -688,8 +688,8 @@ def tag_object(self, pid, cid): logging.debug(debug_msg) elif not os.path.exists(pid_refs_path) and os.path.exists(cid_refs_path): debug_msg = ( - f"FileHashStore - tag_object: pid refs file does not exists for pid {pid}" - + f" but cid refs file exists at: {cid_refs_path} for cid: {cid}" + f"FileHashStore - tag_object: pid refs file does not exist for pid {pid}" + + f" but cid refs file found at: {cid_refs_path} for cid: {cid}" ) logging.debug(debug_msg) # Move the pid refs file @@ -703,7 +703,7 @@ def tag_object(self, pid, cid): cid, pid_refs_path, cid_refs_path, - "Pid refs file doesn't exist, but cid refs exists.", + f"Updated existing cid refs file: {cid_refs_path} with pid: {pid}", ) logging.info( "FileHashStore - tag_object: Successfully updated cid: %s with pid: %s", From 5970ed85242df4d3ecfb781561d7c8803ebcf1ad Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 3 Jun 2024 17:29:58 -0700 Subject: [PATCH 244/420] Update 'pyproject.toml' author order --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c3526c4a..16cb428f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "hashstore" version = "1.1.0" description = "HashStore, a hash-based object store for data packages." 
-authors = ["Matt Jones ", "Dou Mok "] +authors = ["Dou Mok ", "Matt Jones "] readme = "README.md" [tool.poetry.dependencies] From 95e78ae2990c6a0bf0a601b8905e4ea9285655e8 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 3 Jun 2024 17:30:26 -0700 Subject: [PATCH 245/420] Add skeleton for 'setup' tools and a test module before swapping it with the 'hashstoreclient' --- setup.py | 12 ++++++++++++ src/hashstore/command_line.py | 10 ++++++++++ 2 files changed, 22 insertions(+) create mode 100644 setup.py create mode 100644 src/hashstore/command_line.py diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..426228de --- /dev/null +++ b/setup.py @@ -0,0 +1,12 @@ +from setuptools import setup, find_packages + +setup( + name="hashstore", + version="1.1", + packages=find_packages(where="src"), + entry_points={ + "console_scripts": [ + "hashstore=hashstore.command_line:main", + ], + }, +) diff --git a/src/hashstore/command_line.py b/src/hashstore/command_line.py new file mode 100644 index 00000000..3eb67605 --- /dev/null +++ b/src/hashstore/command_line.py @@ -0,0 +1,10 @@ +"""Test command line tools""" + + +def main(): + """Method to run if setup is set up properly""" + print("This worked!") + + +if __name__ == "__main__": + main() From 847863b687519e79c48d168a67cefd932eb82208 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 4 Jun 2024 12:13:05 -0700 Subject: [PATCH 246/420] Update 'pyproject.toml' with new section 'tool.poetry.scripts' and add hashstoreclient (via hashstore) as an executable --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 16cb428f..c11c4a14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,9 @@ black = "^22.10.0" pylint = "^2.17.4" pg8000 = "^1.29.8" +[tool.poetry.scripts] +hashstore = "hashstore.hashstoreclient:main" + [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" From 9e20843b5d5ba46aaa5b8d007322d746313f0b3e Mon Sep 17 00:00:00 
2001 From: Dou Mok Date: Tue, 4 Jun 2024 12:14:22 -0700 Subject: [PATCH 247/420] Delete WIP files (setup.py, command_line.py) and revise 'hashstoreclient' module with updated module name for hashstoreclient --- setup.py | 12 ------------ src/hashstore/command_line.py | 10 ---------- src/hashstore/hashstoreclient.py | 3 ++- 3 files changed, 2 insertions(+), 23 deletions(-) delete mode 100644 setup.py delete mode 100644 src/hashstore/command_line.py diff --git a/setup.py b/setup.py deleted file mode 100644 index 426228de..00000000 --- a/setup.py +++ /dev/null @@ -1,12 +0,0 @@ -from setuptools import setup, find_packages - -setup( - name="hashstore", - version="1.1", - packages=find_packages(where="src"), - entry_points={ - "console_scripts": [ - "hashstore=hashstore.command_line:main", - ], - }, -) diff --git a/src/hashstore/command_line.py b/src/hashstore/command_line.py deleted file mode 100644 index 3eb67605..00000000 --- a/src/hashstore/command_line.py +++ /dev/null @@ -1,10 +0,0 @@ -"""Test command line tools""" - - -def main(): - """Method to run if setup is set up properly""" - print("This worked!") - - -if __name__ == "__main__": - main() diff --git a/src/hashstore/hashstoreclient.py b/src/hashstore/hashstoreclient.py index 7022a521..a04ee8e0 100644 --- a/src/hashstore/hashstoreclient.py +++ b/src/hashstore/hashstoreclient.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python """HashStore Command Line App""" import logging @@ -255,7 +256,7 @@ def __init__(self, properties, testflag=None): factory = HashStoreFactory() # Get HashStore from factory - module_name = "filehashstore" + module_name = "hashstore.filehashstore" class_name = "FileHashStore" # Set multiprocessing to true From 6b2f7188d4892d0baae87286208f3558b6789d66 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 4 Jun 2024 12:17:52 -0700 Subject: [PATCH 248/420] Update 'README.md' section for how to use the hashstore client --- README.md | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) 
diff --git a/README.md b/README.md index 1aff3ad4..b5b33d0f 100644 --- a/README.md +++ b/README.md @@ -262,32 +262,35 @@ Client API Options: How to use HashStore client (command line app) ```sh +# Step 0: Install hashstore via poetry to create an executable script +$ poetry install + # Step 1: Create a HashStore -$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -chs -dp=3 -wp=2 -ap=SHA-256 -nsp="http://www.ns.test/v1" +$ hashstore /path/to/store/ -chs -dp=3 -wp=2 -ap=SHA-256 -nsp="http://www.ns.test/v1" # Get the checksum of a data object -$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -getchecksum -pid=persistent_identifier -algo=SHA-256 +$ hashstore /path/to/store/ -getchecksum -pid=persistent_identifier -algo=SHA-256 # Find an object (returns the content identifier) -$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -findobject -pid=persistent_identifier +$ hashstore /path/to/store/ -findobject -pid=persistent_identifier # Store a data object -$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -storeobject -pid=persistent_identifier -path=/path/to/object +$ hashstore /path/to/store/ -storeobject -pid=persistent_identifier -path=/path/to/object # Store a metadata object -$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -storemetadata -pid=persistent_identifier -path=/path/to/metadata/object -formatid=http://ns.dataone.org/service/types/v2.0 +$ hashstore /path/to/store/ -storemetadata -pid=persistent_identifier -path=/path/to/metadata/object -formatid=http://ns.dataone.org/service/types/v2.0 # Retrieve a data object -$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -retrieveobject -pid=persistent_identifier +$ hashstore /path/to/store/ -retrieveobject -pid=persistent_identifier # Retrieve a metadata object -$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -retrievemetadata -pid=persistent_identifier -formatid=http://ns.dataone.org/service/types/v2.0 +$ hashstore 
/path/to/store/ -retrievemetadata -pid=persistent_identifier -formatid=http://ns.dataone.org/service/types/v2.0 # Delete a data object -$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -deleteobject -pid=persistent_identifier +$ hashstore /path/to/store/ -deleteobject -pid=persistent_identifier # Delete a metadata file -$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -deletemetadata -pid=persistent_identifier -formatid=http://ns.dataone.org/service/types/v2.0 +$ hashstore /path/to/store/ -deletemetadata -pid=persistent_identifier -formatid=http://ns.dataone.org/service/types/v2.0 ``` ## License From 099b7bfbe8ad40f172d670ac0729fa4954a0426c Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 6 Jun 2024 10:02:19 -0700 Subject: [PATCH 249/420] Refactor 'store_metadata' to synchronize based on doc name (hash of the pid+format_id) and update logging statements --- src/hashstore/filehashstore.py | 35 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index d2ca5cfd..78e1adef 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -806,57 +806,56 @@ def store_metadata(self, pid, metadata, format_id=None): self._check_string(pid, "pid", "store_metadata") checked_format_id = self._check_arg_format_id(format_id, "store_metadata") self._check_arg_data(metadata) + pid_doc = self._computehash(pid + checked_format_id) sync_begin_debug_msg = ( - f"FileHashStore - store_metadata: Adding pid ({pid}) to locked list." + f"FileHashStore - store_metadata: Adding pid: {pid} to locked list, " + + f"with format_id: {checked_format_id} with doc name: {pid_doc}" ) sync_wait_msg = ( - f"FileHashStore - store_metadata: Pid ({pid}) is locked. Waiting." + f"FileHashStore - store_metadata: Pid: {pid} is locked for format_id:" + + f" {checked_format_id} with doc name: {pid_doc}. Waiting." 
) if self.use_multiprocessing: with self.metadata_condition_mp: # Wait for the pid to release if it's in use - while pid in self.metadata_locked_pids_mp: + while pid_doc in self.metadata_locked_pids_mp: logging.debug(sync_wait_msg) self.metadata_condition_mp.wait() # Modify metadata_locked_pids consecutively logging.debug(sync_begin_debug_msg) - self.metadata_locked_pids_mp.append(pid) + self.metadata_locked_pids_mp.append(pid_doc) else: with self.metadata_condition: - while pid in self.metadata_locked_pids: + while pid_doc in self.metadata_locked_pids: logging.debug(sync_wait_msg) self.metadata_condition.wait() logging.debug(sync_begin_debug_msg) - self.metadata_locked_pids.append(pid) + self.metadata_locked_pids.append(pid_doc) try: - logging.debug( - "FileHashStore - store_metadata: Attempting to store metadata for pid: %s", - pid, - ) metadata_cid = self._put_metadata(metadata, pid, checked_format_id) - - logging.info( - "FileHashStore - store_metadata: Successfully stored metadata for pid: %s", - pid, + info_msg = ( + "FileHashStore - store_metadata: Successfully stored metadata for" + + f" pid: {pid} with format_id: {checked_format_id}" ) + logging.info(info_msg) return metadata_cid finally: # Release pid end_sync_debug_msg = ( - f"FileHashStore - store_metadata: Releasing pid ({pid})" - + " from locked list" + f"FileHashStore - store_metadata: Releasing pid doc ({pid_doc})" + + f" from locked list for pid: {pid} with format_id: {checked_format_id}" ) if self.use_multiprocessing: with self.metadata_condition_mp: logging.debug(end_sync_debug_msg) - self.metadata_locked_pids_mp.remove(pid) + self.metadata_locked_pids_mp.remove(pid_doc) self.metadata_condition_mp.notify() else: with self.metadata_condition: logging.debug(end_sync_debug_msg) - self.metadata_locked_pids.remove(pid) + self.metadata_locked_pids.remove(pid_doc) self.metadata_condition.notify() def retrieve_object(self, pid): From b5e1dc87e39ab10617b27fe19ea0aed82a93b383 Mon Sep 17 00:00:00 2001 From: 
Dou Mok Date: Thu, 6 Jun 2024 10:15:21 -0700 Subject: [PATCH 250/420] Refactor '_put_metadata' to receive metadata doc name and update pytest --- src/hashstore/filehashstore.py | 10 +++++----- tests/test_filehashstore.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 78e1adef..83a1ceee 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -834,7 +834,7 @@ def store_metadata(self, pid, metadata, format_id=None): self.metadata_locked_pids.append(pid_doc) try: - metadata_cid = self._put_metadata(metadata, pid, checked_format_id) + metadata_cid = self._put_metadata(metadata, pid, pid_doc) info_msg = ( "FileHashStore - store_metadata: Successfully stored metadata for" + f" pid: {pid} with format_id: {checked_format_id}" @@ -1786,13 +1786,13 @@ def _is_string_in_refs_file(ref_id, refs_file_path): return True return False - def _put_metadata(self, metadata, pid, format_id): + def _put_metadata(self, metadata, pid, metadata_doc_name): """Store contents of metadata to `[self.root]/metadata` using the hash of the given PID and format ID as the permanent address. - :param str pid: Authority-based identifier. - :param str format_id: Metadata format. :param mixed metadata: String or path to metadata document. + :param str pid: Authority-based identifier. + :param str metadata_doc_name: Metadata document name :return: Address of the metadata document. 
:rtype: str @@ -1807,7 +1807,7 @@ def _put_metadata(self, metadata, pid, format_id): # Get target and related paths (permanent location) metadata_directory = self._computehash(pid) - metadata_document_name = self._computehash(pid + format_id) + metadata_document_name = metadata_doc_name rel_path = "/".join(self._shard(metadata_directory)) full_path = self._get_store_path("metadata") / rel_path / metadata_document_name diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 88dec6a7..44c14358 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -657,13 +657,13 @@ def test_put_metadata_cid(pids, store): test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): + metadata_document_name = store._computehash(pid + format_id) filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - metadata_cid = store._put_metadata(syspath, pid, format_id) + metadata_cid = store._put_metadata(syspath, pid, metadata_document_name) # Manually calculate expected path metadata_directory = store._computehash(pid) - metadata_document_name = store._computehash(pid + format_id) rel_path = "/".join(store._shard(metadata_directory)) full_path = ( store._get_store_path("metadata") / rel_path / metadata_document_name From baf83ea21990f6e8c323063c73efc21ce654e0ac Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 6 Jun 2024 11:00:44 -0700 Subject: [PATCH 251/420] Refactor 'delete_metadata' to sync based on doc name --- src/hashstore/filehashstore.py | 166 +++++++++++++++++++++------------ 1 file changed, 106 insertions(+), 60 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 83a1ceee..6b123396 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1197,82 +1197,128 @@ def delete_metadata(self, pid, format_id=None): ) self._check_string(pid, "pid", "delete_metadata") checked_format_id = 
self._check_arg_format_id(format_id, "delete_metadata") + metadata_directory = self._computehash(pid) + rel_path = "/".join(self._shard(metadata_directory)) - # Wait for the pid to release if it's in use - sync_begin_debug_msg = ( - f"FileHashStore - delete_metadata: Adding pid ({pid}) to locked list." - ) - sync_wait_msg = ( - f"FileHashStore - delete_metadata: Pid ({pid}) is locked. Waiting." - ) - if self.use_multiprocessing: - with self.metadata_condition_mp: - # Wait for the pid to release if it's in use - while pid in self.metadata_locked_pids_mp: - logging.debug(sync_wait_msg) - self.metadata_condition_mp.wait() - # Modify metadata_locked_pids consecutively - logging.debug(sync_begin_debug_msg) - self.metadata_locked_pids_mp.append(pid) - else: - with self.metadata_condition: - while pid in self.metadata_locked_pids: - logging.debug(sync_wait_msg) - self.metadata_condition.wait() - logging.debug(sync_begin_debug_msg) - self.metadata_locked_pids.append(pid) - try: - # Get the metadata directory path for the given pid - entity = "metadata" - metadata_directory = self._computehash(pid) - rel_path = "/".join(self._shard(metadata_directory)) + if format_id is None: + # Delete all metadata documents + objects_to_delete = [] + # Retrieve all metadata doc names metadata_rel_path = self._get_store_path("metadata") / rel_path - if format_id is None: - # Delete all metadata files - objects_to_delete = [] - metadata_file_paths = self._get_file_paths(metadata_rel_path) - if metadata_file_paths is not None: - for path in metadata_file_paths: + metadata_file_paths = self._get_file_paths(metadata_rel_path) + if metadata_file_paths is not None: + for path in metadata_file_paths: + # Get document name + pid_doc = os.path.basename(path) + # Synchronize based on doc name + # Wait for the pid to release if it's in use + sync_begin_debug_msg = ( + f"FileHashStore - delete_metadata: Adding pid: {pid} to locked list, " + + f"with format_id: {checked_format_id} with doc name: 
{pid_doc}" + ) + sync_wait_msg = ( + f"FileHashStore - delete_metadata: Pid: {pid} is locked for format_id:" + + f" {checked_format_id} with doc name: {pid_doc}. Waiting." + ) + if self.use_multiprocessing: + with self.metadata_condition_mp: + # Wait for the pid to release if it's in use + while pid in self.metadata_locked_pids_mp: + logging.debug(sync_wait_msg) + self.metadata_condition_mp.wait() + # Modify metadata_locked_pids consecutively + logging.debug(sync_begin_debug_msg) + self.metadata_locked_pids_mp.append(pid) + else: + with self.metadata_condition: + while pid in self.metadata_locked_pids: + logging.debug(sync_wait_msg) + self.metadata_condition.wait() + logging.debug(sync_begin_debug_msg) + self.metadata_locked_pids.append(pid) + try: + # Mark metadata doc for deletion objects_to_delete.append(self._rename_path_for_deletion(path)) + finally: + # Release pid + end_sync_debug_msg = ( + f"FileHashStore - delete_metadata: Releasing pid doc ({pid_doc})" + + f" from locked list for pid: {pid} with format_id:" + + checked_format_id + ) + if self.use_multiprocessing: + with self.metadata_condition_mp: + logging.debug(end_sync_debug_msg) + self.metadata_locked_pids_mp.remove(pid) + self.metadata_condition_mp.notify() + else: + with self.metadata_condition: + logging.debug(end_sync_debug_msg) + self.metadata_locked_pids.remove(pid) + self.metadata_condition.notify() + + # Delete metadata objects for obj in objects_to_delete: os.remove(obj) - info_string = ( "FileHashStore - delete_metadata: Successfully deleted all metadata" + f"for pid: {pid}", ) logging.info(info_string) - return + else: + # Delete a specific metadata file + entity = "metadata" + pid_doc = self._computehash(pid + checked_format_id) + # Wait for the pid to release if it's in use + sync_begin_debug_msg = ( + f"FileHashStore - delete_metadata: Adding pid: {pid} to locked list, " + + f"with format_id: {checked_format_id} with doc name: {pid_doc}" + ) + sync_wait_msg = ( + f"FileHashStore - 
delete_metadata: Pid: {pid} is locked for format_id:" + + f" {checked_format_id} with doc name: {pid_doc}. Waiting." + ) + if self.use_multiprocessing: + with self.metadata_condition_mp: + # Wait for the pid to release if it's in use + while pid in self.metadata_locked_pids_mp: + logging.debug(sync_wait_msg) + self.metadata_condition_mp.wait() + # Modify metadata_locked_pids consecutively + logging.debug(sync_begin_debug_msg) + self.metadata_locked_pids_mp.append(pid) else: - # Delete a specific metadata file - metadata_document_name = self._computehash(pid + checked_format_id) - full_path_without_directory = rel_path + "/" + metadata_document_name - metadata_exists = self._exists(entity, full_path_without_directory) - if metadata_exists: - self._delete(entity, full_path_without_directory) - + with self.metadata_condition: + while pid in self.metadata_locked_pids: + logging.debug(sync_wait_msg) + self.metadata_condition.wait() + logging.debug(sync_begin_debug_msg) + self.metadata_locked_pids.append(pid) + try: + full_path_without_directory = rel_path + "/" + pid_doc + self._delete(entity, full_path_without_directory) info_string = ( "FileHashStore - delete_metadata: Successfully deleted metadata for pid:" + f" {pid} for format_id: {format_id}" ) logging.info(info_string) - return - finally: - # Release pid - end_sync_debug_msg = ( - f"FileHashStore - delete_metadata: Releasing pid ({pid})" - + " from locked list" - ) - if self.use_multiprocessing: - with self.metadata_condition_mp: - logging.debug(end_sync_debug_msg) - self.metadata_locked_pids_mp.remove(pid) - self.metadata_condition_mp.notify() - else: - with self.metadata_condition: - logging.debug(end_sync_debug_msg) - self.metadata_locked_pids.remove(pid) - self.metadata_condition.notify() + finally: + # Release pid + end_sync_debug_msg = ( + f"FileHashStore - delete_metadata: Releasing pid doc ({pid_doc})" + + f" from locked list for pid: {pid} with format_id:" + + checked_format_id + ) + if 
self.use_multiprocessing: + with self.metadata_condition_mp: + logging.debug(end_sync_debug_msg) + self.metadata_locked_pids_mp.remove(pid) + self.metadata_condition_mp.notify() + else: + with self.metadata_condition: + logging.debug(end_sync_debug_msg) + self.metadata_locked_pids.remove(pid) + self.metadata_condition.notify() def get_hex_digest(self, pid, algorithm): logging.debug( From 6a9cc2b691ecbf48624c0dd89630bf440644e5dd Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 6 Jun 2024 11:15:52 -0700 Subject: [PATCH 252/420] Refactor 'delete_object' to call 'delete_metadata' to remove metadata docs instead of manually removal --- src/hashstore/filehashstore.py | 44 +++++++--------------------------- 1 file changed, 9 insertions(+), 35 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 6b123396..bdda05c3 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -973,10 +973,6 @@ def delete_object(self, ab_id, id_type=None): # id_type is "pid" pid = ab_id objects_to_delete = [] - # Get the metadata documents to minimize time spent in synchronization - rel_path = "/".join(self._shard(self._computehash(pid))) - metadata_rel_path = self._get_store_path("metadata") / rel_path - metadata_file_paths = self._get_file_paths(metadata_rel_path) # Storing and deleting objects are synchronized together # Duplicate store object requests for a pid are rejected, but deleting an object @@ -1061,17 +1057,13 @@ def delete_object(self, ab_id, id_type=None): objects_to_delete.append( self._rename_path_for_deletion(obj_real_path) ) - # Remove metadata files if they exist - if metadata_file_paths is not None: - for path in metadata_file_paths: - # Rename files by appending _delete to the file name - objects_to_delete.append( - self._rename_path_for_deletion(path) - ) # Remove all files confirmed for deletion for obj in objects_to_delete: os.remove(obj) + # Remove metadata files if they exist + 
self.delete_metadata(pid) + info_string = ( "FileHashStore - delete_object: Successfully deleted references," + f" metadata and object associated with pid: {pid}" @@ -1105,12 +1097,8 @@ def delete_object(self, ab_id, id_type=None): logging.warning(warn_msg) # Remove metadata files if they exist - if metadata_file_paths is not None: - for path in metadata_file_paths: - # Rename files by appending _delete to the file name - objects_to_delete.append( - self._rename_path_for_deletion(path) - ) + self.delete_metadata(pid) + # Remove all files confirmed for deletion for obj in objects_to_delete: os.remove(obj) @@ -1121,12 +1109,7 @@ def delete_object(self, ab_id, id_type=None): self._rename_path_for_deletion(self._resolve_path("pid", pid)) ) # Remove metadata files if they exist - if metadata_file_paths is not None: - for path in metadata_file_paths: - # Rename files by appending _delete to the file name - objects_to_delete.append( - self._rename_path_for_deletion(path) - ) + self.delete_metadata(pid) # Remove all files confirmed for deletion for obj in objects_to_delete: os.remove(obj) @@ -1146,12 +1129,7 @@ def delete_object(self, ab_id, id_type=None): if self._is_string_in_refs_file(pid, cid_ref_abs_path): self._update_refs_file(cid_ref_abs_path, pid, "remove") # Remove metadata files if they exist - if metadata_file_paths is not None: - for path in metadata_file_paths: - # Rename files by appending _delete to the file name - objects_to_delete.append( - self._rename_path_for_deletion(path) - ) + self.delete_metadata(pid) # Remove all files confirmed for deletion for obj in objects_to_delete: os.remove(obj) @@ -1162,12 +1140,8 @@ def delete_object(self, ab_id, id_type=None): objects_to_delete.append( self._rename_path_for_deletion(pid_ref_abs_path) ) - if metadata_file_paths is not None: - for path in metadata_file_paths: - # Rename files by appending _delete to the file name - objects_to_delete.append( - self._rename_path_for_deletion(path) - ) + # Remove metadata 
files if they exist + self.delete_metadata(pid) # Remove all files confirmed for deletion for obj in objects_to_delete: os.remove(obj) From 987bc69623640a79be6e31094783f7f37b71b276 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 6 Jun 2024 11:43:25 -0700 Subject: [PATCH 253/420] Rename 'metadata_locked_pids' to 'metadata_locked_docs' to improve clarity --- src/hashstore/filehashstore.py | 46 +++++++++++++++++----------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index bdda05c3..9cd1b4fd 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -64,7 +64,7 @@ class FileHashStore(HashStore): object_locked_pids = [] metadata_lock = threading.Lock() metadata_condition = threading.Condition(metadata_lock) - metadata_locked_pids = [] + metadata_locked_docs = [] reference_lock = threading.Lock() reference_condition = threading.Condition(reference_lock) reference_locked_cids = [] @@ -74,7 +74,7 @@ class FileHashStore(HashStore): object_locked_pids_mp = multiprocessing.Manager().list() metadata_lock_mp = multiprocessing.Lock() metadata_condition_mp = multiprocessing.Condition(metadata_lock_mp) - metadata_locked_pids_mp = multiprocessing.Manager().list() + metadata_locked_docs_mp = multiprocessing.Manager().list() reference_lock_mp = multiprocessing.Lock() reference_condition_mp = multiprocessing.Condition(reference_lock_mp) reference_locked_cids_mp = multiprocessing.Manager().list() @@ -819,19 +819,19 @@ def store_metadata(self, pid, metadata, format_id=None): if self.use_multiprocessing: with self.metadata_condition_mp: # Wait for the pid to release if it's in use - while pid_doc in self.metadata_locked_pids_mp: + while pid_doc in self.metadata_locked_docs_mp: logging.debug(sync_wait_msg) self.metadata_condition_mp.wait() - # Modify metadata_locked_pids consecutively + # Modify metadata_locked_docs consecutively logging.debug(sync_begin_debug_msg) - 
self.metadata_locked_pids_mp.append(pid_doc) + self.metadata_locked_docs_mp.append(pid_doc) else: with self.metadata_condition: - while pid_doc in self.metadata_locked_pids: + while pid_doc in self.metadata_locked_docs: logging.debug(sync_wait_msg) self.metadata_condition.wait() logging.debug(sync_begin_debug_msg) - self.metadata_locked_pids.append(pid_doc) + self.metadata_locked_docs.append(pid_doc) try: metadata_cid = self._put_metadata(metadata, pid, pid_doc) @@ -850,12 +850,12 @@ def store_metadata(self, pid, metadata, format_id=None): if self.use_multiprocessing: with self.metadata_condition_mp: logging.debug(end_sync_debug_msg) - self.metadata_locked_pids_mp.remove(pid_doc) + self.metadata_locked_docs_mp.remove(pid_doc) self.metadata_condition_mp.notify() else: with self.metadata_condition: logging.debug(end_sync_debug_msg) - self.metadata_locked_pids.remove(pid_doc) + self.metadata_locked_docs.remove(pid_doc) self.metadata_condition.notify() def retrieve_object(self, pid): @@ -1197,19 +1197,19 @@ def delete_metadata(self, pid, format_id=None): if self.use_multiprocessing: with self.metadata_condition_mp: # Wait for the pid to release if it's in use - while pid in self.metadata_locked_pids_mp: + while pid in self.metadata_locked_docs_mp: logging.debug(sync_wait_msg) self.metadata_condition_mp.wait() - # Modify metadata_locked_pids consecutively + # Modify metadata_locked_docs consecutively logging.debug(sync_begin_debug_msg) - self.metadata_locked_pids_mp.append(pid) + self.metadata_locked_docs_mp.append(pid) else: with self.metadata_condition: - while pid in self.metadata_locked_pids: + while pid in self.metadata_locked_docs: logging.debug(sync_wait_msg) self.metadata_condition.wait() logging.debug(sync_begin_debug_msg) - self.metadata_locked_pids.append(pid) + self.metadata_locked_docs.append(pid) try: # Mark metadata doc for deletion objects_to_delete.append(self._rename_path_for_deletion(path)) @@ -1223,12 +1223,12 @@ def delete_metadata(self, pid, 
format_id=None): if self.use_multiprocessing: with self.metadata_condition_mp: logging.debug(end_sync_debug_msg) - self.metadata_locked_pids_mp.remove(pid) + self.metadata_locked_docs_mp.remove(pid) self.metadata_condition_mp.notify() else: with self.metadata_condition: logging.debug(end_sync_debug_msg) - self.metadata_locked_pids.remove(pid) + self.metadata_locked_docs.remove(pid) self.metadata_condition.notify() # Delete metadata objects @@ -1255,19 +1255,19 @@ def delete_metadata(self, pid, format_id=None): if self.use_multiprocessing: with self.metadata_condition_mp: # Wait for the pid to release if it's in use - while pid in self.metadata_locked_pids_mp: + while pid in self.metadata_locked_docs_mp: logging.debug(sync_wait_msg) self.metadata_condition_mp.wait() - # Modify metadata_locked_pids consecutively + # Modify metadata_locked_docs consecutively logging.debug(sync_begin_debug_msg) - self.metadata_locked_pids_mp.append(pid) + self.metadata_locked_docs_mp.append(pid) else: with self.metadata_condition: - while pid in self.metadata_locked_pids: + while pid in self.metadata_locked_docs: logging.debug(sync_wait_msg) self.metadata_condition.wait() logging.debug(sync_begin_debug_msg) - self.metadata_locked_pids.append(pid) + self.metadata_locked_docs.append(pid) try: full_path_without_directory = rel_path + "/" + pid_doc self._delete(entity, full_path_without_directory) @@ -1286,12 +1286,12 @@ def delete_metadata(self, pid, format_id=None): if self.use_multiprocessing: with self.metadata_condition_mp: logging.debug(end_sync_debug_msg) - self.metadata_locked_pids_mp.remove(pid) + self.metadata_locked_docs_mp.remove(pid) self.metadata_condition_mp.notify() else: with self.metadata_condition: logging.debug(end_sync_debug_msg) - self.metadata_locked_pids.remove(pid) + self.metadata_locked_docs.remove(pid) self.metadata_condition.notify() def get_hex_digest(self, pid, algorithm): From 597a1ff00dadad83387e5a009218ccb1f5a8d3d2 Mon Sep 17 00:00:00 2001 From: Dou Mok 
Date: Thu, 6 Jun 2024 11:46:22 -0700 Subject: [PATCH 254/420] Fix bug in 'delete_metadata' where sync value was not updated to 'pid_doc' --- src/hashstore/filehashstore.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 9cd1b4fd..1b5bb7f0 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1202,14 +1202,14 @@ def delete_metadata(self, pid, format_id=None): self.metadata_condition_mp.wait() # Modify metadata_locked_docs consecutively logging.debug(sync_begin_debug_msg) - self.metadata_locked_docs_mp.append(pid) + self.metadata_locked_docs_mp.append(pid_doc) else: with self.metadata_condition: while pid in self.metadata_locked_docs: logging.debug(sync_wait_msg) self.metadata_condition.wait() logging.debug(sync_begin_debug_msg) - self.metadata_locked_docs.append(pid) + self.metadata_locked_docs.append(pid_doc) try: # Mark metadata doc for deletion objects_to_delete.append(self._rename_path_for_deletion(path)) @@ -1223,12 +1223,12 @@ def delete_metadata(self, pid, format_id=None): if self.use_multiprocessing: with self.metadata_condition_mp: logging.debug(end_sync_debug_msg) - self.metadata_locked_docs_mp.remove(pid) + self.metadata_locked_docs_mp.remove(pid_doc) self.metadata_condition_mp.notify() else: with self.metadata_condition: logging.debug(end_sync_debug_msg) - self.metadata_locked_docs.remove(pid) + self.metadata_locked_docs.remove(pid_doc) self.metadata_condition.notify() # Delete metadata objects @@ -1260,14 +1260,14 @@ def delete_metadata(self, pid, format_id=None): self.metadata_condition_mp.wait() # Modify metadata_locked_docs consecutively logging.debug(sync_begin_debug_msg) - self.metadata_locked_docs_mp.append(pid) + self.metadata_locked_docs_mp.append(pid_doc) else: with self.metadata_condition: while pid in self.metadata_locked_docs: logging.debug(sync_wait_msg) self.metadata_condition.wait() 
logging.debug(sync_begin_debug_msg) - self.metadata_locked_docs.append(pid) + self.metadata_locked_docs.append(pid_doc) try: full_path_without_directory = rel_path + "/" + pid_doc self._delete(entity, full_path_without_directory) @@ -1286,12 +1286,12 @@ def delete_metadata(self, pid, format_id=None): if self.use_multiprocessing: with self.metadata_condition_mp: logging.debug(end_sync_debug_msg) - self.metadata_locked_docs_mp.remove(pid) + self.metadata_locked_docs_mp.remove(pid_doc) self.metadata_condition_mp.notify() else: with self.metadata_condition: logging.debug(end_sync_debug_msg) - self.metadata_locked_docs.remove(pid) + self.metadata_locked_docs.remove(pid_doc) self.metadata_condition.notify() def get_hex_digest(self, pid, algorithm): From 229a11adbde64dd1dcee0e2fb96bd74a73eb373c Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 6 Jun 2024 12:42:07 -0700 Subject: [PATCH 255/420] Fix linting errors related to unused variables and typos --- src/hashstore/filehashstore.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 1b5bb7f0..2a158ba7 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -213,7 +213,6 @@ def _write_properties(self, properties): ] # Standardize algorithm value for cross-language compatibility - checked_store_algorithm = None # Note, this must be declared here because HashStore has not yet been initialized accepted_store_algorithms = ["MD5", "SHA-1", "SHA-256", "SHA-384", "SHA-512"] if store_algorithm in accepted_store_algorithms: @@ -1008,7 +1007,7 @@ def delete_object(self, ab_id, id_type=None): cid = self.find_object(pid) # Proceed with next steps - cid has been retrieved without any issues - # We must synchronized here based on the `cid` because multiple threads may + # We must synchronize here based on the `cid` because multiple threads may # try to access the `cid_reference_file` sync_begin_debug_msg = ( 
f"FileHashStore - delete_object: Cid ({cid}) to locked list." @@ -1571,7 +1570,7 @@ def _move_and_get_checksums( logging.debug(exception_string) raise PidObjectMetadataError(exception_string) from ve finally: - # Delete the temporary file, it already exists so it is redundant + # Delete the temporary file, it already exists, so it is redundant # No exception is thrown so 'store_object' can proceed to tag object self._delete(entity, tmp_file_name) @@ -2113,7 +2112,6 @@ def _check_arg_format_id(self, format_id, method): :return: Valid metadata namespace. :rtype: str """ - checked_format_id = None if format_id and not format_id.strip(): exception_string = f"FileHashStore - {method}: Format_id cannot be empty." logging.error(exception_string) @@ -2432,7 +2430,6 @@ def _count(self, entity): :rtype: int """ count = 0 - directory_to_count = "" if entity == "objects": directory_to_count = self.objects elif entity == "metadata": From aadaefc971915b185fc7097c8e4b7e09cc3d5dc4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 11 Jun 2024 09:42:09 -0700 Subject: [PATCH 256/420] Update 'HashStoreClient' class to use different module names based on 'knbvm' test flag --- src/hashstore/hashstoreclient.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/hashstore/hashstoreclient.py b/src/hashstore/hashstoreclient.py index a04ee8e0..86a93985 100644 --- a/src/hashstore/hashstoreclient.py +++ b/src/hashstore/hashstoreclient.py @@ -256,7 +256,10 @@ def __init__(self, properties, testflag=None): factory = HashStoreFactory() # Get HashStore from factory - module_name = "hashstore.filehashstore" + if testflag is "knbvm": + module_name = "filehashstore" + else: + module_name = "hashstore.filehashstore" class_name = "FileHashStore" # Set multiprocessing to true From c761585ec744f84d6538942596aeb4f5141852b6 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 17 Jun 2024 09:55:16 -0700 Subject: [PATCH 257/420] Update 'hashstore' interface for 'verify_object' - no longer 
returns boolean --- src/hashstore/hashstore.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 4e249018..a0940d52 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -88,8 +88,6 @@ def verify_object( :param str checksum: Value of the checksum. :param str checksum_algorithm: Algorithm of the checksum. :param int expected_file_size: Size of the temporary file. - - :return: bool - `True` if valid """ raise NotImplementedError() From eb73b51b20fb4ba59e59588a332ceabb1ff6b354 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 17 Jun 2024 10:00:56 -0700 Subject: [PATCH 258/420] Refactor '_verify_object_information' to throw new custom exception 'NonMatchingObjSize' and update pytests --- src/hashstore/filehashstore.py | 51 +++++++++++++------------- tests/test_filehashstore.py | 6 +-- tests/test_filehashstore_interface.py | 3 +- tests/test_filehashstore_references.py | 8 ++-- 4 files changed, 34 insertions(+), 34 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 2a158ba7..e9707d0a 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -565,29 +565,21 @@ def verify_object( object_metadata_file_size = object_metadata.obj_size checksum_algorithm_checked = self._clean_algorithm(checksum_algorithm) - try: - self._verify_object_information( - pid=None, - checksum=checksum, - checksum_algorithm=checksum_algorithm_checked, - entity="objects", - hex_digests=object_metadata_hex_digests, - tmp_file_name=None, - tmp_file_size=object_metadata_file_size, - file_size_to_validate=expected_file_size, - ) - logging.info( - "FileHashStore - verify_object: object has been validated for cid: %s", - object_metadata.cid, - ) - return True - # pylint: disable=W0718 - except Exception as err: - exception_string = ( - f"FileHashStore - verify_object: object not valid: {err}." 
- ) - logging.info(exception_string) - return False + # Throws exceptions if there's an issue + self._verify_object_information( + pid=None, + checksum=checksum, + checksum_algorithm=checksum_algorithm_checked, + entity="objects", + hex_digests=object_metadata_hex_digests, + tmp_file_name=None, + tmp_file_size=object_metadata_file_size, + file_size_to_validate=expected_file_size, + ) + logging.info( + "FileHashStore - verify_object: object has been validated for cid: %s", + object_metadata.cid, + ) def tag_object(self, pid, cid): logging.debug( @@ -1929,10 +1921,10 @@ def _verify_object_information( + f" Tmp file deleted and file not stored for pid: {pid}" ) logging.debug(exception_string_for_pid) - raise ValueError(exception_string_for_pid) + raise NonMatchingObjSize(exception_string_for_pid) else: logging.debug(exception_string) - raise ValueError(exception_string) + raise NonMatchingObjSize(exception_string) if checksum_algorithm is not None and checksum is not None: if checksum_algorithm not in hex_digests: exception_string = ( @@ -2612,3 +2604,12 @@ class PidNotFoundInCidRefsFile(Exception): def __init__(self, message, errors=None): super().__init__(message) self.errors = errors + + +class NonMatchingObjSize(Exception): + """Custom exception thrown when pid reference file exists with a cid, but + the respective cid reference file does not contain the pid.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 44c14358..faac041e 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -4,7 +4,7 @@ import os from pathlib import Path import pytest -from hashstore.filehashstore import FileHashStore +from hashstore.filehashstore import FileHashStore, NonMatchingObjSize # pylint: disable=W0212 @@ -457,7 +457,7 @@ def test_move_and_get_checksums_incorrect_file_size(pids, store): """Test move and get checksum raises error with 
an incorrect file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): - with pytest.raises(ValueError): + with pytest.raises(NonMatchingObjSize): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") incorrect_file_size = 1000 @@ -719,7 +719,7 @@ def test_verify_object_information_incorrect_size(pids, store): hex_digests = object_metadata.hex_digests checksum = hex_digests.get(store.algorithm) checksum_algorithm = store.algorithm - with pytest.raises(ValueError): + with pytest.raises(NonMatchingObjSize): # pylint: disable=W0212 store._verify_object_information( None, diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 83586ff5..40ebed93 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -11,6 +11,7 @@ from hashstore.filehashstore import ( CidRefsDoesNotExist, + NonMatchingObjSize, PidObjectMetadataError, PidNotFoundInCidRefsFile, PidRefsDoesNotExist, @@ -448,7 +449,7 @@ def test_store_object_with_obj_file_size_incorrect(store, pids): for pid in pids.keys(): obj_file_size = 1234 path = test_dir + pid.replace("/", "_") - with pytest.raises(ValueError): + with pytest.raises(NonMatchingObjSize): store.store_object(pid, path, expected_object_size=obj_file_size) diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 22a4c7ba..61e320fb 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -4,7 +4,7 @@ import shutil import pytest -from hashstore.filehashstore import PidAlreadyExistsError +from hashstore.filehashstore import NonMatchingObjSize, PidAlreadyExistsError # pylint: disable=W0212 @@ -184,10 +184,8 @@ def test_verify_object_exception_incorrect_size(pids, store): checksum = object_metadata.hex_digests.get(store.algorithm) checksum_algorithm = store.algorithm - is_valid = store.verify_object( - object_metadata, checksum, checksum_algorithm, 1000 - ) - assert 
not is_valid + with pytest.raises(NonMatchingObjSize): + store.verify_object(object_metadata, checksum, checksum_algorithm, 1000) cid = object_metadata.cid cid = object_metadata.hex_digests[store.algorithm] From b75f05e4f10eefda8a3b7abad9317045614dfc35 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 17 Jun 2024 10:18:24 -0700 Subject: [PATCH 259/420] Delete redundant code from 'store_object', refactor '_move_and_get_checksums' to catch and re-raise custom exceptions, delete no longer used custom exception 'PidObjectMetadatError' and update pytests --- src/hashstore/filehashstore.py | 55 +++++++++++++------------- tests/test_filehashstore.py | 10 +++-- tests/test_filehashstore_interface.py | 3 +- tests/test_filehashstore_references.py | 14 ++++--- 4 files changed, 46 insertions(+), 36 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index e9707d0a..5cdc77a8 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -507,14 +507,6 @@ def store_object( "FileHashStore - store_object: Successfully stored object for pid: %s", pid, ) - except PidObjectMetadataError as ome: - exception_string = ( - f"FileHashStore - store_object: failed to store object for pid: {pid}." - + " Reference files will not be created or tagged. PidObjectMetadataError: " - + str(ome) - ) - logging.error(exception_string) - raise ome except Exception as err: exception_string = ( f"FileHashStore - store_object: failed to store object for pid: {pid}." 
@@ -1540,7 +1532,7 @@ def _move_and_get_checksums( logging.warning("FileHashStore - _move_and_get_checksums: %s", err_msg) raise else: - # If the file exists, determine if the object is what the client states it to be + # If the data object already exists, do not move the file but attempt to verify it try: self._verify_object_information( pid, @@ -1552,17 +1544,26 @@ def _move_and_get_checksums( tmp_file_size, file_size_to_validate, ) - except ValueError as ve: - # If any exception is thrown during validation, + except NonMatchingObjSize as nmose: + # If any exception is thrown during validation, we do not tag. + exception_string = ( + f"FileHashStore - _move_and_get_checksums: Object already exists for pid: {pid}" + + " , deleting temp file. Reference files will not be created and/or tagged" + + f" due to an issue with the supplied pid object metadata. {str(nmose)}" + ) + logging.debug(exception_string) + raise NonMatchingObjSize(exception_string) from nmose + except NonMatchingChecksum as nmce: + # If any exception is thrown during validation, we do not tag. exception_string = ( f"FileHashStore - _move_and_get_checksums: Object already exists for pid: {pid}" + " , deleting temp file. Reference files will not be created and/or tagged" - + f" due to an issue with the supplied pid object metadata. {ve}" + + f" due to an issue with the supplied pid object metadata. {str(nmce)}" ) logging.debug(exception_string) - raise PidObjectMetadataError(exception_string) from ve + raise NonMatchingChecksum(exception_string) from nmce finally: - # Delete the temporary file, it already exists, so it is redundant + # Delete the temporary file, the data object already exists, so it is redundant # No exception is thrown so 'store_object' can proceed to tag object self._delete(entity, tmp_file_name) @@ -1949,14 +1950,14 @@ def _verify_object_information( exception_string + f" Tmp file ({tmp_file_name}) deleted." 
) logging.debug(exception_string_for_pid) - raise ValueError(exception_string_for_pid) + raise NonMatchingChecksum(exception_string_for_pid) else: # Delete the object cid = hex_digests[self.algorithm] cid_abs_path = self._resolve_path("cid", cid) self._delete(entity, cid_abs_path) logging.debug(exception_string) - raise ValueError(exception_string) + raise NonMatchingChecksum(exception_string) def _verify_hashstore_references( self, @@ -2563,15 +2564,6 @@ def __init__(self, message, errors=None): self.errors = errors -class PidObjectMetadataError(Exception): - """Custom exception thrown when an object cannot be verified due - to an error with the metadata provided to validate against.""" - - def __init__(self, message, errors=None): - super().__init__(message) - self.errors = errors - - class PidRefsDoesNotExist(Exception): """Custom exception thrown when a pid refs file does not exist.""" @@ -2607,8 +2599,17 @@ def __init__(self, message, errors=None): class NonMatchingObjSize(Exception): - """Custom exception thrown when pid reference file exists with a cid, but - the respective cid reference file does not contain the pid.""" + """Custom exception thrown when verifying an object and the expected file size + does not match what has been calculated.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors + + +class NonMatchingChecksum(Exception): + """Custom exception thrown when verifying an object and the expected checksum + does not match what has been calculated.""" def __init__(self, message, errors=None): super().__init__(message) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index faac041e..ad7afcb0 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -4,7 +4,11 @@ import os from pathlib import Path import pytest -from hashstore.filehashstore import FileHashStore, NonMatchingObjSize +from hashstore.filehashstore import ( + FileHashStore, + NonMatchingChecksum, + 
NonMatchingObjSize, +) # pylint: disable=W0212 @@ -322,7 +326,7 @@ def test_store_and_validate_data_with_incorrect_checksum(pids, store): algo = "sha224" algo_checksum = "badChecksumValue" path = test_dir + pid.replace("/", "_") - with pytest.raises(ValueError): + with pytest.raises(NonMatchingChecksum): store._store_and_validate_data( pid, path, checksum=algo_checksum, checksum_algorithm=algo ) @@ -441,7 +445,7 @@ def test_move_and_get_checksums_raises_error_with_nonmatching_checksum(pids, sto for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - with pytest.raises(ValueError): + with pytest.raises(NonMatchingChecksum): # pylint: disable=W0212 store._move_and_get_checksums( pid, diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 40ebed93..f0c6b4b5 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -11,6 +11,7 @@ from hashstore.filehashstore import ( CidRefsDoesNotExist, + NonMatchingChecksum, NonMatchingObjSize, PidObjectMetadataError, PidNotFoundInCidRefsFile, @@ -422,7 +423,7 @@ def test_store_object_duplicate_raises_error_with_bad_validation_data(pids, stor # Store first blob _object_metadata_one = store.store_object(pid, path) # Store second blob - with pytest.raises(PidObjectMetadataError): + with pytest.raises(NonMatchingChecksum): _object_metadata_two = store.store_object( pid, path, checksum="nonmatchingchecksum", checksum_algorithm="sha256" ) diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 61e320fb..3d1bc24e 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -4,7 +4,11 @@ import shutil import pytest -from hashstore.filehashstore import NonMatchingObjSize, PidAlreadyExistsError +from hashstore.filehashstore import ( + NonMatchingChecksum, + NonMatchingObjSize, + PidAlreadyExistsError, +) # pylint: disable=W0212 @@ 
-204,10 +208,10 @@ def test_verify_object_exception_incorrect_checksum(pids, store): checksum_algorithm = store.algorithm expected_file_size = object_metadata.obj_size - is_valid = store.verify_object( - object_metadata, "abc123", checksum_algorithm, expected_file_size - ) - assert not is_valid + with pytest.raises(NonMatchingChecksum): + is_valid = store.verify_object( + object_metadata, "abc123", checksum_algorithm, expected_file_size + ) cid = object_metadata.cid cid = object_metadata.hex_digests[store.algorithm] From c2878ca21c22b39f04b27dcd4c68a63bc69c60de Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 17 Jun 2024 10:19:44 -0700 Subject: [PATCH 260/420] Removed 'PidObjectMetadataError' import from 'test_filehashstore_interface' module --- tests/test_filehashstore_interface.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index f0c6b4b5..27e0cf83 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -13,7 +13,6 @@ CidRefsDoesNotExist, NonMatchingChecksum, NonMatchingObjSize, - PidObjectMetadataError, PidNotFoundInCidRefsFile, PidRefsDoesNotExist, RefsFileExistsButCidObjMissing, From 4f3e198ceb612d0a0cafcb45741f3d17456af24a Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 17 Jun 2024 10:32:56 -0700 Subject: [PATCH 261/420] Revise '_clean_algorithm' to throw new custom exception 'UnsupportedAlgorithm' to improve clarity, and add new pytest --- src/hashstore/filehashstore.py | 14 +++++++++++++- tests/test_filehashstore.py | 5 +++-- tests/test_filehashstore_interface.py | 22 +++++++++++++++++++--- tests/test_filehashstore_references.py | 3 ++- 4 files changed, 37 insertions(+), 7 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 5cdc77a8..7377111a 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1928,6 +1928,9 @@ def _verify_object_information( raise 
NonMatchingObjSize(exception_string) if checksum_algorithm is not None and checksum is not None: if checksum_algorithm not in hex_digests: + # Check to see if it is a supported algorithm + self._clean_algorithm(checksum_algorithm) + # TODO: If so, calculate the checksum and compare it exception_string = ( "FileHashStore - _verify_object_information: checksum_algorithm" + f" ({checksum_algorithm}) cannot be found in the hex digests dictionary." @@ -2176,7 +2179,7 @@ def _clean_algorithm(self, algorithm_string): + cleaned_string ) logging.error(exception_string) - raise ValueError(exception_string) + raise UnsupportedAlgorithm(exception_string) return cleaned_string def _computehash(self, stream, algorithm=None): @@ -2614,3 +2617,12 @@ class NonMatchingChecksum(Exception): def __init__(self, message, errors=None): super().__init__(message) self.errors = errors + + +class UnsupportedAlgorithm(Exception): + """Custom exception thrown when a given algorithm is not supported in HashStore for + calculating hashes/checksums, as the default store algo and/or other operations.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index ad7afcb0..aac5c0d1 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -8,6 +8,7 @@ FileHashStore, NonMatchingChecksum, NonMatchingObjSize, + UnsupportedAlgorithm, ) # pylint: disable=W0212 @@ -608,12 +609,12 @@ def test_write_to_tmp_file_and_get_hex_digests_with_unsupported_algorithm(pids, path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") algo = "md2" - with pytest.raises(ValueError): + with pytest.raises(UnsupportedAlgorithm): # pylint: disable=W0212 _, _, _ = store._write_to_tmp_file_and_get_hex_digests( input_stream, additional_algorithm=algo ) - with pytest.raises(ValueError): + with pytest.raises(UnsupportedAlgorithm): # pylint: disable=W0212 _, _, _ = 
store._write_to_tmp_file_and_get_hex_digests( input_stream, checksum_algorithm=algo diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 27e0cf83..7d3b558c 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -16,6 +16,7 @@ PidNotFoundInCidRefsFile, PidRefsDoesNotExist, RefsFileExistsButCidObjMissing, + UnsupportedAlgorithm, ) # pylint: disable=W0212 @@ -178,7 +179,7 @@ def test_store_object_additional_algorithm_invalid(store): pid = "jtao.1700.1" path = test_dir + pid algorithm_not_in_list = "abc" - with pytest.raises(ValueError, match="Algorithm not supported"): + with pytest.raises(UnsupportedAlgorithm): store.store_object(pid, path, algorithm_not_in_list) @@ -345,11 +346,26 @@ def test_store_object_checksum_incorrect_checksum(store): test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid + algorithm_other = "sha224" + checksum_incorrect = ( + "bbbb069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" + ) + with pytest.raises(NonMatchingChecksum): + store.store_object( + pid, path, checksum=checksum_incorrect, checksum_algorithm=algorithm_other + ) + + +def test_store_object_checksum_unsupported_checksum_algo(store): + """Test store object raises error when supplied with unsupported checksum algo.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid algorithm_other = "sha3_256" checksum_incorrect = ( "bbbb069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) - with pytest.raises(ValueError): + with pytest.raises(UnsupportedAlgorithm): store.store_object( pid, path, checksum=algorithm_other, checksum_algorithm=checksum_incorrect ) @@ -1304,7 +1320,7 @@ def test_get_hex_digest_pid_unsupported_algorithm(store): syspath.read_bytes() _object_metadata = store.store_object(pid, path) algorithm = "sm3" - with pytest.raises(ValueError): + with pytest.raises(UnsupportedAlgorithm): store.get_hex_digest(pid, algorithm) diff 
--git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 3d1bc24e..e993ad21 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -8,6 +8,7 @@ NonMatchingChecksum, NonMatchingObjSize, PidAlreadyExistsError, + UnsupportedAlgorithm, ) # pylint: disable=W0212 @@ -227,7 +228,7 @@ def test_verify_object_exception_incorrect_checksum_algo(pids, store): object_metadata = store.store_object(data=path) checksum = object_metadata.hex_digests.get(store.algorithm) expected_file_size = object_metadata.obj_size - with pytest.raises(ValueError): + with pytest.raises(UnsupportedAlgorithm): store.verify_object(object_metadata, checksum, "md2", expected_file_size) From 9d405e87a92207cc26a10687655c475da223a8dd Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 17 Jun 2024 11:10:32 -0700 Subject: [PATCH 262/420] Implement to-do item in 'verify_object' when called with an algo to verify against that is not found in the default list, and update pytests --- src/hashstore/filehashstore.py | 29 ++++++++++++++++----- tests/test_filehashstore.py | 36 +++++++++++++++++++++++--- tests/test_filehashstore_references.py | 35 ++++++++++++++++++++----- 3 files changed, 83 insertions(+), 17 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 7377111a..50c19cd9 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1930,13 +1930,28 @@ def _verify_object_information( if checksum_algorithm not in hex_digests: # Check to see if it is a supported algorithm self._clean_algorithm(checksum_algorithm) - # TODO: If so, calculate the checksum and compare it - exception_string = ( - "FileHashStore - _verify_object_information: checksum_algorithm" - + f" ({checksum_algorithm}) cannot be found in the hex digests dictionary." 
- ) - logging.debug(exception_string) - raise KeyError(exception_string) + # If so, calculate the checksum and compare it + if tmp_file_name is not None and pid is not None: + # Calculate the checksum from the tmp file + hex_digest_calculated = self._computehash( + tmp_file_name, algorithm=checksum_algorithm + ) + else: + # Otherwise, a data object has been stored without a pid + object_cid = hex_digests[self.algorithm] + cid_stream = self._open(entity, object_cid) + hex_digest_calculated = self._computehash( + cid_stream, algorithm=checksum_algorithm + ) + if hex_digest_calculated != checksum: + exception_string = ( + "FileHashStore - _verify_object_information: checksum_algorithm" + + f" ({checksum_algorithm}) cannot be found in the default hex digests" + + " dict, but is supported. New checksum calculated but does not match" + + " what has been provided." + ) + logging.debug(exception_string) + raise NonMatchingChecksum(exception_string) else: hex_digest_stored = hex_digests[checksum_algorithm] if hex_digest_stored != checksum.lower(): diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index aac5c0d1..11a29ea2 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -768,23 +768,51 @@ def test_verify_object_information_incorrect_size_with_pid(pids, store): assert not os.path.isfile(tmp_file.name) -def test_verify_object_information_missing_key_in_hex_digests(pids, store): +def test_verify_object_information_missing_key_in_hex_digests_unsupported_algo( + pids, store +): """Test _verify_object_information throws exception when algorithm is not found - in hex digests.""" + in hex digests and is not supported.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(data=path) checksum = object_metadata.hex_digests.get(store.algorithm) - checksum_algorithm = "blake2s" + checksum_algorithm = "md10" expected_file_size = object_metadata.obj_size - with 
pytest.raises(KeyError): + with pytest.raises(UnsupportedAlgorithm): # pylint: disable=W0212 store._verify_object_information( None, checksum, checksum_algorithm, + "objects", + object_metadata.hex_digests, + None, + expected_file_size, + expected_file_size, + ) + + +def test_verify_object_information_missing_key_in_hex_digests_supported_algo( + pids, store +): + """Test _verify_object_information throws exception when algorithm is not found + in hex digests but is supported, however the checksum calculated does not match.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = "blake2s" + expected_file_size = object_metadata.obj_size + with pytest.raises(NonMatchingChecksum): + # pylint: disable=W0212 + store._verify_object_information( None, + checksum, + checksum_algorithm, + "objects", object_metadata.hex_digests, None, expected_file_size, diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index e993ad21..ea5d1a4d 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -151,7 +151,7 @@ def test_tag_object_pid_refs_not_found_cid_refs_found(store): def test_verify_object(pids, store): - """Test verify_object succeeds given good arguments.""" + """Test verify_object does not throw exception given good arguments.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -164,8 +164,19 @@ def test_verify_object(pids, store): ) +def test_verify_object_supported_other_algo_not_in_default(pids, store): + """Test verify_object throws exception when incorrect algorithm is supplied.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum = pids[pid]["sha224"] + 
expected_file_size = object_metadata.obj_size + store.verify_object(object_metadata, checksum, "sha224", expected_file_size) + + def test_verify_object_exception_incorrect_object_metadata_type(pids, store): - """Test verify_object returns false when incorrect object is given to + """Test verify_object throws exception when incorrect object is given to object_metadata arg.""" test_dir = "tests/testdata/" for pid in pids.keys(): @@ -181,7 +192,7 @@ def test_verify_object_exception_incorrect_object_metadata_type(pids, store): def test_verify_object_exception_incorrect_size(pids, store): - """Test verify_object returns false when incorrect size is supplied.""" + """Test verify_object throws exception when incorrect size is supplied.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -199,7 +210,7 @@ def test_verify_object_exception_incorrect_size(pids, store): def test_verify_object_exception_incorrect_checksum(pids, store): - """Test verify_object returns false when incorrect checksum is supplied.""" + """Test verify_object throws exception when incorrect checksum is supplied.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -210,7 +221,7 @@ def test_verify_object_exception_incorrect_checksum(pids, store): expected_file_size = object_metadata.obj_size with pytest.raises(NonMatchingChecksum): - is_valid = store.verify_object( + store.verify_object( object_metadata, "abc123", checksum_algorithm, expected_file_size ) @@ -221,7 +232,7 @@ def test_verify_object_exception_incorrect_checksum(pids, store): def test_verify_object_exception_incorrect_checksum_algo(pids, store): - """Test verify_object returns false when incorrect algorithm is supplied.""" + """Test verify_object throws exception when unsupported algorithm is supplied.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -232,6 +243,18 @@ def 
test_verify_object_exception_incorrect_checksum_algo(pids, store): store.verify_object(object_metadata, checksum, "md2", expected_file_size) +def test_verify_object_exception_supported_other_algo_bad_checksum(pids, store): + """Test verify_object throws exception when incorrect algorithm is supplied.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum = object_metadata.hex_digests.get(store.algorithm) + expected_file_size = object_metadata.obj_size + with pytest.raises(NonMatchingChecksum): + store.verify_object(object_metadata, checksum, "sha224", expected_file_size) + + def test_write_refs_file_ref_type_cid(store): """Test that write_refs_file writes a reference file.""" tmp_root_path = store._get_store_path("refs") / "tmp" From 669740425725273155d74f93c106012d47c0cb2c Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 17 Jun 2024 11:15:57 -0700 Subject: [PATCH 263/420] Fix minor bug in 'hashstoreclient' when checking for knbvm testflag --- src/hashstore/hashstoreclient.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hashstore/hashstoreclient.py b/src/hashstore/hashstoreclient.py index 86a93985..5a77c5aa 100644 --- a/src/hashstore/hashstoreclient.py +++ b/src/hashstore/hashstoreclient.py @@ -256,7 +256,7 @@ def __init__(self, properties, testflag=None): factory = HashStoreFactory() # Get HashStore from factory - if testflag is "knbvm": + if testflag: module_name = "filehashstore" else: module_name = "hashstore.filehashstore" From 2cf9b459011d6273d81e6f335e67474516c6d5f0 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 17 Jun 2024 11:38:38 -0700 Subject: [PATCH 264/420] Refactor 'verify_hashstore_references' to throw custom exception classes for improved clarity, and update pytests --- src/hashstore/filehashstore.py | 44 +++++++++++++++++++++++--- tests/test_filehashstore.py | 11 +++++-- tests/test_filehashstore_references.py | 28 
+++++++++------- 3 files changed, 66 insertions(+), 17 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 50c19cd9..af021aaa 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2012,7 +2012,7 @@ def _verify_hashstore_references( + f" . Additional Context: {additional_log_string}" ) logging.error(exception_string) - raise FileNotFoundError(exception_string) + raise PidRefsFileNotFound(exception_string) if not os.path.exists(cid_refs_path): exception_string = ( "FileHashStore - _verify_hashstore_references: Cid refs file missing: " @@ -2020,7 +2020,7 @@ def _verify_hashstore_references( + f" . Additional Context: {additional_log_string}" ) logging.error(exception_string) - raise FileNotFoundError(exception_string) + raise CidRefsFileNotFound(exception_string) # Check the content of the reference files # Start with the cid with open(pid_refs_path, "r", encoding="utf8") as f: @@ -2032,7 +2032,7 @@ def _verify_hashstore_references( + f" Additional Context: {additional_log_string}" ) logging.error(exception_string) - raise ValueError(exception_string) + raise PidRefsContentError(exception_string) # Then the pid pid_found = self._is_string_in_refs_file(pid, cid_refs_path) if not pid_found: @@ -2042,7 +2042,7 @@ def _verify_hashstore_references( + f" Additional Context: {additional_log_string}" ) logging.error(exception_string) - raise ValueError(exception_string) + raise CidRefsContentError(exception_string) @staticmethod def _check_arg_data(data): @@ -2572,6 +2572,42 @@ def close(self): self._obj.seek(self._pos) +class PidRefsFileNotFound(Exception): + """Custom exception thrown when verifying reference files and a pid refs + file is not found.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors + + +class CidRefsFileNotFound(Exception): + """Custom exception thrown when verifying reference files and a cid refs + file is not found.""" + + def 
__init__(self, message, errors=None): + super().__init__(message) + self.errors = errors + + +class PidRefsContentError(Exception): + """Custom exception thrown when verifying reference files and a pid refs + file does not contain the cid that it is expected.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors + + +class CidRefsContentError(Exception): + """Custom exception thrown when verifying reference files and a cid refs + file does not have a pid that is expected to be found.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors + + class PidAlreadyExistsError(Exception): """Custom exception thrown when a client calls 'tag_object' and the pid that is being tagged is already accounted for (has a pid refs file and diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 11a29ea2..c1e297ba 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -523,7 +523,7 @@ def test_write_to_tmp_file_and_get_hex_digests_checksum_and_additional_algo(stor path = test_dir + pid input_stream = io.open(path, "rb") additional_algo = "sha224" - additional_algo_checksum = ( + additional_algo_checksum_correct = ( "9b3a96f434f3c894359193a63437ef86fbd5a1a1a6cc37f1d5013ac1" ) checksum_algo = "sha3_256" @@ -538,7 +538,7 @@ def test_write_to_tmp_file_and_get_hex_digests_checksum_and_additional_algo(stor ) input_stream.close() assert hex_digests.get("sha3_256") == checksum_correct - assert hex_digests.get("sha224") == additional_algo_checksum + assert hex_digests.get("sha224") == additional_algo_checksum_correct def test_write_to_tmp_file_and_get_hex_digests_checksum_and_additional_algo_duplicate( @@ -833,6 +833,13 @@ def test_clean_algorithm(store): assert cleaned_algo_other_hyphen == "sha3_256" +def test_clean_algorithm_unsupported_algo(store): + """Check that algorithm values get formatted as expected.""" + algorithm_unsupported = "mok22" + with 
pytest.raises(UnsupportedAlgorithm): + _ = store._clean_algorithm(algorithm_unsupported) + + def test_computehash(pids, store): """Test to check computehash method.""" test_dir = "tests/testdata/" diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index ea5d1a4d..e9f6c92d 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -5,9 +5,13 @@ import pytest from hashstore.filehashstore import ( + CidRefsContentError, + CidRefsFileNotFound, NonMatchingChecksum, NonMatchingObjSize, PidAlreadyExistsError, + PidRefsContentError, + PidRefsFileNotFound, UnsupportedAlgorithm, ) @@ -61,7 +65,7 @@ def test_tag_object_pid_refs_file_content(pids, store): def test_tag_object_cid_refs_file_content(pids, store): - """Test tag_object creates the cid reference file successfully with pid.""" + """Test tag_object creates the cid reference file successfully with pid tagged.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -168,15 +172,18 @@ def test_verify_object_supported_other_algo_not_in_default(pids, store): """Test verify_object throws exception when incorrect algorithm is supplied.""" test_dir = "tests/testdata/" for pid in pids.keys(): + supported_algo = "sha224" path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(data=path) - checksum = pids[pid]["sha224"] + checksum = pids[pid][supported_algo] expected_file_size = object_metadata.obj_size - store.verify_object(object_metadata, checksum, "sha224", expected_file_size) + store.verify_object( + object_metadata, checksum, supported_algo, expected_file_size + ) def test_verify_object_exception_incorrect_object_metadata_type(pids, store): - """Test verify_object throws exception when incorrect object is given to + """Test verify_object throws exception when incorrect class type is given to object_metadata arg.""" test_dir = "tests/testdata/" for pid in pids.keys(): @@ -244,7 
+251,7 @@ def test_verify_object_exception_incorrect_checksum_algo(pids, store): def test_verify_object_exception_supported_other_algo_bad_checksum(pids, store): - """Test verify_object throws exception when incorrect algorithm is supplied.""" + """Test verify_object throws exception when incorrect checksum is supplied.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -346,7 +353,6 @@ def test_update_refs_file_remove(pids, store): with open(tmp_cid_refs_file, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): value = line.strip() - print(value) assert value == pid_other @@ -387,7 +393,7 @@ def test_verify_hashstore_references_pid_refs_file_missing(pids, store): """Test _verify_hashstore_references throws exception when pid refs file is missing.""" for pid in pids.keys(): cid = pids[pid]["sha256"] - with pytest.raises(FileNotFoundError): + with pytest.raises(PidRefsFileNotFound): store._verify_hashstore_references(pid, cid) @@ -410,7 +416,7 @@ def test_verify_hashstore_references_pid_refs_incorrect_cid(pids, store): tmp_pid_refs_file = store._write_refs_file(tmp_root_path, "bad_cid", "pid") shutil.move(tmp_pid_refs_file, pid_ref_abs_path) - with pytest.raises(ValueError): + with pytest.raises(PidRefsContentError): store._verify_hashstore_references(pid, cid) @@ -424,7 +430,7 @@ def test_verify_hashstore_references_cid_refs_file_missing(pids, store): tmp_pid_refs_file = store._write_refs_file(tmp_root_path, "bad_cid", "pid") shutil.move(tmp_pid_refs_file, pid_ref_abs_path) - with pytest.raises(FileNotFoundError): + with pytest.raises(CidRefsFileNotFound): store._verify_hashstore_references(pid, cid) @@ -446,7 +452,7 @@ def test_verify_hashstore_references_cid_refs_file_missing_pid(pids, store): tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") shutil.move(tmp_pid_refs_file, pid_ref_abs_path) - with pytest.raises(ValueError): + with pytest.raises(CidRefsContentError): 
store._verify_hashstore_references(pid, cid) @@ -475,5 +481,5 @@ def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pi store._update_refs_file(cid_ref_abs_path, f"dou.test.{i}", "add") cid_reference_list.append(f"dou.test.{i}") - with pytest.raises(ValueError): + with pytest.raises(CidRefsContentError): store._verify_hashstore_references(pid, cid) From 97cd1e41ff0c343a7927ff07a68575f017697536 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 17 Jun 2024 11:43:52 -0700 Subject: [PATCH 265/420] Create new 'filehashstore_exceptions' module and refactor filehashstore & update pytests --- src/hashstore/filehashstore.py | 121 +++------------------- src/hashstore/filehashstore_exceptions.py | 105 +++++++++++++++++++ tests/test_filehashstore.py | 4 +- tests/test_filehashstore_interface.py | 2 +- tests/test_filehashstore_references.py | 2 +- 5 files changed, 123 insertions(+), 111 deletions(-) create mode 100644 src/hashstore/filehashstore_exceptions.py diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index af021aaa..f701322e 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -14,6 +14,20 @@ import fcntl import yaml from hashstore import HashStore, ObjectMetadata +from hashstore.filehashstore_exceptions import ( + CidRefsContentError, + CidRefsDoesNotExist, + CidRefsFileNotFound, + NonMatchingChecksum, + NonMatchingObjSize, + PidAlreadyExistsError, + PidNotFoundInCidRefsFile, + PidRefsContentError, + PidRefsDoesNotExist, + PidRefsFileNotFound, + RefsFileExistsButCidObjMissing, + UnsupportedAlgorithm, +) class FileHashStore(HashStore): @@ -2570,110 +2584,3 @@ def close(self): self._obj.close() else: self._obj.seek(self._pos) - - -class PidRefsFileNotFound(Exception): - """Custom exception thrown when verifying reference files and a pid refs - file is not found.""" - - def __init__(self, message, errors=None): - super().__init__(message) - self.errors = errors - - -class 
CidRefsFileNotFound(Exception): - """Custom exception thrown when verifying reference files and a cid refs - file is not found.""" - - def __init__(self, message, errors=None): - super().__init__(message) - self.errors = errors - - -class PidRefsContentError(Exception): - """Custom exception thrown when verifying reference files and a pid refs - file does not contain the cid that it is expected.""" - - def __init__(self, message, errors=None): - super().__init__(message) - self.errors = errors - - -class CidRefsContentError(Exception): - """Custom exception thrown when verifying reference files and a cid refs - file does not have a pid that is expected to be found.""" - - def __init__(self, message, errors=None): - super().__init__(message) - self.errors = errors - - -class PidAlreadyExistsError(Exception): - """Custom exception thrown when a client calls 'tag_object' and the pid - that is being tagged is already accounted for (has a pid refs file and - is found in the cid refs file).""" - - def __init__(self, message, errors=None): - super().__init__(message) - self.errors = errors - - -class PidRefsDoesNotExist(Exception): - """Custom exception thrown when a pid refs file does not exist.""" - - def __init__(self, message, errors=None): - super().__init__(message) - self.errors = errors - - -class CidRefsDoesNotExist(Exception): - """Custom exception thrown when a cid refs file does not exist.""" - - def __init__(self, message, errors=None): - super().__init__(message) - self.errors = errors - - -class RefsFileExistsButCidObjMissing(Exception): - """Custom exception thrown when pid and cid refs file exists, but the - cid object does not.""" - - def __init__(self, message, errors=None): - super().__init__(message) - self.errors = errors - - -class PidNotFoundInCidRefsFile(Exception): - """Custom exception thrown when pid reference file exists with a cid, but - the respective cid reference file does not contain the pid.""" - - def __init__(self, message, 
errors=None): - super().__init__(message) - self.errors = errors - - -class NonMatchingObjSize(Exception): - """Custom exception thrown when verifying an object and the expected file size - does not match what has been calculated.""" - - def __init__(self, message, errors=None): - super().__init__(message) - self.errors = errors - - -class NonMatchingChecksum(Exception): - """Custom exception thrown when verifying an object and the expected checksum - does not match what has been calculated.""" - - def __init__(self, message, errors=None): - super().__init__(message) - self.errors = errors - - -class UnsupportedAlgorithm(Exception): - """Custom exception thrown when a given algorithm is not supported in HashStore for - calculating hashes/checksums, as the default store algo and/or other operations.""" - - def __init__(self, message, errors=None): - super().__init__(message) - self.errors = errors diff --git a/src/hashstore/filehashstore_exceptions.py b/src/hashstore/filehashstore_exceptions.py new file mode 100644 index 00000000..d36a9b73 --- /dev/null +++ b/src/hashstore/filehashstore_exceptions.py @@ -0,0 +1,105 @@ +class PidRefsFileNotFound(Exception): + """Custom exception thrown when verifying reference files and a pid refs + file is not found.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors + + +class CidRefsFileNotFound(Exception): + """Custom exception thrown when verifying reference files and a cid refs + file is not found.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors + + +class PidRefsContentError(Exception): + """Custom exception thrown when verifying reference files and a pid refs + file does not contain the cid that it is expected.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors + + +class CidRefsContentError(Exception): + """Custom exception thrown when verifying reference files and a cid refs + 
file does not have a pid that is expected to be found.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors + + +class PidAlreadyExistsError(Exception): + """Custom exception thrown when a client calls 'tag_object' and the pid + that is being tagged is already accounted for (has a pid refs file and + is found in the cid refs file).""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors + + +class PidRefsDoesNotExist(Exception): + """Custom exception thrown when a pid refs file does not exist.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors + + +class CidRefsDoesNotExist(Exception): + """Custom exception thrown when a cid refs file does not exist.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors + + +class RefsFileExistsButCidObjMissing(Exception): + """Custom exception thrown when pid and cid refs file exists, but the + cid object does not.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors + + +class PidNotFoundInCidRefsFile(Exception): + """Custom exception thrown when pid reference file exists with a cid, but + the respective cid reference file does not contain the pid.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors + + +class NonMatchingObjSize(Exception): + """Custom exception thrown when verifying an object and the expected file size + does not match what has been calculated.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors + + +class NonMatchingChecksum(Exception): + """Custom exception thrown when verifying an object and the expected checksum + does not match what has been calculated.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors + + +class 
UnsupportedAlgorithm(Exception): + """Custom exception thrown when a given algorithm is not supported in HashStore for + calculating hashes/checksums, as the default store algo and/or other operations.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index c1e297ba..58972df7 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -4,8 +4,8 @@ import os from pathlib import Path import pytest -from hashstore.filehashstore import ( - FileHashStore, +from hashstore.filehashstore import FileHashStore +from hashstore.filehashstore_exceptions import ( NonMatchingChecksum, NonMatchingObjSize, UnsupportedAlgorithm, diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 7d3b558c..c777e880 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -9,7 +9,7 @@ import time import pytest -from hashstore.filehashstore import ( +from hashstore.filehashstore_exceptions import ( CidRefsDoesNotExist, NonMatchingChecksum, NonMatchingObjSize, diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index e9f6c92d..a1664bc5 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -4,7 +4,7 @@ import shutil import pytest -from hashstore.filehashstore import ( +from hashstore.filehashstore_exceptions import ( CidRefsContentError, CidRefsFileNotFound, NonMatchingChecksum, From 1adadc548581f892ab79dddd56b1dfda7a2140ce Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 17 Jun 2024 11:45:56 -0700 Subject: [PATCH 266/420] Reorganize 'filehashstore_exceptions' module and add missing class module docstring --- src/hashstore/filehashstore_exceptions.py | 49 ++++++++++++----------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/src/hashstore/filehashstore_exceptions.py 
b/src/hashstore/filehashstore_exceptions.py index d36a9b73..c81f9c55 100644 --- a/src/hashstore/filehashstore_exceptions.py +++ b/src/hashstore/filehashstore_exceptions.py @@ -1,6 +1,9 @@ -class PidRefsFileNotFound(Exception): - """Custom exception thrown when verifying reference files and a pid refs - file is not found.""" +"""FileHashStore custom exception module.""" + + +class CidRefsContentError(Exception): + """Custom exception thrown when verifying reference files and a cid refs + file does not have a pid that is expected to be found.""" def __init__(self, message, errors=None): super().__init__(message) @@ -16,6 +19,14 @@ def __init__(self, message, errors=None): self.errors = errors +class CidRefsDoesNotExist(Exception): + """Custom exception thrown when a cid refs file does not exist.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors + + class PidRefsContentError(Exception): """Custom exception thrown when verifying reference files and a pid refs file does not contain the cid that it is expected.""" @@ -25,9 +36,9 @@ def __init__(self, message, errors=None): self.errors = errors -class CidRefsContentError(Exception): - """Custom exception thrown when verifying reference files and a cid refs - file does not have a pid that is expected to be found.""" +class PidRefsFileNotFound(Exception): + """Custom exception thrown when verifying reference files and a pid refs + file is not found.""" def __init__(self, message, errors=None): super().__init__(message) @@ -52,23 +63,6 @@ def __init__(self, message, errors=None): self.errors = errors -class CidRefsDoesNotExist(Exception): - """Custom exception thrown when a cid refs file does not exist.""" - - def __init__(self, message, errors=None): - super().__init__(message) - self.errors = errors - - -class RefsFileExistsButCidObjMissing(Exception): - """Custom exception thrown when pid and cid refs file exists, but the - cid object does not.""" - - def __init__(self, 
message, errors=None): - super().__init__(message) - self.errors = errors - - class PidNotFoundInCidRefsFile(Exception): """Custom exception thrown when pid reference file exists with a cid, but the respective cid reference file does not contain the pid.""" @@ -96,6 +90,15 @@ def __init__(self, message, errors=None): self.errors = errors +class RefsFileExistsButCidObjMissing(Exception): + """Custom exception thrown when pid and cid refs file exists, but the + cid object does not.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors + + class UnsupportedAlgorithm(Exception): """Custom exception thrown when a given algorithm is not supported in HashStore for calculating hashes/checksums, as the default store algo and/or other operations.""" From 777be91f1656274438fd97ce852f3373263011aa Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 17 Jun 2024 12:36:30 -0700 Subject: [PATCH 267/420] Cleanup 'test_filehashstore_interface' module and add pytest --- tests/test_filehashstore_interface.py | 44 +++++++++++++++++++-------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index c777e880..2fb4c80b 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -184,7 +184,7 @@ def test_store_object_additional_algorithm_invalid(store): def test_store_object_additional_algorithm_hyphen_uppercase(pids, store): - """Test store object formats an additional algorithm in uppercase.""" + """Test store object accepts an additional algo that's supported in uppercase.""" test_dir = "tests/testdata/" entity = "objects" pid = "jtao.1700.1" @@ -197,7 +197,7 @@ def test_store_object_additional_algorithm_hyphen_uppercase(pids, store): def test_store_object_additional_algorithm_hyphen_lowercase(pids, store): - """Test store object formats an with additional algorithm in lowercase.""" + """Test store object accepts an 
with additional algo that's supported in lowercase.""" test_dir = "tests/testdata/" entity = "objects" pid = "jtao.1700.1" @@ -213,7 +213,7 @@ def test_store_object_additional_algorithm_hyphen_lowercase(pids, store): def test_store_object_additional_algorithm_underscore(pids, store): - """Test store object with formats an additional algorithm with underscore.""" + """Test store object accepts an additional algo that's supported with underscore.""" test_dir = "tests/testdata/" entity = "objects" pid = "jtao.1700.1" @@ -269,7 +269,7 @@ def test_store_object_checksum_correct_and_additional_algo(store): def test_store_object_checksum_correct_and_additional_algo_duplicate(store): - """Test store object does not throw exception with duplicate algorithms.""" + """Test store object does not throw exception with duplicate algorithms (de-dupes).""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -407,8 +407,8 @@ def test_store_object_duplicate_object_references_file_count(store): def test_store_object_duplicate_object_references_file_content(pids, store): - """Test that storing duplicate object but different pid creates the expected - amount of reference files.""" + """Test that storing duplicate object but different pid updates the cid refs file + with the correct amount of pids.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -429,8 +429,8 @@ def test_store_object_duplicate_object_references_file_content(pids, store): def test_store_object_duplicate_raises_error_with_bad_validation_data(pids, store): - """Test store duplicate object throws ValueError when object exists - but the data to validate against is incorrect.""" + """Test store duplicate object throws exception when the data to validate against + is incorrect.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -460,7 +460,7 @@ def test_store_object_with_obj_file_size(store, pids): def 
test_store_object_with_obj_file_size_incorrect(store, pids): - """Test store object throws exception with incorrect file sizes.""" + """Test store object throws exception with incorrect file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): obj_file_size = 1234 @@ -470,7 +470,7 @@ def test_store_object_with_obj_file_size_incorrect(store, pids): def test_store_object_with_obj_file_size_non_integer(store, pids): - """Test store object throws exception with a non integer value (ex. a stirng) + """Test store object throws exception with a non integer value (ex. a string) as the file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): @@ -561,8 +561,8 @@ def store_object_wrapper(obj_pid, obj_path): with open(cid_refs_path, "r", encoding="utf8") as ref_file: # Confirm that pid is not currently already tagged for pid in ref_file: - number_of_pids_reffed += 1 - assert pid.strip() in pid_list + if pid.strip() in pid_list: + number_of_pids_reffed += 1 assert number_of_pids_reffed == 6 @@ -1102,7 +1102,7 @@ def test_delete_object_metadata_deleted(pids, store): assert store._count("metadata") == 0 -def test_delete_object_refs_files_deleted(pids, store): +def test_delete_object_all_refs_files_deleted(pids, store): """Test delete_object successfully deletes refs files.""" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" @@ -1238,6 +1238,24 @@ def test_delete_metadata_one_pid_multiple_metadata_documents(store): assert store._count(entity) == 0 +def test_delete_metadata_specific_pid_multiple_metadata_documents(store): + """Test delete_metadata for a pid with multiple metadata documents deletes + only the specified metadata file.""" + test_dir = "tests/testdata/" + entity = "metadata" + pid = "jtao.1700.1" + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + format_id = "http://ns.dataone.org/service/types/v2.0" + format_id3 = "http://ns.dataone.org/service/types/v3.0" + format_id4 = 
"http://ns.dataone.org/service/types/v4.0" + _metadata_cid = store.store_metadata(pid, syspath, format_id) + _metadata_cid3 = store.store_metadata(pid, syspath, format_id3) + _metadata_cid4 = store.store_metadata(pid, syspath, format_id4) + store.delete_metadata(pid, format_id4) + assert store._count(entity) == 2 + + def test_delete_metadata_does_not_exist(pids, store): """Test delete_metadata does not throw exception when called to delete metadata that does not exist.""" From 18cb8bfa91448edc5c851761e4c58b9c857b2ef3 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 24 Jun 2024 09:54:37 -0700 Subject: [PATCH 268/420] Update README.md and filehashstore class docstring for accuracy --- README.md | 4 ++-- src/hashstore/filehashstore.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index b5b33d0f..be68594d 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ - Contact us: support@dataone.org - [DataONE discussions](https://github.com/DataONEorg/dataone/discussions) -HashStore is a server-side python package implementing a content-based identifier file system for storing and accessing data and metadata for DataONE services. The package is used in DataONE system components that need direct, filesystem-based access to data objects, their system metadata, and extended metadata about the objects. This package is a core component of the [DataONE federation](https://dataone.org), and supports large-scale object storage for a variety of repositories, including the [KNB Data Repository](http://knb.ecoinformatics.org), the [NSF Arctic Data Center](https://arcticdata.io/catalog/), the [DataONE search service](https://search.dataone.org), and other repositories. +HashStore is a server-side python package providing persistent file-based storage using content hashes to de-duplicate data for storing and accessing data and metadata for DataONE services. 
The package is used in DataONE system components that need direct, filesystem-based access to data objects, their system metadata, and extended metadata about the objects. This package is a core component of the [DataONE federation](https://dataone.org), and supports large-scale object storage for a variety of repositories, including the [KNB Data Repository](http://knb.ecoinformatics.org), the [NSF Arctic Data Center](https://arcticdata.io/catalog/), the [DataONE search service](https://search.dataone.org), and other repositories. DataONE in general, and HashStore in particular, are open source, community projects. We [welcome contributions](https://github.com/DataONEorg/hashstore/blob/main/CONTRIBUTING.md) in many forms, including code, graphics, documentation, bug reports, testing, etc. Use the [DataONE discussions](https://github.com/DataONEorg/dataone/discussions) to discuss these contributions with us. @@ -18,7 +18,7 @@ Documentation is a work in progress, and can be found on the [Metacat repository ## HashStore Overview -HashStore is a content-addressable file management system that utilizes the content identifier of an object to address files. The system stores objects, references (refs) and metadata in its respective directories and provides an API for interacting with the store. HashStore storage classes (like `FileHashStore`) must implement the HashStore interface to ensure the expected usage of HashStore. +HashStore is an object storage system that stores data objects based on the their content identifiers. The system stores objects, references (refs) and metadata in its respective directories and provides a content identifier-based API for interacting with the HashStore. HashStore storage classes (like `FileHashStore`) must implement the HashStore interface to ensure the expected usage of HashStore. 
###### Public API Methods - store_object diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index f701322e..ac9d471e 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -31,9 +31,10 @@ class FileHashStore(HashStore): - """FileHashStore is a content-addressable file manager based on Derrick - Gilland's 'hashfs' library. It supports the storage of objects on disk using - a content identifier to address files. + """FileHashStore is an object storage system that was extended from Derrick Gilland's + 'hashfs' library. It supports the storage of objects on disk using a content identifier + to address files (data objects are de-duplicated) and provides a content identifier-based + API to interact with a HashStore. FileHashStore initializes using a given properties dictionary containing the required keys (see Args). Upon initialization, FileHashStore verifies the provided From 95e7baf0fd36925870fecc887705117e9eed356b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 24 Jun 2024 17:29:16 -0700 Subject: [PATCH 269/420] Update store directory used in tests to end with 'hashstore' to reduce confusion as to where a hashstore should exist --- tests/conftest.py | 5 +++-- tests/test_filehashstore.py | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 54af3542..86ec51f1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ """Pytest overall configuration file for fixtures""" + import pytest from hashstore.filehashstore import FileHashStore @@ -16,8 +17,8 @@ def pytest_addoption(parser): @pytest.fixture(name="props") def init_props(tmp_path): """Properties to initialize HashStore.""" - directory = tmp_path / "metacat" - directory.mkdir() + directory = tmp_path / "metacat" / "hashstore" + directory.mkdir(parents=True) hashstore_path = directory.as_posix() # Note, objects generated via tests are placed in a temporary folder # with the 'directory' 
parameter above appended diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 58972df7..ad8bdf3d 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -856,7 +856,7 @@ def test_get_store_path_object(store): # pylint: disable=W0212 path_objects = store._get_store_path("objects") path_objects_string = str(path_objects) - assert path_objects_string.endswith("/metacat/objects") + assert path_objects_string.endswith("/metacat/hashstore/objects") def test_get_store_path_metadata(store): @@ -864,7 +864,7 @@ def test_get_store_path_metadata(store): # pylint: disable=W0212 path_metadata = store._get_store_path("metadata") path_metadata_string = str(path_metadata) - assert path_metadata_string.endswith("/metacat/metadata") + assert path_metadata_string.endswith("/metacat/hashstore/metadata") def test_get_store_path_refs(store): @@ -872,7 +872,7 @@ def test_get_store_path_refs(store): # pylint: disable=W0212 path_metadata = store._get_store_path("refs") path_metadata_string = str(path_metadata) - assert path_metadata_string.endswith("/metacat/refs") + assert path_metadata_string.endswith("/metacat/hashstore/refs") def test_exists_object_with_object_metadata_id(pids, store): From f0274e45b43b81b8c62c75c10c37ab715357b74a Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 25 Jun 2024 08:25:30 -0700 Subject: [PATCH 270/420] Revise 'FileHashStore' init process to check for specific hashstore related subdirs instead of all subdirs before throwing an exception, and update pytests --- src/hashstore/filehashstore.py | 15 ++++++++++----- tests/test_filehashstore.py | 4 ++-- tests/test_hashstore.py | 5 +++-- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index ac9d471e..2fea64fb 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -355,14 +355,19 @@ def _verify_hashstore_properties(self, properties, prop_store_path): else: if 
os.path.exists(prop_store_path): # Check if HashStore exists and throw exception if found - if any(Path(prop_store_path).iterdir()): + subfolders = ["objects", "metadata", "refs"] + if any( + os.path.isdir(os.path.join(prop_store_path, sub)) + for sub in subfolders + ): exception_string = ( - "FileHashStore - HashStore directories and/or objects found at:" - + f" {prop_store_path} but missing configuration file at: " - + self.hashstore_configuration_yaml + "FileHashStore - Unable to initialize HashStore. `hashstore.yaml` is not" + + " present but conflicting HashStore directory exists. Please delete" + + " '/objects', '/metadata' and/or '/refs' at the store path or supply" + + " a new path." ) logging.critical(exception_string) - raise FileNotFoundError(exception_string) + raise RuntimeError(exception_string) def _validate_properties(self, properties): """Validate a properties dictionary by checking if it contains all the diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index ad8bdf3d..7987986a 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -116,7 +116,7 @@ def test_init_with_existing_hashstore_mismatched_config_metadata_ns(store): def test_init_with_existing_hashstore_missing_yaml(store, pids): - """Test init with existing store raises FileNotFoundError when hashstore.yaml + """Test init with existing store raises RuntimeError when hashstore.yaml not found but objects exist.""" test_dir = "tests/testdata/" for pid in pids.keys(): @@ -130,7 +130,7 @@ def test_init_with_existing_hashstore_missing_yaml(store, pids): "store_algorithm": "SHA-256", "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", } - with pytest.raises(FileNotFoundError): + with pytest.raises(RuntimeError): FileHashStore(properties) diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index eb79caa8..baf74dbb 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -1,4 +1,5 @@ """Test module for 
HashStore's HashStoreFactory and ObjectMetadata class.""" + import os import pytest from hashstore.hashstore import ObjectMetadata, HashStoreFactory @@ -49,7 +50,7 @@ def test_factory_get_hashstore_filehashstore_unsupported_algorithm(factory): class_name = "FileHashStore" properties = { - "store_path": os.getcwd() + "/metacat/test", + "store_path": os.getcwd() + "/metacat/hashstore", "store_depth": 3, "store_width": 2, "store_algorithm": "MD2", @@ -65,7 +66,7 @@ def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory) class_name = "FileHashStore" properties = { - "store_path": os.getcwd() + "/metacat/test", + "store_path": os.getcwd() + "/metacat/hashstore", "store_depth": 3, "store_width": 2, "store_algorithm": "dou_algo", From 47996d4d020a1dc827a14cd248c2440cf53130ed Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 25 Jun 2024 08:35:09 -0700 Subject: [PATCH 271/420] Add new pytests to check init with conflicting and non-conflicting directories, and update README.md --- README.md | 2 +- tests/test_hashstore.py | 82 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index be68594d..f739867d 100644 --- a/README.md +++ b/README.md @@ -265,7 +265,7 @@ How to use HashStore client (command line app) # Step 0: Install hashstore via poetry to create an executable script $ poetry install -# Step 1: Create a HashStore +# Step 1: Create a HashStore at your desired store path (ex. 
/var/metacat/hashstore) $ hashstore /path/to/store/ -chs -dp=3 -wp=2 -ap=SHA-256 -nsp="http://www.ns.test/v1" # Get the checksum of a data object diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index baf74dbb..defc4811 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -76,6 +76,88 @@ def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory) factory.get_hashstore(module_name, class_name, properties) +def test_factory_get_hashstore_filehashstore_conflicting_obj_dir(factory, tmp_path): + """Check factory raises exception when existing `/objects` directory exists.""" + module_name = "hashstore.filehashstore" + class_name = "FileHashStore" + + directory = tmp_path / "douhs" / "objects" + directory.mkdir(parents=True) + douhspath = (tmp_path / "douhs").as_posix() + + properties = { + "store_path": douhspath, + "store_depth": 3, + "store_width": 2, + "store_algorithm": "SHA-256", + "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + } + with pytest.raises(RuntimeError): + factory.get_hashstore(module_name, class_name, properties) + + +def test_factory_get_hashstore_filehashstore_conflicting_metadata_dir( + factory, tmp_path +): + """Check factory raises exception when existing `/metadata` directory exists.""" + module_name = "hashstore.filehashstore" + class_name = "FileHashStore" + + directory = tmp_path / "douhs" / "metadata" + directory.mkdir(parents=True) + douhspath = (tmp_path / "douhs").as_posix() + + properties = { + "store_path": douhspath, + "store_depth": 3, + "store_width": 2, + "store_algorithm": "SHA-256", + "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + } + with pytest.raises(RuntimeError): + factory.get_hashstore(module_name, class_name, properties) + + +def test_factory_get_hashstore_filehashstore_conflicting_refs_dir(factory, tmp_path): + """Check factory raises exception when existing `/refs` directory exists.""" + module_name = 
"hashstore.filehashstore" + class_name = "FileHashStore" + + directory = tmp_path / "douhs" / "refs" + directory.mkdir(parents=True) + douhspath = (tmp_path / "douhs").as_posix() + + properties = { + "store_path": douhspath, + "store_depth": 3, + "store_width": 2, + "store_algorithm": "SHA-256", + "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + } + with pytest.raises(RuntimeError): + factory.get_hashstore(module_name, class_name, properties) + + +def test_factory_get_hashstore_filehashstore_nonconflicting_dir(factory, tmp_path): + """Check factory does not raise exception when existing non-conflicting directory exists.""" + module_name = "hashstore.filehashstore" + class_name = "FileHashStore" + + directory = tmp_path / "douhs" / "other" + directory.mkdir(parents=True) + douhspath = (tmp_path / "douhs").as_posix() + + properties = { + "store_path": douhspath, + "store_depth": 3, + "store_width": 2, + "store_algorithm": "SHA-256", + "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + } + + factory.get_hashstore(module_name, class_name, properties) + + def test_objectmetadata(): """Test ObjectMetadata class returns correct values via dot notation.""" pid = "hashstore" From 8d120e0079ac9d555dfd965a42542ad1d250df28 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 25 Jun 2024 08:37:21 -0700 Subject: [PATCH 272/420] Update old placeholder sysmeta namespace value in tests and examples with actual designated value --- README.md | 6 +-- tests/conftest.py | 2 +- tests/test_filehashstore.py | 28 +++++++------- tests/test_filehashstore_interface.py | 56 +++++++++++++-------------- tests/test_hashstore.py | 12 +++--- tests/test_hashstore_client.py | 6 +-- 6 files changed, 55 insertions(+), 55 deletions(-) diff --git a/README.md b/README.md index f739867d..ec3dde38 100644 --- a/README.md +++ b/README.md @@ -278,19 +278,19 @@ $ hashstore /path/to/store/ -findobject -pid=persistent_identifier $ hashstore /path/to/store/ -storeobject 
-pid=persistent_identifier -path=/path/to/object # Store a metadata object -$ hashstore /path/to/store/ -storemetadata -pid=persistent_identifier -path=/path/to/metadata/object -formatid=http://ns.dataone.org/service/types/v2.0 +$ hashstore /path/to/store/ -storemetadata -pid=persistent_identifier -path=/path/to/metadata/object -formatid=https://ns.dataone.org/service/types/v2.0#SystemMetadata # Retrieve a data object $ hashstore /path/to/store/ -retrieveobject -pid=persistent_identifier # Retrieve a metadata object -$ hashstore /path/to/store/ -retrievemetadata -pid=persistent_identifier -formatid=http://ns.dataone.org/service/types/v2.0 +$ hashstore /path/to/store/ -retrievemetadata -pid=persistent_identifier -formatid=https://ns.dataone.org/service/types/v2.0#SystemMetadata # Delete a data object $ hashstore /path/to/store/ -deleteobject -pid=persistent_identifier # Delete a metadata file -$ hashstore /path/to/store/ -deletemetadata -pid=persistent_identifier -formatid=http://ns.dataone.org/service/types/v2.0 +$ hashstore /path/to/store/ -deletemetadata -pid=persistent_identifier -formatid=https://ns.dataone.org/service/types/v2.0#SystemMetadata ``` ## License diff --git a/tests/conftest.py b/tests/conftest.py index 86ec51f1..27b1c8fa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -27,7 +27,7 @@ def init_props(tmp_path): "store_depth": 3, "store_width": 2, "store_algorithm": "SHA-256", - "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + "store_metadata_namespace": "https://ns.dataone.org/service/types/v2.0#SystemMetadata", } return properties diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 7987986a..6c8ff99c 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -35,7 +35,7 @@ def test_init_existing_store_incorrect_algorithm_format(store): "store_depth": 3, "store_width": 2, "store_algorithm": "sha256", - "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", 
+ "store_metadata_namespace": "https://ns.dataone.org/service/types/v2.0#SystemMetadata", } with pytest.raises(ValueError): FileHashStore(properties) @@ -48,7 +48,7 @@ def test_init_existing_store_correct_algorithm_format(store): "store_depth": 3, "store_width": 2, "store_algorithm": "SHA-256", - "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + "store_metadata_namespace": "https://ns.dataone.org/service/types/v2.0#SystemMetadata", } hashstore_instance = FileHashStore(properties) assert isinstance(hashstore_instance, FileHashStore) @@ -67,7 +67,7 @@ def test_init_with_existing_hashstore_mismatched_config_depth(store): "store_depth": 1, "store_width": 2, "store_algorithm": "SHA-256", - "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + "store_metadata_namespace": "https://ns.dataone.org/service/types/v2.0#SystemMetadata", } with pytest.raises(ValueError): FileHashStore(properties) @@ -81,7 +81,7 @@ def test_init_with_existing_hashstore_mismatched_config_width(store): "store_depth": 3, "store_width": 1, "store_algorithm": "SHA-256", - "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + "store_metadata_namespace": "https://ns.dataone.org/service/types/v2.0#SystemMetadata", } with pytest.raises(ValueError): FileHashStore(properties) @@ -95,7 +95,7 @@ def test_init_with_existing_hashstore_mismatched_config_algo(store): "store_depth": 3, "store_width": 1, "store_algorithm": "SHA-512", - "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + "store_metadata_namespace": "https://ns.dataone.org/service/types/v2.0#SystemMetadata", } with pytest.raises(ValueError): FileHashStore(properties) @@ -128,7 +128,7 @@ def test_init_with_existing_hashstore_missing_yaml(store, pids): "store_depth": 3, "store_width": 2, "store_algorithm": "SHA-256", - "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + "store_metadata_namespace": 
"https://ns.dataone.org/service/types/v2.0#SystemMetadata", } with pytest.raises(RuntimeError): FileHashStore(properties) @@ -144,7 +144,7 @@ def test_load_properties(store): assert hashstore_yaml_dict.get("store_algorithm") == "SHA-256" assert ( hashstore_yaml_dict.get("store_metadata_namespace") - == "http://ns.dataone.org/service/types/v2.0" + == "https://ns.dataone.org/service/types/v2.0#SystemMetadata" ) @@ -164,7 +164,7 @@ def test_validate_properties(store): "store_depth": 3, "store_width": 2, "store_algorithm": "SHA-256", - "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + "store_metadata_namespace": "https://ns.dataone.org/service/types/v2.0#SystemMetadata", } # pylint: disable=W0212 assert store._validate_properties(properties) @@ -635,7 +635,7 @@ def test_put_metadata_with_path(pids, store): """Test _put_metadata with path object for the path arg.""" entity = "metadata" test_dir = "tests/testdata/" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename @@ -648,7 +648,7 @@ def test_put_metadata_with_string(pids, store): """Test_put metadata with string for the path arg.""" entity = "metadata" test_dir = "tests/testdata/" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = str(Path(test_dir) / filename) @@ -660,7 +660,7 @@ def test_put_metadata_with_string(pids, store): def test_put_metadata_cid(pids, store): """Test put metadata returns correct id.""" test_dir = "tests/testdata/" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): metadata_document_name = store._computehash(pid + format_id) filename = 
pid.replace("/", "_") + ".xml" @@ -901,7 +901,7 @@ def test_exists_metadata_files_path(pids, store): """Test exists works as expected for metadata.""" test_dir = "tests/testdata/" entity = "metadata" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename @@ -1001,7 +1001,7 @@ def test_get_real_path_with_metadata_id(store, pids): """Test get_real_path returns absolute path given a metadata id.""" entity = "metadata" test_dir = "tests/testdata/" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename @@ -1068,7 +1068,7 @@ def test_resolve_path_objects(pids, store): def test_resolve_path_metadata(pids, store): """Confirm resolve path returns correct metadata path.""" test_dir = "tests/testdata/" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 2fb4c80b..cdee0e79 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -796,7 +796,7 @@ def test_store_metadata(pids, store): """Test store metadata.""" test_dir = "tests/testdata/" entity = "metadata" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename @@ -821,7 +821,7 @@ def test_store_metadata_one_pid_multiple_docs_correct_location(store): syspath = Path(test_dir) / filename 
metadata_directory = store._computehash(pid) rel_path = "/".join(store._shard(metadata_directory)) - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" format_id3 = "http://ns.dataone.org/service/types/v3.0" format_id4 = "http://ns.dataone.org/service/types/v4.0" metadata_cid = store.store_metadata(pid, syspath, format_id) @@ -842,7 +842,7 @@ def test_store_metadata_one_pid_multiple_docs_correct_location(store): def test_store_metadata_default_format_id(pids, store): """Test store metadata returns expected id when storing with default format_id.""" test_dir = "tests/testdata/" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename @@ -861,7 +861,7 @@ def test_store_metadata_files_string(pids, store): """Test store metadata with a string object to the metadata.""" test_dir = "tests/testdata/" entity = "metadata" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) @@ -874,7 +874,7 @@ def test_store_metadata_files_input_stream(pids, store): """Test store metadata with an input stream to metadata.""" test_dir = "tests/testdata/" entity = "metadata" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) @@ -887,7 +887,7 @@ def test_store_metadata_files_input_stream(pids, store): def test_store_metadata_pid_empty(store): """Test store metadata raises error with an empty string as the pid.""" test_dir = "tests/testdata/" - format_id = 
"http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" pid = "" filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) @@ -898,7 +898,7 @@ def test_store_metadata_pid_empty(store): def test_store_metadata_pid_empty_spaces(store): """Test store metadata raises error with empty spaces as the pid.""" test_dir = "tests/testdata/" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" pid = " " filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) @@ -920,7 +920,7 @@ def test_store_metadata_pid_format_id_spaces(store): def test_store_metadata_metadata_empty(store): """Test store metadata raises error with empty spaces as the metadata path.""" pid = "jtao.1700.1" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" syspath_string = " " with pytest.raises(TypeError): store.store_metadata(pid, syspath_string, format_id) @@ -929,7 +929,7 @@ def test_store_metadata_metadata_empty(store): def test_store_metadata_metadata_none(store): """Test store metadata raises error with empty None metadata path.""" pid = "jtao.1700.1" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" syspath_string = None with pytest.raises(TypeError): store.store_metadata(pid, syspath_string, format_id) @@ -938,7 +938,7 @@ def test_store_metadata_metadata_none(store): def test_store_metadata_metadata_path(pids, store): """Test store metadata returns expected path to metadata document.""" test_dir = "tests/testdata/" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" 
@@ -953,7 +953,7 @@ def test_store_metadata_thread_lock(store): """Test store metadata thread lock.""" test_dir = "tests/testdata/" entity = "metadata" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" pid = "jtao.1700.1" path = test_dir + pid filename = pid + ".xml" @@ -979,7 +979,7 @@ def test_store_metadata_thread_lock(store): def test_retrieve_object(pids, store): """Test retrieve_object returns a stream to the correct object data.""" test_dir = "tests/testdata/" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" @@ -1010,7 +1010,7 @@ def test_retrieve_object_pid_invalid(store): def test_retrieve_metadata(store): """Test retrieve_metadata returns a stream to the correct metadata.""" test_dir = "tests/testdata/" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" pid = "jtao.1700.1" path = test_dir + pid filename = pid + ".xml" @@ -1042,7 +1042,7 @@ def test_retrieve_metadata_default_format_id(store): def test_retrieve_metadata_bytes_pid_invalid(store): """Test retrieve_metadata raises error when supplied with bad pid.""" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" pid = "jtao.1700.1" pid_does_not_exist = pid + "test" with pytest.raises(ValueError): @@ -1051,7 +1051,7 @@ def test_retrieve_metadata_bytes_pid_invalid(store): def test_retrieve_metadata_bytes_pid_empty(store): """Test retrieve_metadata raises error when supplied with empty pid.""" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" pid = " " with pytest.raises(ValueError): store.retrieve_metadata(pid, 
format_id) @@ -1076,7 +1076,7 @@ def test_retrieve_metadata_format_id_empty_spaces(store): def test_delete_object_object_deleted(pids, store): """Test delete_object successfully deletes object.""" test_dir = "tests/testdata/" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" @@ -1091,7 +1091,7 @@ def test_delete_object_metadata_deleted(pids, store): """Test delete_object successfully deletes relevant metadata files and refs files.""" test_dir = "tests/testdata/" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" @@ -1105,7 +1105,7 @@ def test_delete_object_metadata_deleted(pids, store): def test_delete_object_all_refs_files_deleted(pids, store): """Test delete_object successfully deletes refs files.""" test_dir = "tests/testdata/" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" @@ -1120,7 +1120,7 @@ def test_delete_object_all_refs_files_deleted(pids, store): def test_delete_object_pid_refs_file_deleted(pids, store): """Test delete_object deletes the associated pid refs file for the object.""" test_dir = "tests/testdata/" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" @@ -1135,7 +1135,7 @@ def test_delete_object_pid_refs_file_deleted(pids, store): def test_delete_object_cid_refs_file_deleted(pids, store): """Test delete_object 
deletes the associated cid refs file for the object.""" test_dir = "tests/testdata/" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" @@ -1178,7 +1178,7 @@ def test_delete_object_idtype_cid_refs_file_exists(pids, store): """Test delete_object does not delete object if a cid refs file still exists.""" test_dir = "tests/testdata/" entity = "objects" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" @@ -1209,7 +1209,7 @@ def test_delete_metadata(pids, store): """Test delete_metadata successfully deletes metadata.""" test_dir = "tests/testdata/" entity = "metadata" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" @@ -1228,7 +1228,7 @@ def test_delete_metadata_one_pid_multiple_metadata_documents(store): pid = "jtao.1700.1" filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" format_id3 = "http://ns.dataone.org/service/types/v3.0" format_id4 = "http://ns.dataone.org/service/types/v4.0" _metadata_cid = store.store_metadata(pid, syspath, format_id) @@ -1246,7 +1246,7 @@ def test_delete_metadata_specific_pid_multiple_metadata_documents(store): pid = "jtao.1700.1" filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" 
format_id3 = "http://ns.dataone.org/service/types/v3.0" format_id4 = "http://ns.dataone.org/service/types/v4.0" _metadata_cid = store.store_metadata(pid, syspath, format_id) @@ -1259,7 +1259,7 @@ def test_delete_metadata_specific_pid_multiple_metadata_documents(store): def test_delete_metadata_does_not_exist(pids, store): """Test delete_metadata does not throw exception when called to delete metadata that does not exist.""" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): store.delete_metadata(pid, format_id) @@ -1280,7 +1280,7 @@ def test_delete_metadata_default_format_id(store, pids): def test_delete_metadata_pid_empty(store): """Test delete_object raises error when empty pid supplied.""" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" pid = " " with pytest.raises(ValueError): store.delete_metadata(pid, format_id) @@ -1288,7 +1288,7 @@ def test_delete_metadata_pid_empty(store): def test_delete_metadata_pid_none(store): """Test delete_object raises error when pid is 'None'.""" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" pid = None with pytest.raises(ValueError): store.delete_metadata(pid, format_id) @@ -1305,7 +1305,7 @@ def test_delete_metadata_format_id_empty(store): def test_get_hex_digest(store): """Test get_hex_digest for expected value.""" test_dir = "tests/testdata/" - format_id = "http://ns.dataone.org/service/types/v2.0" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" pid = "jtao.1700.1" path = test_dir + pid filename = pid + ".xml" diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index defc4811..bb2c1ac5 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -54,7 +54,7 @@ def 
test_factory_get_hashstore_filehashstore_unsupported_algorithm(factory): "store_depth": 3, "store_width": 2, "store_algorithm": "MD2", - "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + "store_metadata_namespace": "https://ns.dataone.org/service/types/v2.0#SystemMetadata", } with pytest.raises(ValueError): factory.get_hashstore(module_name, class_name, properties) @@ -70,7 +70,7 @@ def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory) "store_depth": 3, "store_width": 2, "store_algorithm": "dou_algo", - "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + "store_metadata_namespace": "https://ns.dataone.org/service/types/v2.0#SystemMetadata", } with pytest.raises(ValueError): factory.get_hashstore(module_name, class_name, properties) @@ -90,7 +90,7 @@ def test_factory_get_hashstore_filehashstore_conflicting_obj_dir(factory, tmp_pa "store_depth": 3, "store_width": 2, "store_algorithm": "SHA-256", - "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + "store_metadata_namespace": "https://ns.dataone.org/service/types/v2.0#SystemMetadata", } with pytest.raises(RuntimeError): factory.get_hashstore(module_name, class_name, properties) @@ -112,7 +112,7 @@ def test_factory_get_hashstore_filehashstore_conflicting_metadata_dir( "store_depth": 3, "store_width": 2, "store_algorithm": "SHA-256", - "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + "store_metadata_namespace": "https://ns.dataone.org/service/types/v2.0#SystemMetadata", } with pytest.raises(RuntimeError): factory.get_hashstore(module_name, class_name, properties) @@ -132,7 +132,7 @@ def test_factory_get_hashstore_filehashstore_conflicting_refs_dir(factory, tmp_p "store_depth": 3, "store_width": 2, "store_algorithm": "SHA-256", - "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + "store_metadata_namespace": "https://ns.dataone.org/service/types/v2.0#SystemMetadata", } with 
pytest.raises(RuntimeError): factory.get_hashstore(module_name, class_name, properties) @@ -152,7 +152,7 @@ def test_factory_get_hashstore_filehashstore_nonconflicting_dir(factory, tmp_pat "store_depth": 3, "store_width": 2, "store_algorithm": "SHA-256", - "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + "store_metadata_namespace": "https://ns.dataone.org/service/types/v2.0#SystemMetadata", } factory.get_hashstore(module_name, class_name, properties) diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index f6db7ed5..48c23663 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -142,7 +142,7 @@ def test_store_metadata(capsys, store, pids): """Test storing metadata to HashStore through client.""" client_directory = os.getcwd() + "/src/hashstore" test_dir = "tests/testdata/" - namespace = "http://ns.dataone.org/service/types/v2.0" + namespace = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" entity = "metadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" @@ -223,7 +223,7 @@ def test_retrieve_metadata(capsys, pids, store): """Test retrieving metadata from a HashStore through client.""" client_directory = os.getcwd() + "/src/hashstore" test_dir = "tests/testdata/" - namespace = "http://ns.dataone.org/service/types/v2.0" + namespace = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename @@ -293,7 +293,7 @@ def test_delete_metadata(pids, store): """Test deleting metadata from a HashStore through client.""" client_directory = os.getcwd() + "/src/hashstore" test_dir = "tests/testdata/" - namespace = "http://ns.dataone.org/service/types/v2.0" + namespace = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename From 
3063944b94e5d3436ec2b76db705e63440f06fa4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 25 Jun 2024 08:49:08 -0700 Subject: [PATCH 273/420] Update 'hashstore' interface for 'find_object' --- src/hashstore/hashstore.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index a0940d52..038d9ec1 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -99,7 +99,12 @@ def find_object(self, pid): :param str pid: Authority-based or persistent identifier of the object. - :return: str - Content identifier of the object. + :return: obj_info_dict (dict): + - cid: content identifier + - cid_object_path: path to the object + - cid_refs_path: path to the cid refs file + - pid_refs_path: path to the pid refs file + - sysmeta_path: path to the sysmeta file """ raise NotImplementedError() From c3faaaa03ef1beaff263044366a3bb33f36e5f31 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 25 Jun 2024 09:23:11 -0700 Subject: [PATCH 274/420] Refactor 'find_object' to return an object_info_dict containing the new required keys-values and update pytests --- src/hashstore/filehashstore.py | 43 ++++++++++++++++++++------- tests/test_filehashstore_interface.py | 6 ++-- 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 2fea64fb..2f825ed3 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -774,14 +774,33 @@ def find_object(self, pid): logging.error(err_msg) raise RefsFileExistsButCidObjMissing(err_msg) else: - return pid_refs_cid + sysmeta_doc_name = self._computehash(pid + self.sysmeta_ns) + metadata_directory = self._computehash(pid) + metadata_rel_path = "/".join(self._shard(metadata_directory)) + sysmeta_full_path = ( + self._get_store_path("metadata") + / metadata_rel_path + / sysmeta_doc_name + ) + obj_info_dict = { + "cid": pid_refs_cid, + "cid_object_path": 
self._resolve_path( + "objects", pid_refs_cid + ), + "cid_refs_path": cid_ref_abs_path, + "pid_refs_path": pid_ref_abs_path, + "sysmeta_path": ( + sysmeta_full_path + if os.path.isdir(sysmeta_full_path) + else "Does not exist." + ), + } + return obj_info_dict else: # If not, it is an orphan pid refs file err_msg = ( "FileHashStore - find_object: pid refs file exists with cid: " - + pid_refs_cid - + " for pid: " - + pid + + f"{pid_refs_cid} for pid: {pid}" + f", but is missing from cid refs file: {cid_ref_abs_path}" ) logging.error(err_msg) @@ -868,7 +887,8 @@ def retrieve_object(self, pid): ) self._check_string(pid, "pid", "retrieve_object") - object_cid = self.find_object(pid) + object_info_dict = self.find_object(pid) + object_cid = object_info_dict.get("cid") entity = "objects" if object_cid: @@ -1008,7 +1028,8 @@ def delete_object(self, ab_id, id_type=None): # `find_object` which will throw custom exceptions if there is an issue with # the reference files, which help us determine the path to proceed with. 
try: - cid = self.find_object(pid) + object_info_dict = self.find_object(pid) + cid = object_info_dict.get("cid") # Proceed with next steps - cid has been retrieved without any issues # We must synchronize here based on the `cid` because multiple threads may @@ -1038,8 +1059,8 @@ def delete_object(self, ab_id, id_type=None): self.reference_locked_cids.append(cid) try: - cid_ref_abs_path = self._resolve_path("cid", cid) - pid_ref_abs_path = self._resolve_path("pid", pid) + cid_ref_abs_path = object_info_dict.get("cid_refs_path") + pid_ref_abs_path = object_info_dict.get("pid_refs_path") # Add pid refs file to be permanently deleted objects_to_delete.append( self._rename_path_for_deletion(pid_ref_abs_path) @@ -1056,7 +1077,7 @@ def delete_object(self, ab_id, id_type=None): objects_to_delete.append( self._rename_path_for_deletion(cid_ref_abs_path) ) - obj_real_path = self._resolve_path("objects", cid) + obj_real_path = object_info_dict.get("cid_object_path") objects_to_delete.append( self._rename_path_for_deletion(obj_real_path) ) @@ -1095,7 +1116,7 @@ def delete_object(self, ab_id, id_type=None): warn_msg = ( "FileHashStore - delete_object: pid refs file does not exist for pid: " + ab_id - + ". Skipping deletion request." + + ". Skipping object deletion. Deleting pid metadata documents." 
) logging.warning(warn_msg) @@ -1307,7 +1328,7 @@ def get_hex_digest(self, pid, algorithm): entity = "objects" algorithm = self._clean_algorithm(algorithm) - object_cid = self.find_object(pid) + object_cid = self.find_object(pid).get("cid") if not self._exists(entity, object_cid): exception_string = ( f"FileHashStore - get_hex_digest: No object found for pid: {pid}" diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index cdee0e79..578511ce 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -718,8 +718,8 @@ def test_find_object(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(pid, path) - cid = store.find_object(pid) - assert cid == object_metadata.hex_digests.get("sha256") + obj_info_dict = store.find_object(pid) + assert obj_info_dict.get("cid") == object_metadata.hex_digests.get("sha256") def test_find_object_refs_exist_but_obj_not_found(pids, store): @@ -729,7 +729,7 @@ def test_find_object_refs_exist_but_obj_not_found(pids, store): path = test_dir + pid.replace("/", "_") store.store_object(pid, path) - cid = store.find_object(pid) + cid = store.find_object(pid).get("cid") obj_path = store._resolve_path("objects", cid) os.remove(obj_path) From 4dfb4f5627d7a65a4c3a32d9550fde704d25928b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 25 Jun 2024 09:38:17 -0700 Subject: [PATCH 275/420] Update 'hashstoreclient's 'find_object' output and update pytest --- src/hashstore/hashstoreclient.py | 13 +++++++++++-- tests/test_hashstore_client.py | 15 ++++++++++++++- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/src/hashstore/hashstoreclient.py b/src/hashstore/hashstoreclient.py index 5a77c5aa..4a4fefe3 100644 --- a/src/hashstore/hashstoreclient.py +++ b/src/hashstore/hashstoreclient.py @@ -850,8 +850,17 @@ def main(): if pid is None: raise ValueError("'-pid' option is required") # Find the content 
identifier of the object - cid = hashstore_c.hashstore.find_object(pid) - print(f"Content identifier: {cid}") + object_info_dict = hashstore_c.hashstore.find_object(pid) + cid = object_info_dict.get("cid") + cid_object_path = object_info_dict.get("cid") + cid_refs_path = object_info_dict.get("cid_refs_path") + pid_refs_path = object_info_dict.get("pid_refs_path") + sysmeta_path = object_info_dict.get("sysmeta_path") + print(f"Content identifier:\n{cid}") + print(f"Cid Object Path:\n:{cid_object_path}") + print(f"Cid Reference File Path:\n:{cid_refs_path}") + print(f"Pid Reference File Path:\n:{pid_refs_path}") + print(f"Sysmeta Path:\n:{sysmeta_path}") elif getattr(args, "client_storeobject"): if pid is None: diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 48c23663..ed93ec9a 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -106,8 +106,21 @@ def test_find_object(capsys, store, pids): sys.argv = chs_args hashstoreclient.main() + object_info_dict = store.find_object(pid) + cid = object_info_dict.get("cid") + cid_object_path = object_info_dict.get("cid") + cid_refs_path = object_info_dict.get("cid_refs_path") + pid_refs_path = object_info_dict.get("pid_refs_path") + sysmeta_path = object_info_dict.get("sysmeta_path") + capsystext = capsys.readouterr().out - expected_output = f"Content identifier: {cid}\n" + expected_output = ( + f"Content identifier:\n{cid}\n" + + f"Cid Object Path:\n:{cid_object_path}\n" + + f"Cid Reference File Path:\n:{cid_refs_path}\n" + + f"Pid Reference File Path:\n:{pid_refs_path}\n" + + f"Sysmeta Path:\n:{sysmeta_path}\n" + ) assert capsystext == expected_output From bb292ef6e130d8cd1f8983107f57b26be40ec777 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 25 Jun 2024 10:07:50 -0700 Subject: [PATCH 276/420] Fix bug in 'hashstoreclient' where object_path was not referenced correctly and update pytest --- src/hashstore/hashstoreclient.py | 2 +- tests/test_hashstore_client.py | 
2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hashstore/hashstoreclient.py b/src/hashstore/hashstoreclient.py index 4a4fefe3..b02d764c 100644 --- a/src/hashstore/hashstoreclient.py +++ b/src/hashstore/hashstoreclient.py @@ -852,7 +852,7 @@ def main(): # Find the content identifier of the object object_info_dict = hashstore_c.hashstore.find_object(pid) cid = object_info_dict.get("cid") - cid_object_path = object_info_dict.get("cid") + cid_object_path = object_info_dict.get("cid_object_path") cid_refs_path = object_info_dict.get("cid_refs_path") pid_refs_path = object_info_dict.get("pid_refs_path") sysmeta_path = object_info_dict.get("sysmeta_path") diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index ed93ec9a..f46585d2 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -108,7 +108,7 @@ def test_find_object(capsys, store, pids): object_info_dict = store.find_object(pid) cid = object_info_dict.get("cid") - cid_object_path = object_info_dict.get("cid") + cid_object_path = object_info_dict.get("cid_object_path") cid_refs_path = object_info_dict.get("cid_refs_path") pid_refs_path = object_info_dict.get("pid_refs_path") sysmeta_path = object_info_dict.get("sysmeta_path") From 581bd8e5869c19e628d2525119c2ff85f758f965 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 25 Jun 2024 10:19:50 -0700 Subject: [PATCH 277/420] Add new client pytest for 'find_object' when sysmeta exists --- tests/test_hashstore_client.py | 51 +++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index f46585d2..44e471b8 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -80,7 +80,7 @@ def test_get_checksum(capsys, store, pids): assert capsystext == expected_output -def test_find_object(capsys, store, pids): +def test_find_object_sysmeta_does_not_exist(capsys, store, pids): 
"""Test find_object returns a content identifier if it exists.""" client_directory = os.getcwd() + "/src/hashstore" test_dir = "tests/testdata/" @@ -124,6 +124,55 @@ def test_find_object(capsys, store, pids): assert capsystext == expected_output +def test_find_object_sysmeta_exists(capsys, store, pids): + """Test find_object returns a content identifier if it exists.""" + client_directory = os.getcwd() + "/src/hashstore" + test_dir = "tests/testdata/" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + store.store_metadata(pid, syspath, format_id) + + client_module_path = f"{client_directory}/client.py" + test_store = store.root + find_object_opt = "-findobject" + client_pid_arg = f"-pid={pid}" + chs_args = [ + client_module_path, + test_store, + find_object_opt, + client_pid_arg, + ] + + # Add file path of HashStore to sys so modules can be discovered + sys.path.append(client_directory) + # Manually change sys args to simulate command line arguments + sys.argv = chs_args + hashstoreclient.main() + + object_info_dict = store.find_object(pid) + cid = object_info_dict.get("cid") + cid_object_path = object_info_dict.get("cid_object_path") + cid_refs_path = object_info_dict.get("cid_refs_path") + pid_refs_path = object_info_dict.get("pid_refs_path") + sysmeta_path = object_info_dict.get("sysmeta_path") + + capsystext = capsys.readouterr().out + expected_output = ( + f"Content identifier:\n{cid}\n" + + f"Cid Object Path:\n:{cid_object_path}\n" + + f"Cid Reference File Path:\n:{cid_refs_path}\n" + + f"Pid Reference File Path:\n:{pid_refs_path}\n" + + f"Sysmeta Path:\n:{sysmeta_path}\n" + ) + assert capsystext == expected_output + + def test_store_object(store, pids): """Test storing objects to HashStore through client.""" 
client_directory = os.getcwd() + "/src/hashstore" From a032caf4a92cdafed3129a41876101155f0b90b0 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 25 Jun 2024 10:46:56 -0700 Subject: [PATCH 278/420] Update new 'hashstoreclient' test docstrings --- tests/test_hashstore_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 44e471b8..4038decc 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -81,7 +81,7 @@ def test_get_checksum(capsys, store, pids): def test_find_object_sysmeta_does_not_exist(capsys, store, pids): - """Test find_object returns a content identifier if it exists.""" + """Test client's find_object prints the required values when sysmeta does not exist.""" client_directory = os.getcwd() + "/src/hashstore" test_dir = "tests/testdata/" for pid in pids.keys(): @@ -125,7 +125,7 @@ def test_find_object_sysmeta_does_not_exist(capsys, store, pids): def test_find_object_sysmeta_exists(capsys, store, pids): - """Test find_object returns a content identifier if it exists.""" + """Test client's find_object prints the required values when sysmeta exists""" client_directory = os.getcwd() + "/src/hashstore" test_dir = "tests/testdata/" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" From 1a6670759939eb84f89c341ee0fefa64a31463bf Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 25 Jun 2024 12:09:17 -0700 Subject: [PATCH 279/420] Revise 'tag_object' to throw new custom exception 'HashStoreRefsAlreadyExists' when called to tag a pid and cid where refs files already exist, and update pytest --- src/hashstore/filehashstore.py | 8 +++++++- src/hashstore/filehashstore_exceptions.py | 8 ++++++++ tests/test_filehashstore_references.py | 5 ++++- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 2f825ed3..3e201e77 100644 --- 
a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -18,6 +18,7 @@ CidRefsContentError, CidRefsDoesNotExist, CidRefsFileNotFound, + HashStoreRefsAlreadyExists, NonMatchingChecksum, NonMatchingObjSize, PidAlreadyExistsError, @@ -639,7 +640,12 @@ def tag_object(self, pid, cid): cid_refs_path, "Refs file already exists, verifying.", ) - return True + error_msg = ( + f"FileHashStore - tag_object: Object with cid: {cid}" + + f" already exists and is tagged with pid: {pid}" + ) + logging.error(error_msg) + raise HashStoreRefsAlreadyExists(error_msg) elif os.path.exists(pid_refs_path) and not os.path.exists(cid_refs_path): debug_msg = ( f"FileHashStore - tag_object: pid refs file exists ({pid_refs_path})" diff --git a/src/hashstore/filehashstore_exceptions.py b/src/hashstore/filehashstore_exceptions.py index c81f9c55..7556c3f4 100644 --- a/src/hashstore/filehashstore_exceptions.py +++ b/src/hashstore/filehashstore_exceptions.py @@ -99,6 +99,14 @@ def __init__(self, message, errors=None): self.errors = errors +class HashStoreRefsAlreadyExists(Exception): + """Custom exception thrown when called to tag an object that is already tagged appropriately.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors + + class UnsupportedAlgorithm(Exception): """Custom exception thrown when a given algorithm is not supported in HashStore for calculating hashes/checksums, as the default store algo and/or other operations.""" diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index a1664bc5..e6d7c0f7 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -7,6 +7,7 @@ from hashstore.filehashstore_exceptions import ( CidRefsContentError, CidRefsFileNotFound, + HashStoreRefsAlreadyExists, NonMatchingChecksum, NonMatchingObjSize, PidAlreadyExistsError, @@ -86,7 +87,9 @@ def test_tag_object_pid_refs_found_cid_refs_found(pids, store): 
object_metadata = store.store_object(None, path) cid = object_metadata.cid store.tag_object(pid, cid) - store.tag_object(pid, cid) + + with pytest.raises(HashStoreRefsAlreadyExists): + store.tag_object(pid, cid) cid_refs_file_path = store._resolve_path("cid", object_metadata.cid) line_count = 0 From dc5973ab24d45d6c170a39d194e734d35f83c3a5 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 25 Jun 2024 12:17:46 -0700 Subject: [PATCH 280/420] Refactor pytest 'test_store_object_duplicates_threads's 'store_object_wrapper' method to use try-except and assert the expected 'HashStoreRefsAlreadyExists' exception --- tests/test_filehashstore_interface.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 578511ce..8f2b994a 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -11,6 +11,7 @@ from hashstore.filehashstore_exceptions import ( CidRefsDoesNotExist, + HashStoreRefsAlreadyExists, NonMatchingChecksum, NonMatchingObjSize, PidNotFoundInCidRefsFile, @@ -498,7 +499,10 @@ def test_store_object_duplicates_threads(pids, store): entity = "objects" def store_object_wrapper(obj_pid, obj_path): - store.store_object(obj_pid, obj_path) # Call store_object inside the thread + try: + store.store_object(obj_pid, obj_path) # Call store_object inside the thread + except Exception as e: + assert type(e).__name__ == "HashStoreRefsAlreadyExists" thread1 = Thread(target=store_object_wrapper, args=(pid, path)) thread2 = Thread(target=store_object_wrapper, args=(pid, path)) From 04f7379f4b7ed1fd818674786ae997c8cddef036 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 25 Jun 2024 12:42:39 -0700 Subject: [PATCH 281/420] Cleanup 'test_filehashstore_interface' module --- tests/test_filehashstore_interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_filehashstore_interface.py 
b/tests/test_filehashstore_interface.py index 8f2b994a..08ecb053 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -11,7 +11,6 @@ from hashstore.filehashstore_exceptions import ( CidRefsDoesNotExist, - HashStoreRefsAlreadyExists, NonMatchingChecksum, NonMatchingObjSize, PidNotFoundInCidRefsFile, @@ -501,6 +500,7 @@ def test_store_object_duplicates_threads(pids, store): def store_object_wrapper(obj_pid, obj_path): try: store.store_object(obj_pid, obj_path) # Call store_object inside the thread + # pylint: disable=W0718 except Exception as e: assert type(e).__name__ == "HashStoreRefsAlreadyExists" From 8b1b24ad8d3f9c51835c880d83bfd131488f2a46 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 1 Jul 2024 12:44:24 -0700 Subject: [PATCH 282/420] Update 'pyproject.toml' to replace specific versions (^) with minimum-greater than versions (>=), add new classifiers and keyword section --- poetry.lock | 34 +++------------------------------- pyproject.toml | 31 +++++++++++++++++++++++-------- 2 files changed, 26 insertions(+), 39 deletions(-) diff --git a/poetry.lock b/poetry.lock index 85abf43e..538bc8be 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,10 +1,9 @@ -# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "asn1crypto" version = "1.5.1" description = "Fast ASN.1 parser and serializer with definitions for private keys, public keys, certificates, CRL, OCSP, CMS, PKCS#3, PKCS#7, PKCS#8, PKCS#12, PKCS#5, X.509 and TSP" -category = "dev" optional = false python-versions = "*" files = [ @@ -16,7 +15,6 @@ files = [ name = "astroid" version = "2.15.6" description = "An abstract syntax tree for Python with inference support." 
-category = "dev" optional = false python-versions = ">=3.7.2" files = [ @@ -36,7 +34,6 @@ wrapt = [ name = "black" version = "22.12.0" description = "The uncompromising code formatter." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -72,7 +69,6 @@ uvloop = ["uvloop (>=0.15.2)"] name = "click" version = "8.1.5" description = "Composable command line interface toolkit" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -87,7 +83,6 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." -category = "dev" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ @@ -99,7 +94,6 @@ files = [ name = "dill" version = "0.3.6" description = "serialize all of python" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -114,7 +108,6 @@ graph = ["objgraph (>=1.7.2)"] name = "exceptiongroup" version = "1.1.2" description = "Backport of PEP 654 (exception groups)" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -129,7 +122,6 @@ test = ["pytest (>=6)"] name = "iniconfig" version = "2.0.0" description = "brain-dead simple config-ini parsing" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -141,7 +133,6 @@ files = [ name = "isort" version = "5.12.0" description = "A Python utility / library to sort Python imports." -category = "dev" optional = false python-versions = ">=3.8.0" files = [ @@ -159,7 +150,6 @@ requirements-deprecated-finder = ["pip-api", "pipreqs"] name = "lazy-object-proxy" version = "1.9.0" description = "A fast and thorough lazy object proxy." 
-category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -205,7 +195,6 @@ files = [ name = "mccabe" version = "0.7.0" description = "McCabe checker, plugin for flake8" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -217,7 +206,6 @@ files = [ name = "mypy-extensions" version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." -category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -229,7 +217,6 @@ files = [ name = "packaging" version = "23.1" description = "Core utilities for Python packages" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -241,7 +228,6 @@ files = [ name = "pathlib" version = "1.0.1" description = "Object-oriented filesystem paths" -category = "main" optional = false python-versions = "*" files = [ @@ -253,7 +239,6 @@ files = [ name = "pathspec" version = "0.11.1" description = "Utility library for gitignore style pattern matching of file paths." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -265,7 +250,6 @@ files = [ name = "pg8000" version = "1.29.8" description = "PostgreSQL interface library" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -281,7 +265,6 @@ scramp = ">=1.4.3" name = "platformdirs" version = "3.8.1" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
-category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -297,7 +280,6 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest- name = "pluggy" version = "1.2.0" description = "plugin and hook calling mechanisms for python" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -313,7 +295,6 @@ testing = ["pytest", "pytest-benchmark"] name = "pylint" version = "2.17.4" description = "python code static checker" -category = "dev" optional = false python-versions = ">=3.7.2" files = [ @@ -343,7 +324,6 @@ testutils = ["gitpython (>3)"] name = "pytest" version = "7.4.0" description = "pytest: simple powerful testing with Python" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -366,7 +346,6 @@ testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "no name = "python-dateutil" version = "2.8.2" description = "Extensions to the standard Python datetime module" -category = "dev" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" files = [ @@ -381,7 +360,6 @@ six = ">=1.5" name = "pyyaml" version = "6.0" description = "YAML parser and emitter for Python" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -431,7 +409,6 @@ files = [ name = "scramp" version = "1.4.4" description = "An implementation of the SCRAM protocol." 
-category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -446,7 +423,6 @@ asn1crypto = ">=1.5.1" name = "six" version = "1.16.0" description = "Python 2 and 3 compatibility utilities" -category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -458,7 +434,6 @@ files = [ name = "tomli" version = "2.0.1" description = "A lil' TOML parser" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -470,7 +445,6 @@ files = [ name = "tomlkit" version = "0.11.8" description = "Style preserving TOML library" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -482,7 +456,6 @@ files = [ name = "typing-extensions" version = "4.7.1" description = "Backported and Experimental Type Hints for Python 3.7+" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -494,7 +467,6 @@ files = [ name = "wrapt" version = "1.15.0" description = "Module for decorators, wrappers and monkey patching." -category = "dev" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" files = [ @@ -577,5 +549,5 @@ files = [ [metadata] lock-version = "2.0" -python-versions = "^3.9" -content-hash = "6eeffad7b4becc9f995e576d3fc5db2a8640bfe60876d254a6b5854ddd0e283a" +python-versions = ">=3.9" +content-hash = "29d95a36557ed6e054de245ce01f8cc49055e3b478d030a891aa3ee57b981245" diff --git a/pyproject.toml b/pyproject.toml index c11c4a14..c75a1c8c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,20 +1,35 @@ [tool.poetry] name = "hashstore" version = "1.1.0" -description = "HashStore, a hash-based object store for data packages." +description = "HashStore, an object storage system using content identifiers." 
authors = ["Dou Mok ", "Matt Jones "] readme = "README.md" +keywords = ["filesystem", "object storage", "hashstore", "storage"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: System :: Filesystems" +] [tool.poetry.dependencies] -python = "^3.9" -pathlib = "^1.0.1" -pyyaml = "^6.0" +python = ">=3.9" +pathlib = ">=1.0.1" +pyyaml = ">=6.0" [tool.poetry.group.dev.dependencies] -pytest = "^7.2.0" -black = "^22.10.0" -pylint = "^2.17.4" -pg8000 = "^1.29.8" +pytest = ">=7.2.0" +black = ">=22.10.0" +pylint = ">=2.17.4" +pg8000 = ">=1.29.8" [tool.poetry.scripts] hashstore = "hashstore.hashstoreclient:main" From 1f87869f8929522812467ddcafadfa39aac2d254 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 1 Jul 2024 13:15:05 -0700 Subject: [PATCH 283/420] Update 'hashstore's '__init__.py' docstring for clarity --- src/hashstore/__init__.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/hashstore/__init__.py b/src/hashstore/__init__.py index 352bd3d3..14a40b8c 100644 --- a/src/hashstore/__init__.py +++ b/src/hashstore/__init__.py @@ -1,18 +1,19 @@ -"""HashStore is a hash-based object store for data packages. It uses -cryptographic hash functions to name files consistently. -HashStore creates a directory where objects and metadata are -stored using a hash value as the name. +"""HashStore is an object storage file system that provides persistent file-based +storage using content identifiers/hashes to de-duplicate data. 
HashStore is mainly focused on storing DataONE data package contents on a shared file system for simple and fast access by data management processes that function across a cluster environment. Some properties: - Data objects are immutable and never change -- Data objects are named using the SHA-256, base64-encoded hash of their contents +- Data objects are named using the base64-encoded hash of their contents (thus, a content-identifier) -- Metadata objects are stored with the formatId, a null character and its contents -- Metadata objects are named using the SHA-256 + formatId, base64-encoded hash of - their persistent identifier (PID) +- Metadata documents for a given identifier are stored in a directory structure + based on the base64-encoded hash of the identifier +- Metadata objects are named using the base64-encoded hash of the given identifier + + its respective format_id/namespace +- The relationships between data objects and metadata are managed with a reference + system. """ from hashstore.hashstore import HashStore, HashStoreFactory, ObjectMetadata From 9e53c5b196008244a4cbea793b57226668a99cfd Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 1 Jul 2024 13:16:39 -0700 Subject: [PATCH 284/420] Update 'README.md' author section and hashstore creation example --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ec3dde38..ed98a57e 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ ## HashStore: hash-based object storage for DataONE data packages -- **Author**: Dou Mok, Matthew Brooke, Jing Tao, Matthew B. Jones +- **Author**: Dou Mok, Matthew Brooke, Jing Tao, Jeanette Clarke, Ian Nesbitt, Matthew B. 
Jones - **License**: [Apache 2](http://opensource.org/licenses/Apache-2.0) - [Package source code on GitHub](https://github.com/DataONEorg/hashstore) - [**Submit Bugs and feature requests**](https://github.com/DataONEorg/hashstore/issues) @@ -55,7 +55,7 @@ properties = { "store_path": "/path/to/your/store", "store_depth": 3, "store_width": 2, - "store_algorithm": "sha256", + "store_algorithm": "SHA-256", "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", } From eeecc1c6f4a87658c3744f7a6578566200385d35 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 1 Jul 2024 13:19:28 -0700 Subject: [PATCH 285/420] Update 'README.md' and 'HashStoreFactory's 'get_hashstore' method for sysmeta format accuracy --- README.md | 2 +- src/hashstore/hashstore.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ed98a57e..338a54ce 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ properties = { "store_depth": 3, "store_width": 2, "store_algorithm": "SHA-256", - "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + "store_metadata_namespace": "https://ns.dataone.org/service/types/v2.0#SystemMetadata", } # Get HashStore from factory diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 038d9ec1..cf874864 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -216,8 +216,8 @@ def get_hashstore(module_name, class_name, properties=None): "store_path": "var/metacat", "store_depth": 3, "store_width": 2, - "store_algorithm": "sha256", - "store_sysmeta_namespace": "http://ns.dataone.org/service/types/v2.0" + "store_algorithm": "SHA-256", + "store_metadata_namespace": "https://ns.dataone.org/service/types/v2.0#SystemMetadata" } :return: HashStore - A hash store object based on the given `module_name` and `class_name`. 
From 17e2eff51d12797435086a2cd861dc0710daf85c Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 1 Jul 2024 13:24:42 -0700 Subject: [PATCH 286/420] Update 'CONTRIBUTING.md' to be more friendly by including emojis --- CONTRIBUTING.md | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2d962e28..59e9c99f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,16 +1,16 @@ # Contributing to HashStore -:tada: First off, thanks for contributing! :tada: +**🎉 First off, thanks for contributing! 🎉** -- [Types of contributions](#types-of-contributions) -- [Pull Requests](#pull-requests) -- [Development Workflow](#development-workflow) -- [Release process](#release-process) -- [Testing](#testing) -- [Code style](#code-style) -- [Contributor license agreement](#contributor-license-agreement) +- [✨ Types of Contributions](#types-of-contributions) +- [🌳 Pull Requests](#pull-requests) +- [🔀 Development Workflow](#development-workflow) +- [🚀 Release Process](#release-process) +- [🔬 Testing](#testing) +- [🎨 Code Style](#code-style) +- [📄 Contributor License Agreement](#contributor-license-agreement) -## Types of contributions +## ✨ Types of Contributions We welcome all types of contributions, including bug fixes, feature enhancements, bug reports, documentation, graphics, and many others. You might consider contributing by: @@ -29,7 +29,7 @@ made to increase the value of HashStore to the community. We strive to incorporate code, documentation, and other useful contributions quickly and efficiently while maintaining a high-quality software product. -## Pull Requests +## 🌳 Pull Requests We use the pull-request model for contributions. See [GitHub's help on pull-requests](https://help.github.com/articles/about-pull-requests/). 
In short: @@ -43,7 +43,7 @@ In short: - our team may request changes before we will approve the Pull Request, or we will make them for you - once the code is reviewed, our team will merge in your changes to `develop` for the next planned release -## Development Workflow +## 🔀 Development Workflow Development is managed through the git repository at https://github.com/DataONEorg/hashstore. The repository is organized into several branches, each with a specific purpose. @@ -104,7 +104,7 @@ gitGraph merge develop id: "11" tag: "v1.1.0" ``` -## Release process +## 🚀 Release Process 1. Our release process starts with integration testing in a `develop` branch. Once all changes that are desired in a release are merged into the `develop` branch, we run @@ -115,7 +115,7 @@ reflect the new release and the `develop` branch can be fast-forwarded to sync w start work on the next release. 3. Releases can be downloaded from the [GitHub releases page](https://github.com/DataONEorg/hashstore/releases). -## Testing +## 🔬 Testing **Unit and integration tests**. HashStore has a full suite of `pytest` tests in the `tests` subdirectory. Any new code developed should include a robust set of tests for each public @@ -127,7 +127,7 @@ or merging to `develop`. Tests are automatically run via GitHub Actions. Check the root `README.md` file for this GitHub Actions status badge and make sure it says "Passing": -## Code style +## 🎨 Code Style Code should be written to professional standards to enable clean, well-documented, readable, and maintainable software. While there has been significant variability @@ -135,7 +135,7 @@ in the coding styles applied historically, new contributions should strive for clean code formatting. We generally follow PEP8 guidelines for Python code formatting, typically enforced through the `black` code formatting package. 
-## Contributor license agreement +## 📄 Contributor License Agreement In order to clarify the intellectual property license granted with Contributions from any person or entity, you agree to From e45d038ddfc6bbf4f57e140697db91f8d69e45d0 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 2 Jul 2024 14:32:55 -0700 Subject: [PATCH 287/420] Fix incorrect module_name in README.md example for instantiating a HashStore --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ec3dde38..20a39410 100644 --- a/README.md +++ b/README.md @@ -55,12 +55,12 @@ properties = { "store_path": "/path/to/your/store", "store_depth": 3, "store_width": 2, - "store_algorithm": "sha256", - "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + "store_algorithm": "SHA-256", + "store_metadata_namespace": "https://ns.dataone.org/service/types/v2.0#SystemMetadata", } # Get HashStore from factory -module_name = "hashstore.filehashstore.filehashstore" +module_name = "hashstore.filehashstore" class_name = "FileHashStore" my_store = hashstore_factory.get_hashstore(module_name, class_name, properties) From 2ad42aac29aefc2feb92f07d18ba5d84b1e6fa87 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 2 Jul 2024 14:40:25 -0700 Subject: [PATCH 288/420] Move 'threading' and 'multiprocessing' variable declaration to '__init__' process --- src/hashstore/filehashstore.py | 48 +++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 3e201e77..e96ef690 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -73,31 +73,37 @@ class FileHashStore(HashStore): "blake2b", "blake2s", ] - # Variables to orchestrate parallelization - # Thread Synchronization - object_lock = threading.Lock() - object_condition = threading.Condition(object_lock) - object_locked_pids = [] - metadata_lock = threading.Lock() - 
metadata_condition = threading.Condition(metadata_lock) - metadata_locked_docs = [] - reference_lock = threading.Lock() - reference_condition = threading.Condition(reference_lock) - reference_locked_cids = [] - # Multiprocessing Synchronization - object_lock_mp = multiprocessing.Lock() - object_condition_mp = multiprocessing.Condition(object_lock_mp) - object_locked_pids_mp = multiprocessing.Manager().list() - metadata_lock_mp = multiprocessing.Lock() - metadata_condition_mp = multiprocessing.Condition(metadata_lock_mp) - metadata_locked_docs_mp = multiprocessing.Manager().list() - reference_lock_mp = multiprocessing.Lock() - reference_condition_mp = multiprocessing.Condition(reference_lock_mp) - reference_locked_cids_mp = multiprocessing.Manager().list() def __init__(self, properties=None): + # Variables to orchestrate parallelization # Check to see whether a multiprocessing or threading sync lock should be used self.use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" + if self.use_multiprocessing == "True": + # Create multiprocessing synchronization variables + self.object_lock_mp = multiprocessing.Lock() + self.object_condition_mp = multiprocessing.Condition(self.object_lock_mp) + self.object_locked_pids_mp = multiprocessing.Manager().list() + self.metadata_lock_mp = multiprocessing.Lock() + self.metadata_condition_mp = multiprocessing.Condition( + self.metadata_lock_mp + ) + self.metadata_locked_docs_mp = multiprocessing.Manager().list() + self.reference_lock_mp = multiprocessing.Lock() + self.reference_condition_mp = multiprocessing.Condition( + self.reference_lock_mp + ) + self.reference_locked_cids_mp = multiprocessing.Manager().list() + else: + # Create threading synchronization variables + self.object_lock = threading.Lock() + self.object_condition = threading.Condition(self.object_lock) + self.object_locked_pids = [] + self.metadata_lock = threading.Lock() + self.metadata_condition = threading.Condition(self.metadata_lock) + 
self.metadata_locked_docs = [] + self.reference_lock = threading.Lock() + self.reference_condition = threading.Condition(self.reference_lock) + self.reference_locked_cids = [] # Now check properties if properties: # Validate properties against existing configuration if present From 83d2421ded1415fa4d6fcbf2503372d834f0e9f0 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 2 Jul 2024 14:40:59 -0700 Subject: [PATCH 289/420] Update 'hashstoreclient' to set multiprocessing flag based on the test flag --- src/hashstore/hashstoreclient.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hashstore/hashstoreclient.py b/src/hashstore/hashstoreclient.py index b02d764c..94292957 100644 --- a/src/hashstore/hashstoreclient.py +++ b/src/hashstore/hashstoreclient.py @@ -257,13 +257,13 @@ def __init__(self, properties, testflag=None): # Get HashStore from factory if testflag: + # Set multiprocessing to true if testing in knbvm module_name = "filehashstore" + os.environ["USE_MULTIPROCESSING"] = "True" else: module_name = "hashstore.filehashstore" class_name = "FileHashStore" - # Set multiprocessing to true - os.environ["USE_MULTIPROCESSING"] = "True" use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" logging.info( "HashStoreClient - use_multiprocessing (bool): %s", use_multiprocessing From 8d9d6540dc8e421804d04e7e9f5bc8fd840911b9 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 8 Sep 2024 11:40:03 -0700 Subject: [PATCH 290/420] Temporarily revert version back to '1.0.0' --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c75a1c8c..47bf6dbf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "hashstore" -version = "1.1.0" +version = "1.0.0" description = "HashStore, an object storage system using content identifiers." 
authors = ["Dou Mok ", "Matt Jones "] readme = "README.md" From 6d754bc091b81bdd9a5fe0a0ea73b34d23de8649 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 8 Sep 2024 14:20:46 -0700 Subject: [PATCH 291/420] Refactor '_check_string' method to use inspect library to grab calling method name instead of an explicit argument from calling method --- src/hashstore/filehashstore.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index e96ef690..e0345d19 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -8,6 +8,7 @@ import hashlib import os import logging +import inspect from pathlib import Path from contextlib import closing from tempfile import NamedTemporaryFile @@ -223,13 +224,7 @@ def _write_properties(self, properties): checked_properties = self._validate_properties(properties) # Collect configuration properties from validated & supplied dictionary - ( - _, - store_depth, - store_width, - store_algorithm, - store_metadata_namespace, - ) = [ + (_, store_depth, store_width, store_algorithm, store_metadata_namespace,) = [ checked_properties[property_name] for property_name in self.property_required_keys ] @@ -479,7 +474,7 @@ def store_object( "FileHashStore - store_object: Request to store object for pid: %s", pid ) # Validate input parameters - self._check_string(pid, "pid", "store_object") + self._check_string(pid, "pid") self._check_arg_data(data) self._check_integer(expected_object_size) ( @@ -2537,18 +2532,19 @@ def _check_integer(file_size): raise ValueError(exception_string) @staticmethod - def _check_string(string, arg, method): + def _check_string(string, arg): """Check whether a string is None or empty; throw an exception if so. :param str string: Value to check. :param str arg: Name of the argument to check. - :param str method: Calling method for logging purposes. 
""" if string is None or string.strip() == "": + method = inspect.stack()[1].function exception_string = ( f"FileHashStore - {method}: {arg} cannot be None" + f" or empty, {arg}: {string}." ) + print(exception_string) logging.error(exception_string) raise ValueError(exception_string) From d81ebc76f4ae8f603d2e297b72558ad1668ce308 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 8 Sep 2024 14:23:31 -0700 Subject: [PATCH 292/420] Fix functions calling '_check_string' --- src/hashstore/filehashstore.py | 37 +++++++++++++--------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index e0345d19..508fe7f1 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -560,8 +560,8 @@ def store_object( def verify_object( self, object_metadata, checksum, checksum_algorithm, expected_file_size ): - self._check_string(checksum, "checksum", "verify_object") - self._check_string(checksum_algorithm, "checksum_algorithm", "verify_object") + self._check_string(checksum, "checksum") + self._check_string(checksum_algorithm, "checksum_algorithm") self._check_integer(expected_file_size) if object_metadata is None or not isinstance(object_metadata, ObjectMetadata): exception_string = ( @@ -601,8 +601,8 @@ def tag_object(self, pid, cid): cid, pid, ) - self._check_string(pid, "pid", "tag_object") - self._check_string(cid, "cid", "tag_object") + self._check_string(pid, "pid") + self._check_string(cid, "cid") sync_begin_debug_msg = ( f"FileHashStore - tag_object: Adding cid ({pid}) to locked list." 
@@ -758,7 +758,7 @@ def find_object(self, pid): logging.debug( "FileHashStore - find_object: Request to find object for for pid: %s", pid ) - self._check_string(pid, "pid", "find_object") + self._check_string(pid, "pid") pid_ref_abs_path = self._resolve_path("pid", pid) if os.path.exists(pid_ref_abs_path): @@ -832,7 +832,7 @@ def store_metadata(self, pid, metadata, format_id=None): "FileHashStore - store_metadata: Request to store metadata for pid: %s", pid ) # Validate input parameters - self._check_string(pid, "pid", "store_metadata") + self._check_string(pid, "pid") checked_format_id = self._check_arg_format_id(format_id, "store_metadata") self._check_arg_data(metadata) pid_doc = self._computehash(pid + checked_format_id) @@ -892,7 +892,7 @@ def retrieve_object(self, pid): "FileHashStore - retrieve_object: Request to retrieve object for pid: %s", pid, ) - self._check_string(pid, "pid", "retrieve_object") + self._check_string(pid, "pid") object_info_dict = self.find_object(pid) object_cid = object_info_dict.get("cid") @@ -921,7 +921,7 @@ def retrieve_metadata(self, pid, format_id=None): "FileHashStore - retrieve_metadata: Request to retrieve metadata for pid: %s", pid, ) - self._check_string(pid, "pid", "retrieve_metadata") + self._check_string(pid, "pid") checked_format_id = self._check_arg_format_id(format_id, "retrieve_metadata") entity = "metadata" @@ -951,7 +951,7 @@ def delete_object(self, ab_id, id_type=None): logging.debug( "FileHashStore - delete_object: Request to delete object for id: %s", ab_id ) - self._check_string(ab_id, "ab_id", "delete_object") + self._check_string(ab_id, "ab_id") if id_type == "cid": cid_refs_abs_path = self._resolve_path("cid", ab_id) @@ -1200,7 +1200,7 @@ def delete_metadata(self, pid, format_id=None): "FileHashStore - delete_metadata: Request to delete metadata for pid: %s", pid, ) - self._check_string(pid, "pid", "delete_metadata") + self._check_string(pid, "pid") checked_format_id = self._check_arg_format_id(format_id, 
"delete_metadata") metadata_directory = self._computehash(pid) rel_path = "/".join(self._shard(metadata_directory)) @@ -1330,8 +1330,8 @@ def get_hex_digest(self, pid, algorithm): "FileHashStore - get_hex_digest: Request to get hex digest for object with pid: %s", pid, ) - self._check_string(pid, "pid", "get_hex_digest") - self._check_string(algorithm, "algorithm", "get_hex_digest") + self._check_string(pid, "pid") + self._check_string(algorithm, "algorithm") entity = "objects" algorithm = self._clean_algorithm(algorithm) @@ -2146,17 +2146,9 @@ def _check_arg_algorithms_and_checksum( additional_algorithm_checked = self._clean_algorithm(additional_algorithm) checksum_algorithm_checked = None if checksum is not None: - self._check_string( - checksum_algorithm, - "checksum_algorithm", - "_check_arg_algorithms_and_checksum (store_object)", - ) + self._check_string(checksum_algorithm, "checksum_algorithm") if checksum_algorithm is not None: - self._check_string( - checksum, - "checksum", - "_check_arg_algorithms_and_checksum (store_object)", - ) + self._check_string(checksum, "checksum") # Set checksum_algorithm checksum_algorithm_checked = self._clean_algorithm(checksum_algorithm) return additional_algorithm_checked, checksum_algorithm_checked @@ -2544,7 +2536,6 @@ def _check_string(string, arg): f"FileHashStore - {method}: {arg} cannot be None" + f" or empty, {arg}: {string}." 
) - print(exception_string) logging.error(exception_string) raise ValueError(exception_string) From 10b5de58ce186adc2f4ec9434eed9449237f73a5 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 8 Sep 2024 14:31:20 -0700 Subject: [PATCH 293/420] Add new pytest for '_check_string' and also revise '_check_string' to check if any character is illegal --- src/hashstore/filehashstore.py | 2 +- tests/test_filehashstore.py | 34 +++++++++++++++++++++++++++------- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 508fe7f1..57112baf 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2530,7 +2530,7 @@ def _check_string(string, arg): :param str string: Value to check. :param str arg: Name of the argument to check. """ - if string is None or string.strip() == "": + if string is None or string.strip() == "" or any(ch.isspace() for ch in string): method = inspect.stack()[1].function exception_string = ( f"FileHashStore - {method}: {arg} cannot be None" diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 6c8ff99c..12374449 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -11,6 +11,7 @@ UnsupportedAlgorithm, ) + # pylint: disable=W0212 @@ -355,7 +356,7 @@ def test_store_data_only_file_size(pids, store): def test_store_data_only_hex_digests(pids, store): - """Check _store_data_only generates hex digests dictionary.""" + """Check _store_data_only generates a hex digests dictionary.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -467,12 +468,7 @@ def test_move_and_get_checksums_incorrect_file_size(pids, store): input_stream = io.open(path, "rb") incorrect_file_size = 1000 # pylint: disable=W0212 - ( - _, - _, - _, - _, - ) = store._move_and_get_checksums( + (_, _, _, _,) = store._move_and_get_checksums( pid, input_stream, file_size_to_validate=incorrect_file_size ) 
input_stream.close() @@ -1115,3 +1111,27 @@ def test_resolve_path_refs_cid(pids, store): calculated_cid_ref_path = store.cids + "/" + "/".join(store._shard(cid)) assert resolved_cid_ref_abs_path == calculated_cid_ref_path + + +def test_check_string(store): + """Confirm that an exception is raised when a string is None, empty or an illegal character + (ex. tabs or new lines)""" + empty_pid_with_spaces = " " + with pytest.raises(ValueError): + store._check_string(empty_pid_with_spaces, "empty_pid_with_spaces") + + none_value = None + with pytest.raises(ValueError): + store._check_string(none_value, "none_value") + + new_line = "\n" + with pytest.raises(ValueError): + store._check_string(new_line, "new_line") + + new_line_with_other_chars = "hello \n" + with pytest.raises(ValueError): + store._check_string(new_line_with_other_chars, "new_line_with_other_chars") + + tab_line = "\t" + with pytest.raises(ValueError): + store._check_string(tab_line, "tab_line") From e345834a9d48d36e9cc1e2e74da1ec9f8294cd14 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 8 Sep 2024 14:32:08 -0700 Subject: [PATCH 294/420] Revise doc strings --- src/hashstore/filehashstore.py | 3 ++- tests/test_filehashstore.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 57112baf..6717afd4 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2525,7 +2525,8 @@ def _check_integer(file_size): @staticmethod def _check_string(string, arg): - """Check whether a string is None or empty; throw an exception if so. + """Check whether a string is None or empty - or if it contains an illegal character; + throws an exception if so. :param str string: Value to check. :param str arg: Name of the argument to check. 
diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 12374449..26881976 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1114,8 +1114,8 @@ def test_resolve_path_refs_cid(pids, store): def test_check_string(store): - """Confirm that an exception is raised when a string is None, empty or an illegal character - (ex. tabs or new lines)""" + """Confirm that an exception is raised when a string is None, empty or contains an illegal + character (ex. tabs or new lines)""" empty_pid_with_spaces = " " with pytest.raises(ValueError): store._check_string(empty_pid_with_spaces, "empty_pid_with_spaces") From 37a38d8f3a6c9bb9ace22e155f17af5b5cdb3423 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 8 Sep 2024 14:39:56 -0700 Subject: [PATCH 295/420] Fix inaccurate docstrings in '_store_and_validate_data' and related methods RE: 'file_size_to_validate' arg --- src/hashstore/filehashstore.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 6717afd4..7faa3ec8 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1375,7 +1375,7 @@ def _store_and_validate_data( :param str checksum: Optional checksum to validate object against hex digest before moving to permanent location. :param str checksum_algorithm: Algorithm value of the given checksum. - :param bytes file_size_to_validate: Expected size of the object. + :param int file_size_to_validate: Expected size of the object. :return: ObjectMetadata - object that contains the object id, object file size, and hex digest dictionary. @@ -1484,7 +1484,7 @@ def _move_and_get_checksums( :param str checksum: Optional checksum to validate the object against hex digest before moving to the permanent location. :param str checksum_algorithm: Algorithm value of the given checksum. - :param bytes file_size_to_validate: Expected size of the object. 
+ :param int file_size_to_validate: Expected size of the object. :return: tuple - Object ID, object file size, and hex digest dictionary. """ From 812905083dbaeba17b5dbf220142a296851250c3 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 8 Sep 2024 14:53:42 -0700 Subject: [PATCH 296/420] Revise docstrings to resolve linting warnings --- src/hashstore/filehashstore.py | 26 +++++++++++++------------- src/hashstore/hashstore.py | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 7faa3ec8..b9f7addf 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -630,8 +630,8 @@ def tag_object(self, pid, cid): pid_refs_path = self._resolve_path("pid", pid) cid_refs_path = self._resolve_path("cid", cid) # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' - self._create_path(os.path.dirname(pid_refs_path)) - self._create_path(os.path.dirname(cid_refs_path)) + self._create_path(Path(os.path.dirname(pid_refs_path))) + self._create_path(Path(os.path.dirname(cid_refs_path))) if os.path.exists(pid_refs_path) and os.path.exists(cid_refs_path): self._verify_hashstore_references( @@ -1476,7 +1476,7 @@ def _move_and_get_checksums( not match what is provided). :param str pid: Authority-based identifier. - :param io.BufferedReader stream: Object stream. + :param Stream stream: Object stream. :param str extension: Optional extension to append to the file when saving. 
:param str additional_algorithm: Optional algorithm value to include @@ -1524,7 +1524,7 @@ def _move_and_get_checksums( tmp_file_size, file_size_to_validate, ) - self._create_path(os.path.dirname(abs_file_path)) + self._create_path(Path(os.path.dirname(abs_file_path))) try: debug_msg = ( "FileHashStore - _move_and_get_checksums: Moving temp file to permanent" @@ -1625,7 +1625,7 @@ def _write_to_tmp_file_and_get_hex_digests( algorithm is provided, it will add the respective hex digest to the dictionary if it is supported. - :param io.BufferedReader stream: Object stream. + :param Stream stream: Object stream. :param str additional_algorithm: Algorithm of additional hex digest to generate. :param str checksum_algorithm: Algorithm of additional checksum algo to generate. @@ -1711,7 +1711,7 @@ def _write_to_tmp_file_and_get_hex_digests( def _mktmpfile(self, path): """Create a temporary file at the given path ready to be written. - :param str path: Path to the file location. + :param Path path: Path to the file location. :return: file object - object with a file-like interface. """ @@ -1743,7 +1743,7 @@ def _write_refs_file(self, path, ref_id, ref_type): difference being that a cid reference file can potentially contain multiple lines of `pid`s that reference the `cid`. - :param str path: Directory to write a temporary file into + :param path path: Directory to write a temporary file into :param str ref_id: Authority-based, persistent or content identifier :param str ref_type: 'cid' or 'pid' @@ -1907,7 +1907,7 @@ def _put_metadata(self, metadata, pid, metadata_doc_name): def _mktmpmetadata(self, stream): """Create a named temporary file with `stream` (metadata). - :param io.BufferedReader stream: Metadata stream. + :param Stream stream: Metadata stream. :return: Path/name of temporary file created and written into. 
:rtype: str @@ -1947,12 +1947,12 @@ def _verify_object_information( """Evaluates an object's integrity - if there is a mismatch, deletes the object in question and raises an exception. - :param str pid: For logging purposes. + :param Optional[str] pid: For logging purposes. :param str checksum: Value of the checksum to check. :param str checksum_algorithm: Algorithm of the checksum. :param str entity: Type of object ('objects' or 'metadata'). :param dict hex_digests: Dictionary of hex digests to parse. - :param str tmp_file_name: Name of the temporary file. + :param Optional[str] tmp_file_name: Name of the temporary file. :param int tmp_file_size: Size of the temporary file. :param int file_size_to_validate: Expected size of the object. """ @@ -2340,7 +2340,7 @@ def _delete(self, entity, file): def _rename_path_for_deletion(path): """Rename a given path by appending '_delete' and move it to the renamed path. - :param Path path: Path to file to rename + :param string path: Path to file to rename :return: Path to the renamed file :rtype: str @@ -2354,7 +2354,7 @@ def _rename_path_for_deletion(path): def _create_path(self, path): """Physically create the folder path (and all intermediate ones) on disk. - :param str path: The path to create. + :param Path path: The path to create. :raises AssertionError: If the path already exists but is not a directory. """ try: @@ -2544,7 +2544,7 @@ def _check_string(string, arg): def _cast_to_bytes(text): """Convert text to a sequence of bytes using utf-8 encoding. - :param str text: String to convert. + :param Any text: String to convert. :return: Bytes with utf-8 encoding. :rtype: bytes """ diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index cf874864..a40c7735 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -253,7 +253,7 @@ class ObjectMetadata( :param str pid: An authority-based or persistent identifier :param str cid: A unique identifier for the object (Hash ID, hex digest). 
- :param bytes obj_size: The size of the object in bytes. + :param int obj_size: The size of the object in bytes. :param list hex_digests: A list of hex digests to validate objects (md5, sha1, sha256, sha384, sha512) (optional). """ From 9a9057b5c2565f3a1cdd39da6095d08de4b14687 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 9 Sep 2024 11:59:03 -0700 Subject: [PATCH 297/420] Refactor 'ObjectMetadata' class from namedtuple to be a dataclass --- src/hashstore/hashstore.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index a40c7735..e3a9b4e9 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -1,7 +1,7 @@ """Hashstore Interface""" from abc import ABC, abstractmethod -from collections import namedtuple +from dataclasses import dataclass import importlib.metadata import importlib.util @@ -241,9 +241,8 @@ def get_hashstore(module_name, class_name, properties=None): ) -class ObjectMetadata( - namedtuple("ObjectMetadata", ["pid", "cid", "obj_size", "hex_digests"]) -): +@dataclass +class ObjectMetadata: """Represents metadata associated with an object. The `ObjectMetadata` class represents metadata associated with an object, including @@ -258,6 +257,7 @@ class ObjectMetadata( (md5, sha1, sha256, sha384, sha512) (optional). 
""" - # Default value to prevent dangerous default value - def __new__(cls, pid, cid, obj_size, hex_digests=None): - return super(ObjectMetadata, cls).__new__(cls, pid, cid, obj_size, hex_digests) + pid: str + cid: str + obj_size: int + hex_digests: dict From 7f44395af0184801346e2c111fc792bbc6c505cf Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 9 Sep 2024 12:06:53 -0700 Subject: [PATCH 298/420] Extract 'ObjectMetadata' class from 'HashStore' to 'FileHashStore', revise and organize affected code --- src/hashstore/__init__.py | 4 ++-- src/hashstore/filehashstore.py | 29 ++++++++++++++++++++++++++--- src/hashstore/hashstore.py | 22 ---------------------- tests/test_filehashstore.py | 25 ++++++++++++++++++++++++- tests/test_hashstore.py | 25 +------------------------ 5 files changed, 53 insertions(+), 52 deletions(-) diff --git a/src/hashstore/__init__.py b/src/hashstore/__init__.py index 14a40b8c..a841efa3 100644 --- a/src/hashstore/__init__.py +++ b/src/hashstore/__init__.py @@ -16,6 +16,6 @@ system. 
""" -from hashstore.hashstore import HashStore, HashStoreFactory, ObjectMetadata +from hashstore.hashstore import HashStore, HashStoreFactory -__all__ = ("HashStore", "HashStoreFactory", "ObjectMetadata") +__all__ = ("HashStore", "HashStoreFactory") diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index b9f7addf..2be657c4 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -9,12 +9,13 @@ import os import logging import inspect +import fcntl +import yaml +from dataclasses import dataclass from pathlib import Path from contextlib import closing from tempfile import NamedTemporaryFile -import fcntl -import yaml -from hashstore import HashStore, ObjectMetadata +from hashstore import HashStore from hashstore.filehashstore_exceptions import ( CidRefsContentError, CidRefsDoesNotExist, @@ -2611,3 +2612,25 @@ def close(self): self._obj.close() else: self._obj.seek(self._pos) + + +@dataclass +class ObjectMetadata: + """Represents metadata associated with an object. + + The `ObjectMetadata` class represents metadata associated with an object, including + a persistent or authority-based identifier (`pid`), a content identifier (`cid`), + the size of the object in bytes (`obj_size`), and an optional list of hex digests + (`hex_digests`) to assist with validating objects. + + :param str pid: An authority-based or persistent identifier + :param str cid: A unique identifier for the object (Hash ID, hex digest). + :param int obj_size: The size of the object in bytes. + :param list hex_digests: A list of hex digests to validate objects + (md5, sha1, sha256, sha384, sha512) (optional). 
+ """ + + pid: str + cid: str + obj_size: int + hex_digests: dict diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index e3a9b4e9..90be1da7 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -239,25 +239,3 @@ def get_hashstore(module_name, class_name, properties=None): raise AttributeError( f"Class name '{class_name}' is not an attribute of module '{module_name}'" ) - - -@dataclass -class ObjectMetadata: - """Represents metadata associated with an object. - - The `ObjectMetadata` class represents metadata associated with an object, including - a persistent or authority-based identifier (`pid`), a content identifier (`cid`), - the size of the object in bytes (`obj_size`), and an optional list of hex digests - (`hex_digests`) to assist with validating objects. - - :param str pid: An authority-based or persistent identifier - :param str cid: A unique identifier for the object (Hash ID, hex digest). - :param int obj_size: The size of the object in bytes. - :param list hex_digests: A list of hex digests to validate objects - (md5, sha1, sha256, sha384, sha512) (optional). 
- """ - - pid: str - cid: str - obj_size: int - hex_digests: dict diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 26881976..487b0e54 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -4,7 +4,7 @@ import os from pathlib import Path import pytest -from hashstore.filehashstore import FileHashStore +from hashstore.filehashstore import FileHashStore, ObjectMetadata from hashstore.filehashstore_exceptions import ( NonMatchingChecksum, NonMatchingObjSize, @@ -1135,3 +1135,26 @@ def test_check_string(store): tab_line = "\t" with pytest.raises(ValueError): store._check_string(tab_line, "tab_line") + + +def test_objectmetadata(store): + """Test ObjectMetadata class returns correct values via dot notation.""" + pid = "hashstore" + ab_id = "hashstoretest" + obj_size = 1234 + hex_digest_dict = { + "md5": "md5value", + "sha1": "sha1value", + "sha224": "sha224value", + "sha256": "sha256value", + "sha512": "sha512value", + } + object_metadata = ObjectMetadata(pid, ab_id, obj_size, hex_digest_dict) + assert object_metadata.pid == pid + assert object_metadata.cid == ab_id + assert object_metadata.obj_size == obj_size + assert object_metadata.hex_digests.get("md5") == hex_digest_dict["md5"] + assert object_metadata.hex_digests.get("sha1") == hex_digest_dict["sha1"] + assert object_metadata.hex_digests.get("sha224") == hex_digest_dict["sha224"] + assert object_metadata.hex_digests.get("sha256") == hex_digest_dict["sha256"] + assert object_metadata.hex_digests.get("sha512") == hex_digest_dict["sha512"] diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index bb2c1ac5..140d473a 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -2,7 +2,7 @@ import os import pytest -from hashstore.hashstore import ObjectMetadata, HashStoreFactory +from hashstore.hashstore import HashStoreFactory from hashstore.filehashstore import FileHashStore @@ -156,26 +156,3 @@ def 
test_factory_get_hashstore_filehashstore_nonconflicting_dir(factory, tmp_pat } factory.get_hashstore(module_name, class_name, properties) - - -def test_objectmetadata(): - """Test ObjectMetadata class returns correct values via dot notation.""" - pid = "hashstore" - ab_id = "hashstoretest" - obj_size = 1234 - hex_digest_dict = { - "md5": "md5value", - "sha1": "sha1value", - "sha224": "sha224value", - "sha256": "sha256value", - "sha512": "sha512value", - } - object_metadata = ObjectMetadata(pid, ab_id, obj_size, hex_digest_dict) - assert object_metadata.pid == pid - assert object_metadata.cid == ab_id - assert object_metadata.obj_size == obj_size - assert object_metadata.hex_digests.get("md5") == hex_digest_dict["md5"] - assert object_metadata.hex_digests.get("sha1") == hex_digest_dict["sha1"] - assert object_metadata.hex_digests.get("sha224") == hex_digest_dict["sha224"] - assert object_metadata.hex_digests.get("sha256") == hex_digest_dict["sha256"] - assert object_metadata.hex_digests.get("sha512") == hex_digest_dict["sha512"] From fb31bb5d35c568b0aece30aed6051be3528fd7e2 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 9 Sep 2024 12:13:10 -0700 Subject: [PATCH 299/420] Revise docstrings in hashstore interface for accuracy --- src/hashstore/hashstore.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 90be1da7..6b434a1c 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -28,19 +28,20 @@ def store_object( ): """Atomic storage of objects to disk using a given stream. Upon successful storage, it returns an `ObjectMetadata` object containing relevant file information, such as - the file's id, the file's size, and a hex digest dictionary of algorithms and checksums. - The method also tags the object, creating references for discoverability. 
+ a persistent identifier that references the data file, the file's size, and a hex digest + dictionary of algorithms and checksums. The method also tags the object, creating + references for discoverability. `store_object` ensures that an object is stored only once by synchronizing multiple calls and rejecting attempts to store duplicate objects. If called without a pid, it stores the object without tagging, and it becomes the caller's responsibility to finalize the process by calling `tag_object` after verifying the correct object is stored. - The file's id is determined by calculating the object's content identifier based on the - store's default algorithm, which is also the permanent address of the file. The file's - identifier is then sharded using the store's configured depth and width, delimited by '/', - and concatenated to produce the final permanent address. This address is stored in the - `/store_directory/objects/` directory. + The file's permanent address is determined by calculating the object's content identifier + based on the store's default algorithm, which is also the permanent address of the file. + The content identifier is then sharded using the store's configured depth and width, + delimited by '/', and concatenated to produce the final permanent address. This address + is stored in the `/store_directory/objects/` directory. By default, the hex digest map includes common hash algorithms (md5, sha1, sha256, sha384, sha512). If an additional algorithm is provided, the method checks if it is supported and @@ -59,8 +60,8 @@ def store_object( :param str checksum_algorithm: Algorithm of the supplied checksum. :param int expected_object_size: Size of the object to verify. - :return: ObjectMetadata - Object containing the permanent address, file size, and - hex digest dictionary. + :return: ObjectMetadata - Object containing the persistent identifier (pid), + content identifier (cid), object size and hex digests dictionary (checksums). 
""" raise NotImplementedError() @@ -113,8 +114,9 @@ def store_metadata(self, pid, metadata, format_id): """Add or update metadata, such as `sysmeta`, to disk using the given path/stream. The `store_metadata` method uses a persistent identifier `pid` and a metadata `format_id` to determine the permanent address of the metadata object. All metadata documents for a - given `pid` will be stored in a directory (under ../metadata) that is determined by - calculating the hash of the given pid, with the document name being the hash of the pid + given `pid` will be stored in a directory that follows the HashStore configuration + settings (under ../metadata) that is determined by calculating the hash of the given pid. + Metadata documents are stored in this directory, and is each named using the hash of the pid and metadata format (`pid` + `format_id`). Upon successful storage of metadata, the method returns a string representing the file's From 8411d5a70216a19abf4b2a5e3efdbb55e9fe5992 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 9 Sep 2024 12:13:45 -0700 Subject: [PATCH 300/420] Remove unused import --- src/hashstore/hashstore.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 6b434a1c..f683fb0f 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -1,7 +1,6 @@ """Hashstore Interface""" from abc import ABC, abstractmethod -from dataclasses import dataclass import importlib.metadata import importlib.util From 61933290c2a40d3d5ec4ce3c0d6e55ddf92e1a46 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 9 Sep 2024 12:14:33 -0700 Subject: [PATCH 301/420] Cleanup 'test_filehashstore' for unused variable --- tests/test_filehashstore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 487b0e54..825b9273 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1137,7 +1137,7 @@ def 
test_check_string(store): store._check_string(tab_line, "tab_line") -def test_objectmetadata(store): +def test_objectmetadata(): """Test ObjectMetadata class returns correct values via dot notation.""" pid = "hashstore" ab_id = "hashstoretest" From 78e8af738fb9e4f610ab77da822893ad33f59741 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 10 Sep 2024 09:27:31 -0700 Subject: [PATCH 302/420] Refactor '_validate_properties' in filehashstore to return a properties object where the store depth and width is casted into an integer and add new pytest --- src/hashstore/filehashstore.py | 26 ++++++++++++++++++++++++-- tests/test_hashstore.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 2be657c4..c72dfb8a 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -392,6 +392,9 @@ def _validate_properties(self, properties): logging.debug(exception_string) raise ValueError(exception_string) + # New dictionary for validated properties + checked_properties = {} + for key in self.property_required_keys: if key not in properties: exception_string = ( @@ -400,14 +403,33 @@ def _validate_properties(self, properties): ) logging.debug(exception_string) raise KeyError(exception_string) - if properties.get(key) is None: + + value = properties.get(key) + if value is None: exception_string = ( "FileHashStore - _validate_properties: Value for key:" + f" {key} is none." ) logging.debug(exception_string) raise ValueError(exception_string) - return properties + + # Add key and values to checked_properties + if key == "store_depth" or key == "store_width": + # Ensure store depth and width are integers + try: + checked_properties[key] = int(value) + except Exception as err: + exception_string = ( + "FileHashStore - _validate_properties: Unexpected exception when" + " attempting to ensure store depth and width are integers. 
Details: " + + str(err) + ) + logging.debug(exception_string) + raise ValueError(exception_string) + else: + checked_properties[key] = value + + return checked_properties def _set_default_algorithms(self): """Set the default algorithms to calculate when storing objects.""" diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index 140d473a..34ba6e15 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -156,3 +156,34 @@ def test_factory_get_hashstore_filehashstore_nonconflicting_dir(factory, tmp_pat } factory.get_hashstore(module_name, class_name, properties) + + +def test_factory_get_hashstore_filehashstore_string_int_prop(factory, tmp_path): + """Check factory does not raise exception when an integer is passed as a string in a + properties object.""" + module_name = "hashstore.filehashstore" + class_name = "FileHashStore" + + directory = tmp_path / "douhs" / "inttest" + directory.mkdir(parents=True) + douhspath = (tmp_path / "douhs").as_posix() + + properties = { + "store_path": douhspath, + "store_depth": "3", + "store_width": "2", + "store_algorithm": "SHA-256", + "store_metadata_namespace": "https://ns.dataone.org/service/types/v2.0#SystemMetadata", + } + + factory.get_hashstore(module_name, class_name, properties) + + properties = { + "store_path": douhspath, + "store_depth": str(3), + "store_width": str(2), + "store_algorithm": "SHA-256", + "store_metadata_namespace": "https://ns.dataone.org/service/types/v2.0#SystemMetadata", + } + + factory.get_hashstore(module_name, class_name, properties) From 2869c0c8613dd1849f95120017fbfb4ad6180be5 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 10 Sep 2024 10:36:35 -0700 Subject: [PATCH 303/420] Remove 'find_object' from the hashstore interface and re-organize pytests --- src/hashstore/filehashstore.py | 13 +++++ src/hashstore/hashstore.py | 17 ------ tests/test_filehashstore.py | 84 +++++++++++++++++++++++++++ tests/test_filehashstore_interface.py | 80 ------------------------- 4 files 
changed, 97 insertions(+), 97 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index c72dfb8a..45b2e5d7 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -778,6 +778,19 @@ def tag_object(self, pid, cid): self.reference_condition.notify() def find_object(self, pid): + """Check if an object referenced by a pid exists and retrieve its content identifier. + The `find_object` method validates the existence of an object based on the provided + pid and returns the associated content identifier. + + :param str pid: Authority-based or persistent identifier of the object. + + :return: obj_info_dict (dict): + - cid: content identifier + - cid_object_path: path to the object + - cid_refs_path: path to the cid refs file + - pid_refs_path: path to the pid refs file + - sysmeta_path: path to the sysmeta file + """ logging.debug( "FileHashStore - find_object: Request to find object for for pid: %s", pid ) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index f683fb0f..0942f14f 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -91,23 +91,6 @@ def verify_object( """ raise NotImplementedError() - @abstractmethod - def find_object(self, pid): - """Check if an object referenced by a pid exists and retrieve its content identifier. - The `find_object` method validates the existence of an object based on the provided - pid and returns the associated content identifier. - - :param str pid: Authority-based or persistent identifier of the object. - - :return: obj_info_dict (dict): - - cid: content identifier - - cid_object_path: path to the object - - cid_refs_path: path to the cid refs file - - pid_refs_path: path to the pid refs file - - sysmeta_path: path to the sysmeta file - """ - raise NotImplementedError() - @abstractmethod def store_metadata(self, pid, metadata, format_id): """Add or update metadata, such as `sysmeta`, to disk using the given path/stream. 
The diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 825b9273..68431c68 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -6,8 +6,12 @@ import pytest from hashstore.filehashstore import FileHashStore, ObjectMetadata from hashstore.filehashstore_exceptions import ( + CidRefsDoesNotExist, NonMatchingChecksum, NonMatchingObjSize, + PidNotFoundInCidRefsFile, + PidRefsDoesNotExist, + RefsFileExistsButCidObjMissing, UnsupportedAlgorithm, ) @@ -816,6 +820,86 @@ def test_verify_object_information_missing_key_in_hex_digests_supported_algo( ) +def test_find_object(pids, store): + """Test find_object returns the correct content identifier (cid).""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + obj_info_dict = store.find_object(pid) + assert obj_info_dict.get("cid") == object_metadata.hex_digests.get("sha256") + + +def test_find_object_refs_exist_but_obj_not_found(pids, store): + """Test find_object throws exception when refs file exist but the object does not.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + store.store_object(pid, path) + + cid = store.find_object(pid).get("cid") + obj_path = store._resolve_path("objects", cid) + os.remove(obj_path) + + with pytest.raises(RefsFileExistsButCidObjMissing): + store.find_object(pid) + + +def test_find_object_cid_refs_not_found(pids, store): + """Test find_object throws exception when pid refs file is found with a cid + but the cid does not exist.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + _object_metadata = store.store_object(pid, path) + + # Place the wrong cid into the pid refs file that has already been created + pid_ref_abs_path = store._resolve_path("pid", pid) + with open(pid_ref_abs_path, "w", encoding="utf8") as pid_ref_file: + pid_ref_file.seek(0) 
+ pid_ref_file.write("intentionally.wrong.pid") + pid_ref_file.truncate() + + with pytest.raises(CidRefsDoesNotExist): + store.find_object(pid) + + +def test_find_object_cid_refs_does_not_contain_pid(pids, store): + """Test find_object throws exception when pid refs file is found with a cid + but the cid refs file does not contain the pid.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + + # Remove the pid from the cid refs file + cid_ref_abs_path = store._resolve_path( + "cid", object_metadata.hex_digests.get("sha256") + ) + store._update_refs_file(cid_ref_abs_path, pid, "remove") + + with pytest.raises(PidNotFoundInCidRefsFile): + store.find_object(pid) + + +def test_find_object_pid_refs_not_found(store): + """Test find object throws exception when object doesn't exist.""" + with pytest.raises(PidRefsDoesNotExist): + store.find_object("dou.test.1") + + +def test_find_object_pid_none(store): + """Test find object throws exception when pid is None.""" + with pytest.raises(ValueError): + store.find_object(None) + + +def test_find_object_pid_empty(store): + """Test find object throws exception when pid is empty.""" + with pytest.raises(ValueError): + store.find_object("") + + def test_clean_algorithm(store): """Check that algorithm values get formatted as expected.""" algorithm_underscore = "sha_256" diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 08ecb053..a174cb66 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -716,86 +716,6 @@ def test_store_object_sparse_large_file(store): assert object_metadata_id == object_metadata.hex_digests.get("sha256") -def test_find_object(pids, store): - """Test find_object returns the correct content identifier (cid).""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata 
= store.store_object(pid, path) - obj_info_dict = store.find_object(pid) - assert obj_info_dict.get("cid") == object_metadata.hex_digests.get("sha256") - - -def test_find_object_refs_exist_but_obj_not_found(pids, store): - """Test find_object throws exception when refs file exist but the object does not.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - store.store_object(pid, path) - - cid = store.find_object(pid).get("cid") - obj_path = store._resolve_path("objects", cid) - os.remove(obj_path) - - with pytest.raises(RefsFileExistsButCidObjMissing): - store.find_object(pid) - - -def test_find_object_cid_refs_not_found(pids, store): - """Test find_object throws exception when pid refs file is found with a cid - but the cid does not exist.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - _object_metadata = store.store_object(pid, path) - - # Place the wrong cid into the pid refs file that has already been created - pid_ref_abs_path = store._resolve_path("pid", pid) - with open(pid_ref_abs_path, "w", encoding="utf8") as pid_ref_file: - pid_ref_file.seek(0) - pid_ref_file.write("intentionally.wrong.pid") - pid_ref_file.truncate() - - with pytest.raises(CidRefsDoesNotExist): - store.find_object(pid) - - -def test_find_object_cid_refs_does_not_contain_pid(pids, store): - """Test find_object throws exception when pid refs file is found with a cid - but the cid refs file does not contain the pid.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - - # Remove the pid from the cid refs file - cid_ref_abs_path = store._resolve_path( - "cid", object_metadata.hex_digests.get("sha256") - ) - store._update_refs_file(cid_ref_abs_path, pid, "remove") - - with pytest.raises(PidNotFoundInCidRefsFile): - store.find_object(pid) - - -def 
test_find_object_pid_refs_not_found(store): - """Test find object throws exception when object doesn't exist.""" - with pytest.raises(PidRefsDoesNotExist): - store.find_object("dou.test.1") - - -def test_find_object_pid_none(store): - """Test find object throws exception when pid is None.""" - with pytest.raises(ValueError): - store.find_object(None) - - -def test_find_object_pid_empty(store): - """Test find object throws exception when pid is empty.""" - with pytest.raises(ValueError): - store.find_object("") - - def test_store_metadata(pids, store): """Test store metadata.""" test_dir = "tests/testdata/" From b54db03bc58b0e2454693df81a49999c244dfdb6 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 10 Sep 2024 10:37:19 -0700 Subject: [PATCH 304/420] Cleanup unused code and docstrings --- tests/test_filehashstore_interface.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index a174cb66..06ce0816 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -10,12 +10,9 @@ import pytest from hashstore.filehashstore_exceptions import ( - CidRefsDoesNotExist, NonMatchingChecksum, NonMatchingObjSize, - PidNotFoundInCidRefsFile, PidRefsDoesNotExist, - RefsFileExistsButCidObjMissing, UnsupportedAlgorithm, ) @@ -197,7 +194,7 @@ def test_store_object_additional_algorithm_hyphen_uppercase(pids, store): def test_store_object_additional_algorithm_hyphen_lowercase(pids, store): - """Test store object accepts an with additional algo that's supported in lowercase.""" + """Test store object accepts an additional algo that's supported in lowercase.""" test_dir = "tests/testdata/" entity = "objects" pid = "jtao.1700.1" From 249f9f85420b4cf6726cd2d15bcf246560d9d423 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 10 Sep 2024 10:38:31 -0700 Subject: [PATCH 305/420] Move placement of 'find_object' function in 'filehashstore' --- 
src/hashstore/filehashstore.py | 172 ++++++++++++++++----------------- 1 file changed, 86 insertions(+), 86 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 45b2e5d7..def0141a 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -777,92 +777,6 @@ def tag_object(self, pid, cid): self.reference_locked_cids.remove(cid) self.reference_condition.notify() - def find_object(self, pid): - """Check if an object referenced by a pid exists and retrieve its content identifier. - The `find_object` method validates the existence of an object based on the provided - pid and returns the associated content identifier. - - :param str pid: Authority-based or persistent identifier of the object. - - :return: obj_info_dict (dict): - - cid: content identifier - - cid_object_path: path to the object - - cid_refs_path: path to the cid refs file - - pid_refs_path: path to the pid refs file - - sysmeta_path: path to the sysmeta file - """ - logging.debug( - "FileHashStore - find_object: Request to find object for for pid: %s", pid - ) - self._check_string(pid, "pid") - - pid_ref_abs_path = self._resolve_path("pid", pid) - if os.path.exists(pid_ref_abs_path): - # Read the file to get the cid from the pid reference - with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: - pid_refs_cid = pid_ref_file.read() - - # Confirm that the cid reference file exists - cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) - if os.path.exists(cid_ref_abs_path): - # Check that the pid is actually found in the cid reference file - if self._is_string_in_refs_file(pid, cid_ref_abs_path): - # Object must also exist in order to return the cid retrieved - if not self._exists("objects", pid_refs_cid): - err_msg = ( - f"FileHashStore - find_object: Refs file found for pid ({pid}) at" - + pid_ref_abs_path - + f", but object referenced does not exist, cid: {pid_refs_cid}" - ) - logging.error(err_msg) - raise 
RefsFileExistsButCidObjMissing(err_msg) - else: - sysmeta_doc_name = self._computehash(pid + self.sysmeta_ns) - metadata_directory = self._computehash(pid) - metadata_rel_path = "/".join(self._shard(metadata_directory)) - sysmeta_full_path = ( - self._get_store_path("metadata") - / metadata_rel_path - / sysmeta_doc_name - ) - obj_info_dict = { - "cid": pid_refs_cid, - "cid_object_path": self._resolve_path( - "objects", pid_refs_cid - ), - "cid_refs_path": cid_ref_abs_path, - "pid_refs_path": pid_ref_abs_path, - "sysmeta_path": ( - sysmeta_full_path - if os.path.isdir(sysmeta_full_path) - else "Does not exist." - ), - } - return obj_info_dict - else: - # If not, it is an orphan pid refs file - err_msg = ( - "FileHashStore - find_object: pid refs file exists with cid: " - + f"{pid_refs_cid} for pid: {pid}" - + f", but is missing from cid refs file: {cid_ref_abs_path}" - ) - logging.error(err_msg) - raise PidNotFoundInCidRefsFile(err_msg) - else: - err_msg = ( - f"FileHashStore - find_object: pid refs file exists with cid: {pid_refs_cid}" - + f", but cid refs file not found: {cid_ref_abs_path} for pid: {pid}" - ) - logging.error(err_msg) - raise CidRefsDoesNotExist(err_msg) - else: - err_msg = ( - f"FileHashStore - find_object: pid refs file not found for pid ({pid}): " - + pid_ref_abs_path - ) - logging.error(err_msg) - raise PidRefsDoesNotExist(err_msg) - def store_metadata(self, pid, metadata, format_id=None): logging.debug( "FileHashStore - store_metadata: Request to store metadata for pid: %s", pid @@ -1445,6 +1359,92 @@ def _store_and_validate_data( ) return object_metadata + def find_object(self, pid): + """Check if an object referenced by a pid exists and retrieve its content identifier. + The `find_object` method validates the existence of an object based on the provided + pid and returns the associated content identifier. + + :param str pid: Authority-based or persistent identifier of the object. 
+ + :return: obj_info_dict (dict): + - cid: content identifier + - cid_object_path: path to the object + - cid_refs_path: path to the cid refs file + - pid_refs_path: path to the pid refs file + - sysmeta_path: path to the sysmeta file + """ + logging.debug( + "FileHashStore - find_object: Request to find object for for pid: %s", pid + ) + self._check_string(pid, "pid") + + pid_ref_abs_path = self._resolve_path("pid", pid) + if os.path.exists(pid_ref_abs_path): + # Read the file to get the cid from the pid reference + with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: + pid_refs_cid = pid_ref_file.read() + + # Confirm that the cid reference file exists + cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) + if os.path.exists(cid_ref_abs_path): + # Check that the pid is actually found in the cid reference file + if self._is_string_in_refs_file(pid, cid_ref_abs_path): + # Object must also exist in order to return the cid retrieved + if not self._exists("objects", pid_refs_cid): + err_msg = ( + f"FileHashStore - find_object: Refs file found for pid ({pid}) at" + + pid_ref_abs_path + + f", but object referenced does not exist, cid: {pid_refs_cid}" + ) + logging.error(err_msg) + raise RefsFileExistsButCidObjMissing(err_msg) + else: + sysmeta_doc_name = self._computehash(pid + self.sysmeta_ns) + metadata_directory = self._computehash(pid) + metadata_rel_path = "/".join(self._shard(metadata_directory)) + sysmeta_full_path = ( + self._get_store_path("metadata") + / metadata_rel_path + / sysmeta_doc_name + ) + obj_info_dict = { + "cid": pid_refs_cid, + "cid_object_path": self._resolve_path( + "objects", pid_refs_cid + ), + "cid_refs_path": cid_ref_abs_path, + "pid_refs_path": pid_ref_abs_path, + "sysmeta_path": ( + sysmeta_full_path + if os.path.isdir(sysmeta_full_path) + else "Does not exist." 
+ ), + } + return obj_info_dict + else: + # If not, it is an orphan pid refs file + err_msg = ( + "FileHashStore - find_object: pid refs file exists with cid: " + + f"{pid_refs_cid} for pid: {pid}" + + f", but is missing from cid refs file: {cid_ref_abs_path}" + ) + logging.error(err_msg) + raise PidNotFoundInCidRefsFile(err_msg) + else: + err_msg = ( + f"FileHashStore - find_object: pid refs file exists with cid: {pid_refs_cid}" + + f", but cid refs file not found: {cid_ref_abs_path} for pid: {pid}" + ) + logging.error(err_msg) + raise CidRefsDoesNotExist(err_msg) + else: + err_msg = ( + f"FileHashStore - find_object: pid refs file not found for pid ({pid}): " + + pid_ref_abs_path + ) + logging.error(err_msg) + raise PidRefsDoesNotExist(err_msg) + def _store_data_only(self, data): """Store an object to HashStore and return the ID and a hex digest dictionary of the default algorithms. This method does not validate the From ebb96f6b45150d5cc5ba08624da7502c86a5070a Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 10 Sep 2024 10:43:20 -0700 Subject: [PATCH 306/420] Remove 'find_object' from hashstore client and respective pytests --- src/hashstore/hashstoreclient.py | 22 -------- tests/test_hashstore_client.py | 93 -------------------------------- 2 files changed, 115 deletions(-) diff --git a/src/hashstore/hashstoreclient.py b/src/hashstore/hashstoreclient.py index 94292957..9575a908 100644 --- a/src/hashstore/hashstoreclient.py +++ b/src/hashstore/hashstoreclient.py @@ -156,12 +156,6 @@ def __init__(self): action="store_true", help="Flag to get the hex digest of a data object in HashStore", ) - self.parser.add_argument( - "-findobject", - dest="client_findobject", - action="store_true", - help="Flag to determine if an object is stored in HashStore", - ) self.parser.add_argument( "-storeobject", dest="client_storeobject", @@ -846,22 +840,6 @@ def main(): print(f"algorithm: {algorithm}") print(f"Checksum/Hex Digest: {digest}") - elif getattr(args, 
"client_findobject"): - if pid is None: - raise ValueError("'-pid' option is required") - # Find the content identifier of the object - object_info_dict = hashstore_c.hashstore.find_object(pid) - cid = object_info_dict.get("cid") - cid_object_path = object_info_dict.get("cid_object_path") - cid_refs_path = object_info_dict.get("cid_refs_path") - pid_refs_path = object_info_dict.get("pid_refs_path") - sysmeta_path = object_info_dict.get("sysmeta_path") - print(f"Content identifier:\n{cid}") - print(f"Cid Object Path:\n:{cid_object_path}") - print(f"Cid Reference File Path:\n:{cid_refs_path}") - print(f"Pid Reference File Path:\n:{pid_refs_path}") - print(f"Sysmeta Path:\n:{sysmeta_path}") - elif getattr(args, "client_storeobject"): if pid is None: raise ValueError("'-pid' option is required") diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 4038decc..7e13f37a 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -80,99 +80,6 @@ def test_get_checksum(capsys, store, pids): assert capsystext == expected_output -def test_find_object_sysmeta_does_not_exist(capsys, store, pids): - """Test client's find_object prints the required values when sysmeta does not exist.""" - client_directory = os.getcwd() + "/src/hashstore" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - cid = object_metadata.cid - - client_module_path = f"{client_directory}/client.py" - test_store = store.root - find_object_opt = "-findobject" - client_pid_arg = f"-pid={pid}" - chs_args = [ - client_module_path, - test_store, - find_object_opt, - client_pid_arg, - ] - - # Add file path of HashStore to sys so modules can be discovered - sys.path.append(client_directory) - # Manually change sys args to simulate command line arguments - sys.argv = chs_args - hashstoreclient.main() - - object_info_dict = store.find_object(pid) - cid = 
object_info_dict.get("cid") - cid_object_path = object_info_dict.get("cid_object_path") - cid_refs_path = object_info_dict.get("cid_refs_path") - pid_refs_path = object_info_dict.get("pid_refs_path") - sysmeta_path = object_info_dict.get("sysmeta_path") - - capsystext = capsys.readouterr().out - expected_output = ( - f"Content identifier:\n{cid}\n" - + f"Cid Object Path:\n:{cid_object_path}\n" - + f"Cid Reference File Path:\n:{cid_refs_path}\n" - + f"Pid Reference File Path:\n:{pid_refs_path}\n" - + f"Sysmeta Path:\n:{sysmeta_path}\n" - ) - assert capsystext == expected_output - - -def test_find_object_sysmeta_exists(capsys, store, pids): - """Test client's find_object prints the required values when sysmeta exists""" - client_directory = os.getcwd() + "/src/hashstore" - test_dir = "tests/testdata/" - format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - cid = object_metadata.cid - - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename - store.store_metadata(pid, syspath, format_id) - - client_module_path = f"{client_directory}/client.py" - test_store = store.root - find_object_opt = "-findobject" - client_pid_arg = f"-pid={pid}" - chs_args = [ - client_module_path, - test_store, - find_object_opt, - client_pid_arg, - ] - - # Add file path of HashStore to sys so modules can be discovered - sys.path.append(client_directory) - # Manually change sys args to simulate command line arguments - sys.argv = chs_args - hashstoreclient.main() - - object_info_dict = store.find_object(pid) - cid = object_info_dict.get("cid") - cid_object_path = object_info_dict.get("cid_object_path") - cid_refs_path = object_info_dict.get("cid_refs_path") - pid_refs_path = object_info_dict.get("pid_refs_path") - sysmeta_path = object_info_dict.get("sysmeta_path") - - capsystext = capsys.readouterr().out - expected_output = ( - 
f"Content identifier:\n{cid}\n" - + f"Cid Object Path:\n:{cid_object_path}\n" - + f"Cid Reference File Path:\n:{cid_refs_path}\n" - + f"Pid Reference File Path:\n:{pid_refs_path}\n" - + f"Sysmeta Path:\n:{sysmeta_path}\n" - ) - assert capsystext == expected_output - - def test_store_object(store, pids): """Test storing objects to HashStore through client.""" client_directory = os.getcwd() + "/src/hashstore" From 91388924af65c417d010b87f3dd58b0f827f3a23 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 10 Sep 2024 11:11:05 -0700 Subject: [PATCH 307/420] Update 'README.md' --- README.md | 325 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 198 insertions(+), 127 deletions(-) diff --git a/README.md b/README.md index 84da9487..9203f171 100644 --- a/README.md +++ b/README.md @@ -7,24 +7,41 @@ - Contact us: support@dataone.org - [DataONE discussions](https://github.com/DataONEorg/dataone/discussions) -HashStore is a server-side python package providing persistent file-based storage using content hashes to de-duplicate data for storing and accessing data and metadata for DataONE services. The package is used in DataONE system components that need direct, filesystem-based access to data objects, their system metadata, and extended metadata about the objects. This package is a core component of the [DataONE federation](https://dataone.org), and supports large-scale object storage for a variety of repositories, including the [KNB Data Repository](http://knb.ecoinformatics.org), the [NSF Arctic Data Center](https://arcticdata.io/catalog/), the [DataONE search service](https://search.dataone.org), and other repositories. - -DataONE in general, and HashStore in particular, are open source, community projects. We [welcome contributions](https://github.com/DataONEorg/hashstore/blob/main/CONTRIBUTING.md) in many forms, including code, graphics, documentation, bug reports, testing, etc. 
Use the [DataONE discussions](https://github.com/DataONEorg/dataone/discussions) to discuss these contributions with us. - +HashStore is a server-side python package that implements an object storage file system for storing +and accessing data and metadata for DataONE services. The package is used in DataONE system +components that need direct, filesystem-based access to data objects, their system metadata, and +extended metadata about the objects. This package is a core component of +the [DataONE federation](https://dataone.org), and supports large-scale object storage for a variety +of repositories, including the [KNB Data Repository](http://knb.ecoinformatics.org), +the [NSF Arctic Data Center](https://arcticdata.io/catalog/), +the [DataONE search service](https://search.dataone.org), and other repositories. + +DataONE in general, and HashStore in particular, are open source, community projects. +We [welcome contributions](https://github.com/DataONEorg/hashstore/blob/main/CONTRIBUTING.md) in +many forms, including code, graphics, documentation, bug reports, testing, etc. Use +the [DataONE discussions](https://github.com/DataONEorg/dataone/discussions) to discuss these +contributions with us. ## Documentation -Documentation is a work in progress, and can be found on the [Metacat repository](https://github.com/NCEAS/metacat/blob/feature-1436-storage-and-indexing/docs/user/metacat/source/storage-subsystem.rst#physical-file-layout) as part of the storage redesign planning. Future updates will include documentation here as the package matures. +Documentation is a work in progress, and can be found on +the [Metacat repository](https://github.com/NCEAS/metacat/blob/feature-1436-storage-and-indexing/docs/user/metacat/source/storage-subsystem.rst#physical-file-layout) +as part of the storage redesign planning. Future updates will include documentation here as the +package matures. 
## HashStore Overview -HashStore is an object storage system that stores data objects based on the their content identifiers. The system stores objects, references (refs) and metadata in its respective directories and provides a content identifier-based API for interacting with the HashStore. HashStore storage classes (like `FileHashStore`) must implement the HashStore interface to ensure the expected usage of HashStore. +HashStore is an object storage system that provides persistent file-based storage using content +hashes to de-duplicate data. The system stores both objects, references (refs) and metadata in its +respective directories and utilizes an identifier-based API for interacting with the store. +HashStore storage classes (like `filehashstore`) must implement the HashStore interface to ensure +the expected usage of HashStore. + +### Public API Methods -###### Public API Methods - store_object - verify_object - tag_object -- find_object - store_metadata - retrieve_object - retrieve_metadata @@ -32,12 +49,14 @@ HashStore is an object storage system that stores data objects based on the thei - delete_metadata - get_hex_digest -For details, please see the HashStore interface (hashstore.py) +For details, please see the HashStore +interface [hashstore.py](https://github.com/DataONEorg/hashstore/blob/main/src/hashstore/hashstore.py) +### How do I create a HashStore? -###### How do I create a HashStore? 
+To create or interact with a HashStore, instantiate a HashStore object with the following set of +properties: -To create or interact with a HashStore, instantiate a HashStore object with the following set of properties: - store_path - store_depth - store_width @@ -62,173 +81,220 @@ properties = { # Get HashStore from factory module_name = "hashstore.filehashstore" class_name = "FileHashStore" -my_store = hashstore_factory.get_hashstore(module_name, class_name, properties) +hashstore = hashstore_factory.get_hashstore(module_name, class_name, properties) # Store objects (.../[hashstore_path]/objects/) pid = "j.tao.1700.1" object = "/path/to/your/object.data" -object_metadata = my_store.store_object(pid, object) +object_metadata = hashstore.store_object(pid, object) object_cid = object_metadata.cid # Store metadata (.../[hashstore_path]/metadata/) # By default, storing metadata will use the given properties namespace `format_id` pid = "j.tao.1700.1" sysmeta = "/path/to/your/sysmeta/document.xml" -metadata_cid = my_store.store_metadata(pid, sysmeta) +metadata_cid = hashstore.store_metadata(pid, sysmeta) # If you want to store other types of metadata, add an additional `format_id`. pid = "j.tao.1700.1" metadata = "/path/to/your/metadata/document.json" format_id = "http://custom.metadata.com/json/type/v1.0" -metadata_cid = my_store.store_metadata(pid, metadata, format_id) +metadata_cid = hashstore.store_metadata(pid, metadata, format_id) # ... ``` -###### Working with objects (store, retrieve, delete) +### What does HashStore look like? -In HashStore, objects are first saved as temporary files while their content identifiers are calculated. Once the default hash algorithm list and their hashes are generated, objects are stored in their permanent location using the store's algorithm's corresponding hash value, the store depth and the store width. Lastly, reference files are created for the object so that they can be found and retrieved given an identifier (ex. 
persistent identifier (pid)). Note: Objects are also stored once and only once. +```sh +# Example layout in HashStore with a single file stored along with its metadata and reference files. +# This uses a store depth of 3 (number of nested levels/directories - e.g. '/4d/19/81/' within +# 'objects', see below), with a width of 2 (number of characters used in directory name - e.g. "4d", +# "19" etc.) and "SHA-256" as its default store algorithm +## Notes: +## - Objects are stored using their content identifier as the file address +## - The reference file for each pid contains a single cid +## - The reference file for each cid contains multiple pids each on its own line +## - There are two metadata docs under the metadata directory for the pid (sysmeta, annotations) + +.../metacat/hashstore +├── hashstore.yaml +└── objects +| └── 4d +| └── 19 +| └── 81 +| └── 71eef969d553d4c9537b1811a7b078f9a3804fc978a761bc014c05972c +└── metadata +| └── 0d +| └── 55 +| └── 55 +| └── 5ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e +| └── 323e0799524cec4c7e14d31289cefd884b563b5c052f154a066de5ec1e477da7 +| └── sha256(pid+formatId_annotations) +└── refs + ├── cids + | └── 4d + | └── 19 + | └── 81 + | └── 71eef969d553d4c9537b1811a7b078f9a3804fc978a761bc014c05972c + └── pids + └── 0d + └── 55 + └── 55 + └── 5ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e +``` -By calling the various interface methods for `store_object`, the calling app/client can validate, store and tag an object simultaneously if the relevant data is available. In the absence of an identifier (ex. persistent identifier (pid)), `store_object` can be called to solely store an object. The client is then expected to call `verify_object` when the relevant metadata is available to confirm that the object has been stored as expected. If the object is determined to be invalid (via `verify_object`), the client is expected to delete the object directly. 
Lastly, to finalize this process of storing an object (to make the object discoverable), the client calls `tag_object`. In summary, there are two expected paths to store an object: +

+### Working with objects (store, retrieve, delete)
+
+In HashStore, objects are first saved as temporary files while their content identifiers are
+calculated. Once the default hash algorithm list and their hashes are generated, objects are stored
+in their permanent location using the store's algorithm's corresponding hash value, the store depth
+and the store width. Lastly, objects are 'tagged' with a given identifier (ex. persistent
+identifier (pid)). This process produces reference files, which allow objects to be found and
+retrieved with a given identifier.
+
+- Note 1: An identifier can only be used once
+- Note 2: Each object is stored once and only once using its content identifier (a checksum
+  generated
+  from using a hashing algorithm). Clients that attempt to store duplicate objects will receive
+  the expected ObjectMetadata - with HashStore handling the de-duplication process under the hood.
+
+By calling the various interface methods for `store_object`, the calling app/client can validate,
+store and tag an object simultaneously if the relevant data is available. In the absence of an
+identifier (ex. persistent identifier (pid)), `store_object` can be called to solely store an
+object. The client is then expected to call `verify_object` when the relevant metadata is available
+to confirm that the object has been stored as expected. The client is then expected to call
+`delete_if_invalid_object` when the relevant metadata is available to confirm that the object is
+what is expected. And to finalize the process (to make the object discoverable), the client
+calls `tag_object`. 
In summary, there are two expected paths to store an object: ```py +import io +from hashstore import HashStoreFactory + +# Instantiate a factory +hashstore_factory = HashStoreFactory() + +# Create a properties dictionary with the required fields +properties = { + "store_path": "/path/to/your/store", + "store_depth": 3, + "store_width": 2, + "store_algorithm": "SHA-256", + "store_metadata_namespace": "https://ns.dataone.org/service/types/v2.0#SystemMetadata", +} + +# Get HashStore from factory +module_name = "hashstore.filehashstore" +class_name = "FileHashStore" +hashstore = hashstore_factory.get_hashstore(module_name, class_name, properties) + +additional_algo = "sha224" +checksum = "sha3_224_checksum_value" +checksum_algo = "sha3_224" +obj_size = 123456 +path = "/path/to/dou.test.1" +input_stream = io.open(path, "rb") +pid = "dou.test.1" # All-in-one process which stores, validates and tags an object -objectMetadata objInfo = store_object(stream, pid, additional_algo, checksum, checksum_algo, objSize) +obj_info_allinone = hashstore.store_object(input_stream, pid, additional_algo, checksum, + checksum_algo, obj_size) # Manual Process # Store object -obj_metadata = store_object(stream) +obj_info_manual = hashstore.store_object(input_stream) # Validate object, throws exceptions if there is a mismatch and deletes the associated file -verify_object(obj_metadata, checksum, checksumAlgorithn, objSize) +hashstore.verify_object(obj_info_manual, checksum, checksum_algo, obj_size) # Tag object, makes the object discoverable (find, retrieve, delete) -tag_object(pid, cid) +hashstore.tag_object(pid, obj_info_manual.cid) ``` **How do I retrieve an object if I have the pid?** -- To retrieve an object, call the Public API method `retrieve_object` which opens a stream to the object if it exists. 
-**How do I find an object or check that it exists if I have the pid?** -- To check if an object exists, call the Public API method `find_object` which will return the content identifier (cid) of the object if it exists. -- If desired, this cid can then be used to locate the object on disk by following HashStore's store configuration. +- To retrieve an object, call the Public API method `retrieve_object` which opens a stream to the + object if it exists. **How do I delete an object if I have the pid?** -- To delete an object and all its associated reference files, call the Public API method `delete_object` with `id_type` 'pid'. -- To delete only an object, call `delete_object` with `id_type` 'cid' which will remove the object if it is not referenced by any pids. -- Note, `delete_object` and `store_object` are synchronized based on a given 'pid'. An object that is in the process of being stored based on a pid should not be deleted at the same time. Additionally, `delete_object` further synchronizes with `tag_object` based on a `cid`. Every object is stored once, is unique and shares one cid reference file. The API calls to access this cid reference file must be coordinated to prevent file system locking exceptions. +- To delete an object and all its associated reference files, call the Public API + method `delete_object`. +- Note, `delete_object` and `store_object` are synchronized based on a given 'pid'. An object that + is in the process of being stored based on a pid should not be deleted at the same time. + Additionally, `delete_object` further synchronizes with `tag_object` based on a `cid`. Every + object is stored once, is unique and shares one cid reference file. The API calls to access this + cid reference file must be coordinated to prevent file system locking exceptions. ###### Working with metadata (store, retrieve, delete) -HashStore's '/metadata' directory holds all metadata for objects stored in HashStore. 
To differentiate between metadata documents for a given object, HashStore includes the 'format_id' (format or namespace of the metadata) when generating the address of the metadata document to store (the hash of the 'pid' + 'format_id'). By default, calling `store_metadata` will use HashStore's default metadata namespace as the 'format_id' when storing metadata. Should the calling app wish to store multiple metadata files about an object, the client app is expected to provide a 'format_id' that represents an object format for the metadata type (ex. `store_metadata(stream, pid, format_id)`). +HashStore's '/metadata' directory holds all metadata for objects stored in HashStore. To +differentiate between metadata documents for a given object, HashStore includes the 'format_id' ( +format or namespace of the metadata) when generating the address of the metadata document to store ( +the hash of the 'pid' + 'format_id'). By default, calling `store_metadata` will use HashStore's +default metadata namespace as the 'format_id' when storing metadata. Should the calling app wish to +store multiple metadata files about an object, the client app is expected to provide a 'format_id' +that represents an object format for the metadata type ( +ex. `store_metadata(stream, pid, format_id)`). **How do I retrieve a metadata file?** -- To find a metadata object, call the Public API method `retrieve_metadata` which returns a stream to the metadata file that's been stored with the default metadata namespace if it exists. -- If there are multiple metadata objects, a 'format_id' must be specified when calling `retrieve_metadata` (ex. `retrieve_metadata(pid, format_id)`) + +- To find a metadata object, call the Public API method `retrieve_metadata` which returns a stream + to the metadata file that's been stored with the default metadata namespace if it exists. +- If there are multiple metadata objects, a 'format_id' must be specified when + calling `retrieve_metadata` (ex. 
`retrieve_metadata(pid, format_id)`) **How do I delete a metadata file?** -- Like `retrieve_metadata`, call the Public API method `delete_metadata` to delete all metadata documents associated with the given pid. -- If there are multiple metadata objects, and you wish to only delete one type, a 'format_id' must be specified when calling `delete_metadata(pid, format_id)` to ensure the expected metadata object is deleted. +- Like `retrieve_metadata`, call the Public API method `delete_metadata` to delete all metadata + documents associated with the given pid. +- If there are multiple metadata objects, and you wish to only delete one type, a 'format_id' must + be specified when calling `delete_metadata(pid, format_id)` to ensure the expected metadata object + is deleted. + +### What are HashStore reference files? -###### What are HashStore reference files? +HashStore assumes that every object to store has a respective identifier. This identifier is then +used when storing, retrieving and deleting an object. In order to facilitate this process, we create +two types of reference files: -HashStore assumes that every object to store has a respective identifier. This identifier is then used when storing, retrieving and deleting an object. In order to facilitate this process, we create two types of reference files: -- pid (persistent identifier) reference files +- pid (persistent identifier) reference files - cid (content identifier) reference files -These reference files are implemented in HashStore underneath the hood with no expectation for modification from the calling app/client. The one and only exception to this process when the calling client/app does not have an identifier, and solely stores an objects raw bytes in HashStore (calling `store_object(stream)`). +These reference files are implemented in HashStore underneath the hood with no expectation for +modification from the calling app/client. 
The one and only exception to this process is when the
+calling client/app does not have an identifier, and solely stores an object's raw bytes in
+HashStore (calling `store_object(stream)`).

 **'pid' Reference Files**
+
- Pid (persistent identifier) reference files are created when storing an object with an identifier.
- Pid reference files are located in HashStores '/refs/pids' directory
-- If an identifier is not available at the time of storing an object, the calling app/client must create this association between a pid and the object it represents by calling `tag_object` separately.
-- Each pid reference file contains a string that represents the content identifier of the object it references
-- Like how objects are stored once and only once, there is also only one pid reference file for each object.
+- If an identifier is not available at the time of storing an object, the calling app/client must
+  create this association between a pid and the object it represents by calling `tag_object`
+  separately.
+- Each pid reference file contains a string that represents the content identifier of the object it
+  references
+- Like how objects are stored once and only once, there is also only one pid reference file for each
+  object.

**'cid' Reference Files**
-- Cid (content identifier) reference files are created at the same time as pid reference files when storing an object with an identifier.
-- Cid reference files are located in HashStore's '/refs/cids' directory
-- A cid reference file is a list of all the pids that reference a cid, delimited by a new line ("\n") character
-
-
-###### What does HashStore look like?
-```shell
-# Example layout in HashStore with three files stored along with its metadata and reference files. 
-# This uses a store depth of 3, with a width of 2 and "SHA-256" as its default store algorithm -## Notes: -## - Objects are stored using their content identifier as the file address -## - The reference file for each pid contains a single cid -## - The reference file for each cid contains multiple pids each on its own line - -.../metacat/hashstore/ - ├── hashstore.yaml - ├── objects - | ├── 4d - | │ └── 19 - | │ └── 81 - | | └── 71eef969d553d4c9537b1811a7b078f9a3804fc978a761bc014c05972c - | ├── 94 - | │ └── f9 - | │ └── b6 - | | └── c88f1f458e410c30c351c6384ea42ac1b5ee1f8430d3e365e43b78a38a - | └── 44 - | └── 73 - | └── 51 - | └── 6a592209cbcd3a7ba4edeebbdb374ee8e4a49d19896fafb8f278dc25fa - └── metadata - | ├── 0d - | │ └── 55 - | │ └── 55 - | | └── 5ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e - | | └── 323e0799524cec4c7e14d31289cefd884b563b5c052f154a066de5ec1e477da7 - | | └── sha256(pid+formatId_annotations) - | ├── a8 - | │ └── 24 - | │ └── 19 - | | └── 25740d5dcd719596639e780e0a090c9d55a5d0372b0eaf55ed711d4edf - | | └── ddf07952ef28efc099d10d8b682480f7d2da60015f5d8873b6e1ea75b4baf689 - | | └── sha256(pid+formatId_annotations) - | └── 7f - | └── 5c - | └── c1 - | └── 8f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6 - | └── 9a2e08c666b728e6cbd04d247b9e556df3de5b2ca49f7c5a24868eb27cddbff2 - | └── sha256(pid+formatId_annotations) - └── refs - ├── cids - | ├── 4d - | | └── 19 - | | └── 81 - | | └── 71eef969d553d4c9537b1811a7b078f9a3804fc978a761bc014c05972c - | ├── 94 - | │ └── f9 - | │ └── b6 - | | └── c88f1f458e410c30c351c6384ea42ac1b5ee1f8430d3e365e43b78a38a - | └── 44 - | └── 73 - | └── 51 - | └── 6a592209cbcd3a7ba4edeebbdb374ee8e4a49d19896fafb8f278dc25fa - └── pids - ├── 0d - | └── 55 - | └── 55 - | └── 5ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e - ├── a8 - │ └── 24 - │ └── 19 - | └── 25740d5dcd719596639e780e0a090c9d55a5d0372b0eaf55ed711d4edf - └── 7f - └── 5c - └── c1 - └── 
8f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6 -``` +- Cid (content identifier) reference files are created at the same time as pid reference files when + storing an object with an identifier. +- Cid reference files are located in HashStore's '/refs/cids' directory +- A cid reference file is a list of all the pids that reference a cid, delimited by a new line (" + \n") character ## Concurrency in HashStore -HashStore is both thread and process safe, and by default synchronizes calls to store & delete objects/metadata with Python's threading module. If you wish to use multiprocessing to parallelize your application, please declare a global environment variable `USE_MULTIPROCESSING` as `True` before initializing Hashstore. This will direct the relevant Public API calls to synchronize using the Python `multiprocessing` module's locks and conditions. Please see below for example: +HashStore is both thread and process safe, and by default synchronizes calls to store & delete +objects/metadata with Python's threading module. If you wish to use multiprocessing to parallelize +your application, please declare a global environment variable `USE_MULTIPROCESSING` as `True` +before initializing Hashstore. This will direct the relevant Public API calls to synchronize using +the Python `multiprocessing` module's locks and conditions. Please see below for example: ```py +import os + # Set the global environment variable os.environ["USE_MULTIPROCESSING"] = "True" @@ -236,13 +302,14 @@ os.environ["USE_MULTIPROCESSING"] = "True" use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" ``` - ## Development build -HashStore is a python package, and built using the [Python Poetry](https://python-poetry.org) build tool. +HashStore is a python package, and built using the [Python Poetry](https://python-poetry.org) +build tool. 
-To install `hashstore` locally, create a virtual environment for python 3.9+, -install poetry, and then install or build the package with `poetry install` or `poetry build`, respectively. +To install `hashstore` locally, create a virtual environment for python 3.9+, +install poetry, and then install or build the package with `poetry install` or `poetry build`, +respectively. To run tests, navigate to the root directory and run `pytest -s`. The test suite contains tests that take a longer time to run (relating to the storage of large files) - to execute all tests, run @@ -251,6 +318,7 @@ take a longer time to run (relating to the storage of large files) - to execute ## HashStore Client Client API Options: + - `-getchecksum` (get_hex_digest) - `-findobject` - `-storeobject` @@ -261,6 +329,7 @@ Client API Options: - `-deletemetadata` How to use HashStore client (command line app) + ```sh # Step 0: Install hashstore via poetry to create an executable script $ poetry install @@ -271,9 +340,6 @@ $ hashstore /path/to/store/ -chs -dp=3 -wp=2 -ap=SHA-256 -nsp="http://www.ns.tes # Get the checksum of a data object $ hashstore /path/to/store/ -getchecksum -pid=persistent_identifier -algo=SHA-256 -# Find an object (returns the content identifier) -$ hashstore /path/to/store/ -findobject -pid=persistent_identifier - # Store a data object $ hashstore /path/to/store/ -storeobject -pid=persistent_identifier -path=/path/to/object @@ -294,6 +360,7 @@ $ hashstore /path/to/store/ -deletemetadata -pid=persistent_identifier -formatid ``` ## License + ``` Copyright [2022] [Regents of the University of California] @@ -311,12 +378,16 @@ limitations under the License. ``` ## Acknowledgements + Work on this package was supported by: - DataONE Network -- Arctic Data Center: NSF-PLR grant #2042102 to M. B. Jones, A. Budden, M. Schildhauer, and J. Dozier +- Arctic Data Center: NSF-PLR grant #2042102 to M. B. Jones, A. Budden, M. Schildhauer, and J. 
+ Dozier -Additional support was provided for collaboration by the National Center for Ecological Analysis and Synthesis, a Center funded by the University of California, Santa Barbara, and the State of California. +Additional support was provided for collaboration by the National Center for Ecological Analysis and +Synthesis, a Center funded by the University of California, Santa Barbara, and the State of +California. [![DataONE_footer](https://user-images.githubusercontent.com/6643222/162324180-b5cf0f5f-ae7a-4ca6-87c3-9733a2590634.png)](https://dataone.org) From 32b9db8edab00e79948dec48545eb4612c75756e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 10 Sep 2024 14:04:05 -0700 Subject: [PATCH 308/420] Fix formatting inconsistencies caused due to pycharm settings --- src/hashstore/filehashstore.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index def0141a..721ff913 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -420,9 +420,9 @@ def _validate_properties(self, properties): checked_properties[key] = int(value) except Exception as err: exception_string = ( - "FileHashStore - _validate_properties: Unexpected exception when" - " attempting to ensure store depth and width are integers. Details: " - + str(err) + "FileHashStore - _validate_properties: Unexpected exception when" + " attempting to ensure store depth and width are integers. 
Details: " + + str(err) ) logging.debug(exception_string) raise ValueError(exception_string) @@ -1392,9 +1392,9 @@ def find_object(self, pid): # Object must also exist in order to return the cid retrieved if not self._exists("objects", pid_refs_cid): err_msg = ( - f"FileHashStore - find_object: Refs file found for pid ({pid}) at" - + pid_ref_abs_path - + f", but object referenced does not exist, cid: {pid_refs_cid}" + f"FileHashStore - find_object: Refs file found for pid ({pid}) at" + + pid_ref_abs_path + + f", but object referenced does not exist, cid: {pid_refs_cid}" ) logging.error(err_msg) raise RefsFileExistsButCidObjMissing(err_msg) @@ -1403,9 +1403,9 @@ def find_object(self, pid): metadata_directory = self._computehash(pid) metadata_rel_path = "/".join(self._shard(metadata_directory)) sysmeta_full_path = ( - self._get_store_path("metadata") - / metadata_rel_path - / sysmeta_doc_name + self._get_store_path("metadata") + / metadata_rel_path + / sysmeta_doc_name ) obj_info_dict = { "cid": pid_refs_cid, @@ -1424,23 +1424,23 @@ def find_object(self, pid): else: # If not, it is an orphan pid refs file err_msg = ( - "FileHashStore - find_object: pid refs file exists with cid: " - + f"{pid_refs_cid} for pid: {pid}" - + f", but is missing from cid refs file: {cid_ref_abs_path}" + "FileHashStore - find_object: pid refs file exists with cid: " + + f"{pid_refs_cid} for pid: {pid}" + + f", but is missing from cid refs file: {cid_ref_abs_path}" ) logging.error(err_msg) raise PidNotFoundInCidRefsFile(err_msg) else: err_msg = ( - f"FileHashStore - find_object: pid refs file exists with cid: {pid_refs_cid}" - + f", but cid refs file not found: {cid_ref_abs_path} for pid: {pid}" + f"FileHashStore - find_object: pid refs file exists with cid: {pid_refs_cid}" + + f", but cid refs file not found: {cid_ref_abs_path} for pid: {pid}" ) logging.error(err_msg) raise CidRefsDoesNotExist(err_msg) else: err_msg = ( - f"FileHashStore - find_object: pid refs file not found for pid 
({pid}): " - + pid_ref_abs_path + f"FileHashStore - find_object: pid refs file not found for pid ({pid}): " + + pid_ref_abs_path ) logging.error(err_msg) raise PidRefsDoesNotExist(err_msg) From 7cda167d9604be5ec0d98dcf6aa9461b24b81c2b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 10 Sep 2024 14:28:28 -0700 Subject: [PATCH 309/420] Update 'delete_object' in hashstore interface to only accept a pid --- src/hashstore/hashstore.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 0942f14f..386d38ea 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -139,15 +139,13 @@ def retrieve_metadata(self, pid, format_id): raise NotImplementedError() @abstractmethod - def delete_object(self, ab_id, id_type): + def delete_object(self, pid): """Deletes an object and its related data permanently from HashStore using a given - persistent identifier. If the `id_type` is 'pid', the object associated with the pid will - be deleted if it is not referenced by any other pids, along with its reference files and - all metadata documents found in its respective metadata directory. If the `id_type` is - 'cid', only the object will be deleted if it is not referenced by other pids. + persistent identifier. The object associated with the pid will be deleted if it is not + referenced by any other pids, along with its reference files and all metadata documents + found in its respective metadata directory. - :param str ab_id: Authority-based identifier. - :param str id_type: "pid" or "cid" + :param str pid: Persistent or Authority-based identifier. 
""" raise NotImplementedError() From 2d674a4fced9d1d3c97d4eb627a6d33d0a1068a6 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 10 Sep 2024 14:29:45 -0700 Subject: [PATCH 310/420] Refactor 'delete_object' by extracting method 'delete_object_only' and update signature to match the updated interface --- src/hashstore/filehashstore.py | 366 ++++++++++++++++----------------- 1 file changed, 182 insertions(+), 184 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 721ff913..01eda731 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -897,22 +897,57 @@ def retrieve_metadata(self, pid, format_id=None): logging.error(exception_string) raise ValueError(exception_string) - def delete_object(self, ab_id, id_type=None): + def delete_object(self, pid): logging.debug( - "FileHashStore - delete_object: Request to delete object for id: %s", ab_id + "FileHashStore - delete_object: Request to delete object for id: %s", pid ) - self._check_string(ab_id, "ab_id") + self._check_string(pid, "pid") + + objects_to_delete = [] + + # Storing and deleting objects are synchronized together + # Duplicate store object requests for a pid are rejected, but deleting an object + # will wait for a pid to be released if it's found to be in use before proceeding. + sync_begin_debug_msg = ( + f"FileHashStore - delete_object: Pid ({pid}) to locked list." + ) + sync_wait_msg = ( + f"FileHashStore - delete_object: Pid ({pid}) is locked. Waiting." 
+ ) + if self.use_multiprocessing: + with self.object_condition_mp: + # Wait for the pid to release if it's in use + while pid in self.object_locked_pids_mp: + logging.debug(sync_wait_msg) + self.object_condition_mp.wait() + # Modify object_locked_pids consecutively + logging.debug(sync_begin_debug_msg) + self.object_locked_pids_mp.append(pid) + else: + with self.object_condition: + while pid in self.object_locked_pids: + logging.debug(sync_wait_msg) + self.object_condition.wait() + logging.debug(sync_begin_debug_msg) + self.object_locked_pids.append(pid) + + try: + # Before we begin deletion process, we look for the `cid` by calling + # `find_object` which will throw custom exceptions if there is an issue with + # the reference files, which help us determine the path to proceed with. + try: + object_info_dict = self.find_object(pid) + cid = object_info_dict.get("cid") - if id_type == "cid": - cid_refs_abs_path = self._resolve_path("cid", ab_id) - # If the refs file still exists, do not delete the object - if not os.path.exists(cid_refs_abs_path): - cid = ab_id + # Proceed with next steps - cid has been retrieved without any issues + # We must synchronize here based on the `cid` because multiple threads may + # try to access the `cid_reference_file` sync_begin_debug_msg = ( f"FileHashStore - delete_object: Cid ({cid}) to locked list." ) sync_wait_msg = ( - f"FileHashStore - delete_object: Cid ({cid}) is locked. Waiting." + f"FileHashStore - delete_object: Cid ({cid}) is locked." + + " Waiting." 
) if self.use_multiprocessing: with self.reference_condition_mp: @@ -932,7 +967,42 @@ def delete_object(self, ab_id, id_type=None): self.reference_locked_cids.append(cid) try: - self._delete("objects", cid) + cid_ref_abs_path = object_info_dict.get("cid_refs_path") + pid_ref_abs_path = object_info_dict.get("pid_refs_path") + # Add pid refs file to be permanently deleted + objects_to_delete.append( + self._rename_path_for_deletion(pid_ref_abs_path) + ) + # Remove pid from cid reference file + self._update_refs_file(cid_ref_abs_path, pid, "remove") + # Delete cid reference file and object only if the cid refs file is empty + if os.path.getsize(cid_ref_abs_path) == 0: + debug_msg = ( + "FileHashStore - delete_object: cid_refs_file is empty (size == 0):" + + f" {cid_ref_abs_path} - deleting cid refs file and data object." + ) + logging.debug(debug_msg) + objects_to_delete.append( + self._rename_path_for_deletion(cid_ref_abs_path) + ) + obj_real_path = object_info_dict.get("cid_object_path") + objects_to_delete.append( + self._rename_path_for_deletion(obj_real_path) + ) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) + + # Remove metadata files if they exist + self.delete_metadata(pid) + + info_string = ( + "FileHashStore - delete_object: Successfully deleted references," + + f" metadata and object associated with pid: {pid}" + ) + logging.info(info_string) + return + finally: # Release cid end_sync_debug_msg = ( @@ -949,201 +1019,129 @@ def delete_object(self, ab_id, id_type=None): logging.debug(end_sync_debug_msg) self.reference_locked_cids.remove(cid) self.reference_condition.notify() - else: - # id_type is "pid" - pid = ab_id - objects_to_delete = [] - # Storing and deleting objects are synchronized together - # Duplicate store object requests for a pid are rejected, but deleting an object - # will wait for a pid to be released if it's found to be in use before proceeding. 
+ except PidRefsDoesNotExist: + warn_msg = ( + "FileHashStore - delete_object: pid refs file does not exist for pid: " + + ab_id + + ". Skipping object deletion. Deleting pid metadata documents." + ) + logging.warning(warn_msg) + + # Remove metadata files if they exist + self.delete_metadata(pid) + + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) + return + except CidRefsDoesNotExist: + # Delete pid refs file + objects_to_delete.append( + self._rename_path_for_deletion(self._resolve_path("pid", pid)) + ) + # Remove metadata files if they exist + self.delete_metadata(pid) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) + return + except RefsFileExistsButCidObjMissing: + # Add pid refs file to be permanently deleted + pid_ref_abs_path = self._resolve_path("pid", pid) + objects_to_delete.append( + self._rename_path_for_deletion(pid_ref_abs_path) + ) + # Remove pid from cid refs file + with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: + # Retrieve the cid + pid_refs_cid = pid_ref_file.read() + cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) + # Remove if the pid refs is found + if self._is_string_in_refs_file(pid, cid_ref_abs_path): + self._update_refs_file(cid_ref_abs_path, pid, "remove") + # Remove metadata files if they exist + self.delete_metadata(pid) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) + return + except PidNotFoundInCidRefsFile: + # Add pid refs file to be permanently deleted + pid_ref_abs_path = self._resolve_path("pid", pid) + objects_to_delete.append( + self._rename_path_for_deletion(pid_ref_abs_path) + ) + # Remove metadata files if they exist + self.delete_metadata(pid) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) + return + finally: + # Release pid + end_sync_debug_msg = ( + f"FileHashStore - delete_object: Releasing pid ({pid})" + + " 
from locked list" + ) + if self.use_multiprocessing: + with self.object_condition_mp: + logging.debug(end_sync_debug_msg) + self.object_locked_pids_mp.remove(pid) + self.object_condition_mp.notify() + else: + # Release pid + with self.object_condition: + logging.debug(end_sync_debug_msg) + self.object_locked_pids.remove(pid) + self.object_condition.notify() + + def delete_object_only(self, ab_id): + cid_refs_abs_path = self._resolve_path("cid", ab_id) + # If the refs file still exists, do not delete the object + if not os.path.exists(cid_refs_abs_path): + cid = ab_id sync_begin_debug_msg = ( - f"FileHashStore - delete_object: Pid ({pid}) to locked list." + f"FileHashStore - delete_object: Cid ({cid}) to locked list." ) sync_wait_msg = ( - f"FileHashStore - delete_object: Pid ({pid}) is locked. Waiting." + f"FileHashStore - delete_object: Cid ({cid}) is locked. Waiting." ) if self.use_multiprocessing: - with self.object_condition_mp: - # Wait for the pid to release if it's in use - while pid in self.object_locked_pids_mp: + with self.reference_condition_mp: + # Wait for the cid to release if it's in use + while cid in self.reference_locked_cids_mp: logging.debug(sync_wait_msg) - self.object_condition_mp.wait() - # Modify object_locked_pids consecutively + self.reference_condition_mp.wait() + # Modify reference_locked_cids consecutively logging.debug(sync_begin_debug_msg) - self.object_locked_pids_mp.append(pid) + self.reference_locked_cids_mp.append(cid) else: - with self.object_condition: - while pid in self.object_locked_pids: + with self.reference_condition: + while cid in self.reference_locked_cids: logging.debug(sync_wait_msg) - self.object_condition.wait() + self.reference_condition.wait() logging.debug(sync_begin_debug_msg) - self.object_locked_pids.append(pid) + self.reference_locked_cids.append(cid) try: - # Before we begin deletion process, we look for the `cid` by calling - # `find_object` which will throw custom exceptions if there is an issue with - # 
the reference files, which help us determine the path to proceed with. - try: - object_info_dict = self.find_object(pid) - cid = object_info_dict.get("cid") - - # Proceed with next steps - cid has been retrieved without any issues - # We must synchronize here based on the `cid` because multiple threads may - # try to access the `cid_reference_file` - sync_begin_debug_msg = ( - f"FileHashStore - delete_object: Cid ({cid}) to locked list." - ) - sync_wait_msg = ( - f"FileHashStore - delete_object: Cid ({cid}) is locked." - + " Waiting." - ) - if self.use_multiprocessing: - with self.reference_condition_mp: - # Wait for the cid to release if it's in use - while cid in self.reference_locked_cids_mp: - logging.debug(sync_wait_msg) - self.reference_condition_mp.wait() - # Modify reference_locked_cids consecutively - logging.debug(sync_begin_debug_msg) - self.reference_locked_cids_mp.append(cid) - else: - with self.reference_condition: - while cid in self.reference_locked_cids: - logging.debug(sync_wait_msg) - self.reference_condition.wait() - logging.debug(sync_begin_debug_msg) - self.reference_locked_cids.append(cid) - - try: - cid_ref_abs_path = object_info_dict.get("cid_refs_path") - pid_ref_abs_path = object_info_dict.get("pid_refs_path") - # Add pid refs file to be permanently deleted - objects_to_delete.append( - self._rename_path_for_deletion(pid_ref_abs_path) - ) - # Remove pid from cid reference file - self._update_refs_file(cid_ref_abs_path, pid, "remove") - # Delete cid reference file and object only if the cid refs file is empty - if os.path.getsize(cid_ref_abs_path) == 0: - debug_msg = ( - "FileHashStore - delete_object: cid_refs_file is empty (size == 0):" - + f" {cid_ref_abs_path} - deleting cid refs file and data object." 
- ) - logging.debug(debug_msg) - objects_to_delete.append( - self._rename_path_for_deletion(cid_ref_abs_path) - ) - obj_real_path = object_info_dict.get("cid_object_path") - objects_to_delete.append( - self._rename_path_for_deletion(obj_real_path) - ) - # Remove all files confirmed for deletion - for obj in objects_to_delete: - os.remove(obj) - - # Remove metadata files if they exist - self.delete_metadata(pid) - - info_string = ( - "FileHashStore - delete_object: Successfully deleted references," - + f" metadata and object associated with pid: {pid}" - ) - logging.info(info_string) - return - - finally: - # Release cid - end_sync_debug_msg = ( - f"FileHashStore - delete_object: Releasing cid ({cid})" - + " from locked list" - ) - if self.use_multiprocessing: - with self.reference_condition_mp: - logging.debug(end_sync_debug_msg) - self.reference_locked_cids_mp.remove(cid) - self.reference_condition_mp.notify() - else: - with self.reference_condition: - logging.debug(end_sync_debug_msg) - self.reference_locked_cids.remove(cid) - self.reference_condition.notify() - - except PidRefsDoesNotExist: - warn_msg = ( - "FileHashStore - delete_object: pid refs file does not exist for pid: " - + ab_id - + ". Skipping object deletion. Deleting pid metadata documents." 
- ) - logging.warning(warn_msg) - - # Remove metadata files if they exist - self.delete_metadata(pid) - - # Remove all files confirmed for deletion - for obj in objects_to_delete: - os.remove(obj) - return - except CidRefsDoesNotExist: - # Delete pid refs file - objects_to_delete.append( - self._rename_path_for_deletion(self._resolve_path("pid", pid)) - ) - # Remove metadata files if they exist - self.delete_metadata(pid) - # Remove all files confirmed for deletion - for obj in objects_to_delete: - os.remove(obj) - return - except RefsFileExistsButCidObjMissing: - # Add pid refs file to be permanently deleted - pid_ref_abs_path = self._resolve_path("pid", pid) - objects_to_delete.append( - self._rename_path_for_deletion(pid_ref_abs_path) - ) - # Remove pid from cid refs file - with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: - # Retrieve the cid - pid_refs_cid = pid_ref_file.read() - cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) - # Remove if the pid refs is found - if self._is_string_in_refs_file(pid, cid_ref_abs_path): - self._update_refs_file(cid_ref_abs_path, pid, "remove") - # Remove metadata files if they exist - self.delete_metadata(pid) - # Remove all files confirmed for deletion - for obj in objects_to_delete: - os.remove(obj) - return - except PidNotFoundInCidRefsFile: - # Add pid refs file to be permanently deleted - pid_ref_abs_path = self._resolve_path("pid", pid) - objects_to_delete.append( - self._rename_path_for_deletion(pid_ref_abs_path) - ) - # Remove metadata files if they exist - self.delete_metadata(pid) - # Remove all files confirmed for deletion - for obj in objects_to_delete: - os.remove(obj) - return + self._delete("objects", cid) finally: - # Release pid + # Release cid end_sync_debug_msg = ( - f"FileHashStore - delete_object: Releasing pid ({pid})" + f"FileHashStore - delete_object: Releasing cid ({cid})" + " from locked list" ) if self.use_multiprocessing: - with self.object_condition_mp: + with 
self.reference_condition_mp: logging.debug(end_sync_debug_msg) - self.object_locked_pids_mp.remove(pid) - self.object_condition_mp.notify() + self.reference_locked_cids_mp.remove(cid) + self.reference_condition_mp.notify() else: - # Release pid - with self.object_condition: + with self.reference_condition: logging.debug(end_sync_debug_msg) - self.object_locked_pids.remove(pid) - self.object_condition.notify() + self.reference_locked_cids.remove(cid) + self.reference_condition.notify() def delete_metadata(self, pid, format_id=None): logging.debug( From 1f5868db28cdcfa89ff851f8eaa27b9dfee6059b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 10 Sep 2024 14:34:22 -0700 Subject: [PATCH 311/420] Add docstring to 'delete_object_only' method and re-organize pytests --- src/hashstore/filehashstore.py | 12 ++++++++---- tests/test_filehashstore.py | 28 +++++++++++++++++++++++++++ tests/test_filehashstore_interface.py | 28 --------------------------- 3 files changed, 36 insertions(+), 32 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 01eda731..7e3f824a 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1023,7 +1023,7 @@ def delete_object(self, pid): except PidRefsDoesNotExist: warn_msg = ( "FileHashStore - delete_object: pid refs file does not exist for pid: " - + ab_id + + pid + ". Skipping object deletion. Deleting pid metadata documents." ) logging.warning(warn_msg) @@ -1096,11 +1096,15 @@ def delete_object(self, pid): self.object_locked_pids.remove(pid) self.object_condition.notify() - def delete_object_only(self, ab_id): - cid_refs_abs_path = self._resolve_path("cid", ab_id) + def delete_object_only(self, cid): + """Attempt to delete an object based on the given content identifier (cid). If the object + has any pids references and/or a cid refs file exists, the object will not be deleted. 
+ + :param str cid: Content identifier + """ + cid_refs_abs_path = self._resolve_path("cid", cid) # If the refs file still exists, do not delete the object if not os.path.exists(cid_refs_abs_path): - cid = ab_id sync_begin_debug_msg = ( f"FileHashStore - delete_object: Cid ({cid}) to locked list." ) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 68431c68..e7572280 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1023,6 +1023,34 @@ def test_open_objects(pids, store): io_buffer.close() +def test_delete_object_only(pids, store): + """Test delete_object successfully deletes only object.""" + test_dir = "tests/testdata/" + entity = "objects" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid=None, data=path) + store.delete_object_only(object_metadata.cid) + assert store._count(entity) == 0 + + +def test_delete_object_only_cid_refs_file_exists(pids, store): + """Test delete_object does not delete object if a cid refs file still exists.""" + test_dir = "tests/testdata/" + entity = "objects" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + object_metadata = store.store_object(pid, path) + _metadata_cid = store.store_metadata(pid, syspath, format_id) + store.delete_object_only(object_metadata.cid) + assert store._count(entity) == 3 + assert store._count("pid") == 3 + assert store._count("cid") == 3 + + def test_delete_with_object_metadata_id(pids, store): """Check objects are deleted after calling delete with object id.""" test_dir = "tests/testdata/" diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 06ce0816..159755a4 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -1084,34 +1084,6 @@ 
def test_delete_object_cid_refs_file_with_pid_refs_remaining(pids, store): assert os.path.exists(cid_refs_file_path) -def test_delete_object_idtype_cid(pids, store): - """Test delete_object successfully deletes only object.""" - test_dir = "tests/testdata/" - entity = "objects" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid=None, data=path) - store.delete_object(object_metadata.cid, "cid") - assert store._count(entity) == 0 - - -def test_delete_object_idtype_cid_refs_file_exists(pids, store): - """Test delete_object does not delete object if a cid refs file still exists.""" - test_dir = "tests/testdata/" - entity = "objects" - format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename - object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) - store.delete_object(object_metadata.cid, "cid") - assert store._count(entity) == 3 - assert store._count("pid") == 3 - assert store._count("cid") == 3 - - def test_delete_object_pid_empty(store): """Test delete_object raises error when empty pid supplied.""" pid = " " From dff873d65ee4c3688656e29b099752c14d1ba3bf Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 10 Sep 2024 14:36:13 -0700 Subject: [PATCH 312/420] Re-organize 'delete_object_only' method in filehashstore --- src/hashstore/filehashstore.py | 102 ++++++++++++++++----------------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 7e3f824a..bae3420e 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1096,57 +1096,6 @@ def delete_object(self, pid): self.object_locked_pids.remove(pid) self.object_condition.notify() - def delete_object_only(self, cid): - """Attempt to delete 
an object based on the given content identifier (cid). If the object - has any pids references and/or a cid refs file exists, the object will not be deleted. - - :param str cid: Content identifier - """ - cid_refs_abs_path = self._resolve_path("cid", cid) - # If the refs file still exists, do not delete the object - if not os.path.exists(cid_refs_abs_path): - sync_begin_debug_msg = ( - f"FileHashStore - delete_object: Cid ({cid}) to locked list." - ) - sync_wait_msg = ( - f"FileHashStore - delete_object: Cid ({cid}) is locked. Waiting." - ) - if self.use_multiprocessing: - with self.reference_condition_mp: - # Wait for the cid to release if it's in use - while cid in self.reference_locked_cids_mp: - logging.debug(sync_wait_msg) - self.reference_condition_mp.wait() - # Modify reference_locked_cids consecutively - logging.debug(sync_begin_debug_msg) - self.reference_locked_cids_mp.append(cid) - else: - with self.reference_condition: - while cid in self.reference_locked_cids: - logging.debug(sync_wait_msg) - self.reference_condition.wait() - logging.debug(sync_begin_debug_msg) - self.reference_locked_cids.append(cid) - - try: - self._delete("objects", cid) - finally: - # Release cid - end_sync_debug_msg = ( - f"FileHashStore - delete_object: Releasing cid ({cid})" - + " from locked list" - ) - if self.use_multiprocessing: - with self.reference_condition_mp: - logging.debug(end_sync_debug_msg) - self.reference_locked_cids_mp.remove(cid) - self.reference_condition_mp.notify() - else: - with self.reference_condition: - logging.debug(end_sync_debug_msg) - self.reference_locked_cids.remove(cid) - self.reference_condition.notify() - def delete_metadata(self, pid, format_id=None): logging.debug( "FileHashStore - delete_metadata: Request to delete metadata for pid: %s", @@ -2130,6 +2079,57 @@ def _verify_hashstore_references( logging.error(exception_string) raise CidRefsContentError(exception_string) + def delete_object_only(self, cid): + """Attempt to delete an object based 
on the given content identifier (cid). If the object + has any pids references and/or a cid refs file exists, the object will not be deleted. + + :param str cid: Content identifier + """ + cid_refs_abs_path = self._resolve_path("cid", cid) + # If the refs file still exists, do not delete the object + if not os.path.exists(cid_refs_abs_path): + sync_begin_debug_msg = ( + f"FileHashStore - delete_object: Cid ({cid}) to locked list." + ) + sync_wait_msg = ( + f"FileHashStore - delete_object: Cid ({cid}) is locked. Waiting." + ) + if self.use_multiprocessing: + with self.reference_condition_mp: + # Wait for the cid to release if it's in use + while cid in self.reference_locked_cids_mp: + logging.debug(sync_wait_msg) + self.reference_condition_mp.wait() + # Modify reference_locked_cids consecutively + logging.debug(sync_begin_debug_msg) + self.reference_locked_cids_mp.append(cid) + else: + with self.reference_condition: + while cid in self.reference_locked_cids: + logging.debug(sync_wait_msg) + self.reference_condition.wait() + logging.debug(sync_begin_debug_msg) + self.reference_locked_cids.append(cid) + + try: + self._delete("objects", cid) + finally: + # Release cid + end_sync_debug_msg = ( + f"FileHashStore - delete_object: Releasing cid ({cid})" + + " from locked list" + ) + if self.use_multiprocessing: + with self.reference_condition_mp: + logging.debug(end_sync_debug_msg) + self.reference_locked_cids_mp.remove(cid) + self.reference_condition_mp.notify() + else: + with self.reference_condition: + logging.debug(end_sync_debug_msg) + self.reference_locked_cids.remove(cid) + self.reference_condition.notify() + @staticmethod def _check_arg_data(data): """Checks a data argument to ensure that it is either a string, path, or stream From eaabe3a25486195697e1ee64c52011d63db7b380 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 11 Sep 2024 10:22:01 -0700 Subject: [PATCH 313/420] Extract '_get_hashstore_pid_refs_path' from '_resolve_path' and update pytests --- 
src/hashstore/filehashstore.py | 35 +++++++++++++++++--------- tests/test_filehashstore.py | 6 ++--- tests/test_filehashstore_interface.py | 2 +- tests/test_filehashstore_references.py | 12 ++++----- 4 files changed, 33 insertions(+), 22 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index bae3420e..4a61bdfa 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -650,7 +650,7 @@ def tag_object(self, pid, cid): try: # Prepare files and paths tmp_root_path = self._get_store_path("refs") / "tmp" - pid_refs_path = self._resolve_path("pid", pid) + pid_refs_path = self._get_hashstore_pid_refs_path(pid) cid_refs_path = self._resolve_path("cid", cid) # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' self._create_path(Path(os.path.dirname(pid_refs_path))) @@ -1037,8 +1037,9 @@ def delete_object(self, pid): return except CidRefsDoesNotExist: # Delete pid refs file + pid_ref_abs_path = str(self._get_hashstore_pid_refs_path(pid)) objects_to_delete.append( - self._rename_path_for_deletion(self._resolve_path("pid", pid)) + self._rename_path_for_deletion(pid_ref_abs_path) ) # Remove metadata files if they exist self.delete_metadata(pid) @@ -1048,7 +1049,7 @@ def delete_object(self, pid): return except RefsFileExistsButCidObjMissing: # Add pid refs file to be permanently deleted - pid_ref_abs_path = self._resolve_path("pid", pid) + pid_ref_abs_path = str(self._get_hashstore_pid_refs_path(pid)) objects_to_delete.append( self._rename_path_for_deletion(pid_ref_abs_path) ) @@ -1068,7 +1069,7 @@ def delete_object(self, pid): return except PidNotFoundInCidRefsFile: # Add pid refs file to be permanently deleted - pid_ref_abs_path = self._resolve_path("pid", pid) + pid_ref_abs_path = str(self._get_hashstore_pid_refs_path(pid)) objects_to_delete.append( self._rename_path_for_deletion(pid_ref_abs_path) ) @@ -1329,7 +1330,7 @@ def find_object(self, pid): ) self._check_string(pid, "pid") 
- pid_ref_abs_path = self._resolve_path("pid", pid) + pid_ref_abs_path = self._get_hashstore_pid_refs_path(pid) if os.path.exists(pid_ref_abs_path): # Read the file to get the cid from the pid reference with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: @@ -2035,7 +2036,7 @@ def _verify_hashstore_references( ) logging.debug(debug_msg) if pid_refs_path is None: - pid_refs_path = self._resolve_path("pid", pid) + pid_refs_path = self._get_hashstore_pid_refs_path(pid) if cid_refs_path is None: cid_refs_path = self._resolve_path("cid", cid) @@ -2455,11 +2456,6 @@ def _resolve_path(self, entity, file): # Note, we skip checking whether the file exists for refs cid_ref_file_abs_path = self._build_path(entity, file) return cid_ref_file_abs_path - elif entity == "pid": - # Note, we skip checking whether the file exists for refs - hash_id = self._computehash(file, self.algorithm) - pid_ref_file_abs_path = self._build_path(entity, hash_id) - return pid_ref_file_abs_path else: exception_string = ( "FileHashStore - _resolve_path: entity must be" @@ -2467,10 +2463,25 @@ def _resolve_path(self, entity, file): ) raise ValueError(exception_string) + def _get_hashstore_pid_refs_path(self, pid): + """Return the expected path to a pid reference file. The path may or may not exist. + + :param str pid: Persistent or authority-based identifier + + :return: Path to pid reference file + :rtype: Path + """ + hash_id = self._computehash(pid, self.algorithm) + root_dir = self._get_store_path("pid") + directories_and_path = self._shard(hash_id) + pid_ref_file_abs_path = os.path.join(root_dir, *directories_and_path) + return pid_ref_file_abs_path + def _get_store_path(self, entity): """Return a path object of the root directory of the store. - :param str entity: Desired entity type: "objects" or "metadata" + :param str entity: Desired entity type: "objects", "metadata", "refs", "cid" and "pid". + Note, "cid" and "pid" are refs specific directories. 
:return: Path to requested store entity type :rtype: Path diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index e7572280..ddb13d0e 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -854,7 +854,7 @@ def test_find_object_cid_refs_not_found(pids, store): _object_metadata = store.store_object(pid, path) # Place the wrong cid into the pid refs file that has already been created - pid_ref_abs_path = store._resolve_path("pid", pid) + pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid) with open(pid_ref_abs_path, "w", encoding="utf8") as pid_ref_file: pid_ref_file.seek(0) pid_ref_file.write("intentionally.wrong.pid") @@ -1195,14 +1195,14 @@ def test_resolve_path_metadata(pids, store): assert calculated_metadata_path == metadata_resolved_path -def test_resolve_path_refs_pid(pids, store): +def test_get_hashstore_pid_refs_path(pids, store): """Confirm resolve path returns correct object pid refs path""" test_dir = "tests/testdata/" for pid in pids.keys(): path = Path(test_dir + pid.replace("/", "_")) _object_metadata = store.store_object(pid, path) - resolved_pid_ref_abs_path = store._resolve_path("pid", pid) + resolved_pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid) pid_refs_metadata_hashid = store._computehash(pid) calculated_pid_ref_path = ( store.pids + "/" + "/".join(store._shard(pid_refs_metadata_hashid)) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 159755a4..29f247ba 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -1049,7 +1049,7 @@ def test_delete_object_pid_refs_file_deleted(pids, store): _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) - pid_refs_file_path = store._resolve_path("pid", pid) + pid_refs_file_path = store._get_hashstore_pid_refs_path(pid) assert not os.path.exists(pid_refs_file_path) diff --git 
a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index e6d7c0f7..30366b33 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -36,7 +36,7 @@ def test_tag_object_pid_refs_file_exists(pids, store): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) store.tag_object(pid, object_metadata.cid) - pid_refs_file_path = store._resolve_path("pid", pid) + pid_refs_file_path = store._get_hashstore_pid_refs_path(pid) assert os.path.exists(pid_refs_file_path) @@ -59,7 +59,7 @@ def test_tag_object_pid_refs_file_content(pids, store): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) store.tag_object(pid, object_metadata.cid) - pid_refs_file_path = store._resolve_path("pid", pid) + pid_refs_file_path = store._get_hashstore_pid_refs_path(pid) with open(pid_refs_file_path, "r", encoding="utf8") as f: pid_refs_cid = f.read() assert pid_refs_cid == object_metadata.cid @@ -412,7 +412,7 @@ def test_verify_hashstore_references_pid_refs_incorrect_cid(pids, store): store._create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) # Write the pid refs file and move it where it needs to be with a bad cid - pid_ref_abs_path = store._resolve_path("pid", pid) + pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid) print(pid_ref_abs_path) store._create_path(os.path.dirname(pid_ref_abs_path)) tmp_root_path = store._get_store_path("refs") / "tmp" @@ -427,7 +427,7 @@ def test_verify_hashstore_references_cid_refs_file_missing(pids, store): """Test _verify_hashstore_references throws exception when cid refs file is missing.""" for pid in pids.keys(): cid = pids[pid]["sha256"] - pid_ref_abs_path = store._resolve_path("pid", pid) + pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid) store._create_path(os.path.dirname(pid_ref_abs_path)) tmp_root_path = store._get_store_path("refs") / "tmp" 
tmp_pid_refs_file = store._write_refs_file(tmp_root_path, "bad_cid", "pid") @@ -449,7 +449,7 @@ def test_verify_hashstore_references_cid_refs_file_missing_pid(pids, store): store._create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) # Now write the pid refs file, both cid and pid refs must be present - pid_ref_abs_path = store._resolve_path("pid", pid) + pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid) store._create_path(os.path.dirname(pid_ref_abs_path)) tmp_root_path = store._get_store_path("refs") / "tmp" tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") @@ -473,7 +473,7 @@ def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pi store._create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) # Now write the pid refs with expected values - pid_ref_abs_path = store._resolve_path("pid", pid) + pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid) store._create_path(os.path.dirname(pid_ref_abs_path)) tmp_root_path = store._get_store_path("refs") / "tmp" tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") From d42d916e14190e74296ee3dea66760594cd0f1a7 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 11 Sep 2024 10:35:42 -0700 Subject: [PATCH 314/420] Extract '_get_hashstore_cid_refs_path' from '_resolve_path' and update pytests --- src/hashstore/filehashstore.py | 42 ++++++++++++++++---------- tests/test_filehashstore.py | 8 ++--- tests/test_filehashstore_interface.py | 12 ++++---- tests/test_filehashstore_references.py | 22 +++++++------- 4 files changed, 47 insertions(+), 37 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 4a61bdfa..386cefe5 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -651,7 +651,7 @@ def tag_object(self, pid, cid): # Prepare files and paths tmp_root_path = self._get_store_path("refs") / "tmp" 
pid_refs_path = self._get_hashstore_pid_refs_path(pid) - cid_refs_path = self._resolve_path("cid", cid) + cid_refs_path = self._get_hashstore_cid_refs_path(cid) # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' self._create_path(Path(os.path.dirname(pid_refs_path))) self._create_path(Path(os.path.dirname(cid_refs_path))) @@ -702,7 +702,9 @@ def tag_object(self, pid, cid): return True else: # Check if the retrieved cid refs file exists and pid is referenced - retrieved_cid_refs_path = self._resolve_path("cid", pid_refs_cid) + retrieved_cid_refs_path = self._get_hashstore_cid_refs_path( + pid_refs_cid + ) if os.path.exists( retrieved_cid_refs_path ) and self._is_string_in_refs_file(pid, retrieved_cid_refs_path): @@ -1057,10 +1059,10 @@ def delete_object(self, pid): with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: # Retrieve the cid pid_refs_cid = pid_ref_file.read() - cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) + cid_ref_abs_str = str(self._get_hashstore_cid_refs_path(pid_refs_cid)) # Remove if the pid refs is found - if self._is_string_in_refs_file(pid, cid_ref_abs_path): - self._update_refs_file(cid_ref_abs_path, pid, "remove") + if self._is_string_in_refs_file(pid, cid_ref_abs_str): + self._update_refs_file(cid_ref_abs_str, pid, "remove") # Remove metadata files if they exist self.delete_metadata(pid) # Remove all files confirmed for deletion @@ -1337,10 +1339,10 @@ def find_object(self, pid): pid_refs_cid = pid_ref_file.read() # Confirm that the cid reference file exists - cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) + cid_ref_abs_path = self._get_hashstore_cid_refs_path(pid_refs_cid) if os.path.exists(cid_ref_abs_path): # Check that the pid is actually found in the cid reference file - if self._is_string_in_refs_file(pid, cid_ref_abs_path): + if self._is_string_in_refs_file(pid, str(cid_ref_abs_path)): # Object must also exist in order to return the cid retrieved if not 
self._exists("objects", pid_refs_cid): err_msg = ( @@ -2008,7 +2010,7 @@ def _verify_object_information( else: # Delete the object cid = hex_digests[self.algorithm] - cid_abs_path = self._resolve_path("cid", cid) + cid_abs_path = self._get_hashstore_cid_refs_path(cid) self._delete(entity, cid_abs_path) logging.debug(exception_string) raise NonMatchingChecksum(exception_string) @@ -2038,7 +2040,7 @@ def _verify_hashstore_references( if pid_refs_path is None: pid_refs_path = self._get_hashstore_pid_refs_path(pid) if cid_refs_path is None: - cid_refs_path = self._resolve_path("cid", cid) + cid_refs_path = self._get_hashstore_cid_refs_path(cid) # Check that reference files were created if not os.path.exists(pid_refs_path): @@ -2086,7 +2088,7 @@ def delete_object_only(self, cid): :param str cid: Content identifier """ - cid_refs_abs_path = self._resolve_path("cid", cid) + cid_refs_abs_path = self._get_hashstore_cid_refs_path(cid) # If the refs file still exists, do not delete the object if not os.path.exists(cid_refs_abs_path): sync_begin_debug_msg = ( @@ -2451,11 +2453,6 @@ def _resolve_path(self, entity, file): relpath = os.path.join(rel_root, file) if os.path.isfile(relpath): return relpath - # Check for sharded path. - elif entity == "cid": - # Note, we skip checking whether the file exists for refs - cid_ref_file_abs_path = self._build_path(entity, file) - return cid_ref_file_abs_path else: exception_string = ( "FileHashStore - _resolve_path: entity must be" @@ -2477,8 +2474,21 @@ def _get_hashstore_pid_refs_path(self, pid): pid_ref_file_abs_path = os.path.join(root_dir, *directories_and_path) return pid_ref_file_abs_path + def _get_hashstore_cid_refs_path(self, cid): + """Return the expected path to a cid reference file. The path may or may not exist. 
+ + :param str cid: Content identifier + + :return: Path to cid reference file + :rtype: Path + """ + root_dir = self._get_store_path("cid") + directories_and_path = self._shard(cid) + cid_ref_file_abs_path = os.path.join(root_dir, *directories_and_path) + return cid_ref_file_abs_path + def _get_store_path(self, entity): - """Return a path object of the root directory of the store. + """Return a path object to the root directory of the requested hashstore directory type :param str entity: Desired entity type: "objects", "metadata", "refs", "cid" and "pid". Note, "cid" and "pid" are refs specific directories. diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index ddb13d0e..8642d616 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -873,8 +873,8 @@ def test_find_object_cid_refs_does_not_contain_pid(pids, store): object_metadata = store.store_object(pid, path) # Remove the pid from the cid refs file - cid_ref_abs_path = store._resolve_path( - "cid", object_metadata.hex_digests.get("sha256") + cid_ref_abs_path = store._get_hashstore_cid_refs_path( + object_metadata.hex_digests.get("sha256") ) store._update_refs_file(cid_ref_abs_path, pid, "remove") @@ -1211,7 +1211,7 @@ def test_get_hashstore_pid_refs_path(pids, store): assert resolved_pid_ref_abs_path == calculated_pid_ref_path -def test_resolve_path_refs_cid(pids, store): +def test_get_hashstore_cid_refs_path(pids, store): """Confirm resolve path returns correct object pid refs path""" test_dir = "tests/testdata/" for pid in pids.keys(): @@ -1219,7 +1219,7 @@ def test_resolve_path_refs_cid(pids, store): object_metadata = store.store_object(pid, path) cid = object_metadata.cid - resolved_cid_ref_abs_path = store._resolve_path("cid", cid) + resolved_cid_ref_abs_path = store._get_hashstore_cid_refs_path(cid) calculated_cid_ref_path = store.cids + "/" + "/".join(store._shard(cid)) assert resolved_cid_ref_abs_path == calculated_cid_ref_path diff --git 
a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 29f247ba..e9a2b441 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -418,7 +418,7 @@ def test_store_object_duplicate_object_references_file_content(pids, store): pid_three = "dou.test.2" store.store_object(pid_three, path) # Confirm the content of the cid refence files - cid_ref_abs_path = store._resolve_path("cid", pids[pid][store.algorithm]) + cid_ref_abs_path = store._get_hashstore_cid_refs_path(pids[pid][store.algorithm]) with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): value = line.strip() @@ -555,8 +555,8 @@ def store_object_wrapper(obj_pid, obj_path): assert store._count(entity) == 1 assert store._exists(entity, pids["jtao.1700.1"][store.algorithm]) - cid_refs_path = store._resolve_path( - "cid", "94f9b6c88f1f458e410c30c351c6384ea42ac1b5ee1f8430d3e365e43b78a38a" + cid_refs_path = store._get_hashstore_cid_refs_path( + "94f9b6c88f1f458e410c30c351c6384ea42ac1b5ee1f8430d3e365e43b78a38a" ) number_of_pids_reffed = 0 with open(cid_refs_path, "r", encoding="utf8") as ref_file: @@ -1065,7 +1065,7 @@ def test_delete_object_cid_refs_file_deleted(pids, store): _metadata_cid = store.store_metadata(pid, syspath, format_id) cid = object_metadata.cid store.delete_object(pid) - cid_refs_file_path = store._resolve_path("cid", cid) + cid_refs_file_path = store._get_hashstore_cid_refs_path(cid) assert not os.path.exists(cid_refs_file_path) @@ -1076,11 +1076,11 @@ def test_delete_object_cid_refs_file_with_pid_refs_remaining(pids, store): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(pid, path) cid = object_metadata.cid - cid_refs_abs_path = store._resolve_path("cid", cid) + cid_refs_abs_path = store._get_hashstore_cid_refs_path(cid) # pylint: disable=W0212 store._update_refs_file(cid_refs_abs_path, "dou.test.1", "add") store.delete_object(pid) - cid_refs_file_path = 
store._resolve_path("cid", cid) + cid_refs_file_path = store._get_hashstore_cid_refs_path(cid) assert os.path.exists(cid_refs_file_path) diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 30366b33..9d67993c 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -48,7 +48,7 @@ def test_tag_object_cid_refs_file_exists(pids, store): object_metadata = store.store_object(None, path) cid = object_metadata.cid store.tag_object(pid, object_metadata.cid) - cid_refs_file_path = store._resolve_path("cid", cid) + cid_refs_file_path = store._get_hashstore_cid_refs_path(cid) assert os.path.exists(cid_refs_file_path) @@ -72,7 +72,7 @@ def test_tag_object_cid_refs_file_content(pids, store): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) store.tag_object(pid, object_metadata.cid) - cid_refs_file_path = store._resolve_path("cid", object_metadata.cid) + cid_refs_file_path = store._get_hashstore_cid_refs_path(object_metadata.cid) with open(cid_refs_file_path, "r", encoding="utf8") as f: pid_refs_cid = f.read().strip() assert pid_refs_cid == pid @@ -91,7 +91,7 @@ def test_tag_object_pid_refs_found_cid_refs_found(pids, store): with pytest.raises(HashStoreRefsAlreadyExists): store.tag_object(pid, cid) - cid_refs_file_path = store._resolve_path("cid", object_metadata.cid) + cid_refs_file_path = store._get_hashstore_cid_refs_path(object_metadata.cid) line_count = 0 with open(cid_refs_file_path, "r", encoding="utf8") as ref_file: for _line in ref_file: @@ -109,7 +109,7 @@ def test_tag_object_pid_refs_found_cid_refs_not_found(store): cid = object_metadata.cid # Manually delete the cid refs file, creating an orphaned pid - cid_ref_abs_path = store._resolve_path("cid", cid) + cid_ref_abs_path = store._get_hashstore_cid_refs_path(cid) os.remove(cid_ref_abs_path) assert store._count("cid") == 0 @@ -146,7 +146,7 @@ def 
test_tag_object_pid_refs_not_found_cid_refs_found(store): # Read cid file to confirm cid refs file contains the additional pid line_count = 0 - cid_ref_abs_path = store._resolve_path("cid", cid) + cid_ref_abs_path = store._get_hashstore_cid_refs_path(cid) with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): value = line.strip() @@ -215,7 +215,7 @@ def test_verify_object_exception_incorrect_size(pids, store): cid = object_metadata.cid cid = object_metadata.hex_digests[store.algorithm] - cid_abs_path = store._resolve_path("cid", cid) + cid_abs_path = store._get_hashstore_cid_refs_path(cid) assert not os.path.exists(cid_abs_path) @@ -237,7 +237,7 @@ def test_verify_object_exception_incorrect_checksum(pids, store): cid = object_metadata.cid cid = object_metadata.hex_digests[store.algorithm] - cid_abs_path = store._resolve_path("cid", cid) + cid_abs_path = store._get_hashstore_cid_refs_path(cid) assert not os.path.exists(cid_abs_path) @@ -338,7 +338,7 @@ def test_update_refs_file_content_cid_refs_does_not_exist(pids, store): """Test that _update_refs_file throws exception if refs file doesn't exist.""" for pid in pids.keys(): cid = pids[pid]["sha256"] - cid_ref_abs_path = store._resolve_path("cid", cid) + cid_ref_abs_path = store._get_hashstore_cid_refs_path(cid) with pytest.raises(FileNotFoundError): store._update_refs_file(cid_ref_abs_path, pid, "add") @@ -407,7 +407,7 @@ def test_verify_hashstore_references_pid_refs_incorrect_cid(pids, store): # Write the cid refs file and move it where it needs to be tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") - cid_ref_abs_path = store._resolve_path("cid", cid) + cid_ref_abs_path = store._get_hashstore_cid_refs_path(cid) print(cid_ref_abs_path) store._create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) @@ -445,7 +445,7 @@ def 
test_verify_hashstore_references_cid_refs_file_missing_pid(pids, store): # Get a tmp cid refs file and write the wrong pid into it tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_refs_file(tmp_root_path, "bad pid", "cid") - cid_ref_abs_path = store._resolve_path("cid", cid) + cid_ref_abs_path = store._get_hashstore_cid_refs_path(cid) store._create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) # Now write the pid refs file, both cid and pid refs must be present @@ -469,7 +469,7 @@ def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pi # Write the wrong pid into a cid refs file and move it where it needs to be tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_refs_file(tmp_root_path, "bad pid", "cid") - cid_ref_abs_path = store._resolve_path("cid", cid) + cid_ref_abs_path = store._get_hashstore_cid_refs_path(cid) store._create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) # Now write the pid refs with expected values From d94b39cf2a2463a58554fd7a3f64f932cee85176 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 11 Sep 2024 11:31:25 -0700 Subject: [PATCH 315/420] Extract '_get_hashstore_data_object_path' from '_resolve_path' --- src/hashstore/filehashstore.py | 89 ++++++++++++++++++++++------------ 1 file changed, 58 insertions(+), 31 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 386cefe5..75ffd663 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1363,8 +1363,8 @@ def find_object(self, pid): ) obj_info_dict = { "cid": pid_refs_cid, - "cid_object_path": self._resolve_path( - "objects", pid_refs_cid + "cid_object_path": self._get_hashstore_data_object_path( + pid_refs_cid ), "cid_refs_path": cid_ref_abs_path, "pid_refs_path": pid_ref_abs_path, @@ -2299,18 +2299,6 @@ def _computehash(self, stream, 
algorithm=None): hex_digest = hashobj.hexdigest() return hex_digest - def _exists(self, entity, file): - """Check whether a given file id or path exists on disk. - - :param str entity: Desired entity type (e.g., "objects", "metadata"). - :param str file: The name of the file to check. - - :return: True if the file exists. - :rtype: bool - """ - file_exists = bool(self._resolve_path(entity, file)) - return file_exists - def _shard(self, digest): """Generates a list given a digest of `self.depth` number of tokens with width `self.width` from the first part of the digest plus the remainder. @@ -2337,6 +2325,21 @@ def compact(items): return hierarchical_list + def _exists(self, entity, file): + """Check whether a given file id or path exists on disk. + + :param str entity: Desired entity type (e.g., "objects", "metadata"). + :param str file: The name of the file to check. + + :return: True if the file exists. + :rtype: bool + """ + if entity == "objects": + return bool(self._get_hashstore_data_object_path(file)) + else: + file_exists = bool(self._resolve_path(entity, file)) + return file_exists + def _open(self, entity, file, mode="rb"): """Return open buffer object from given id or path. Caller is responsible for closing the stream. @@ -2348,9 +2351,12 @@ def _open(self, entity, file, mode="rb"): :return: An `io` stream dependent on the `mode`. :rtype: io.BufferedReader """ - realpath = self._resolve_path(entity, file) - if realpath is None: - raise IOError(f"Could not locate file: {file}") + if entity == "objects": + return bool(self._get_hashstore_data_object_path(file)) + else: + realpath = self._resolve_path(entity, file) + if realpath is None: + raise IOError(f"Could not locate file: {file}") # pylint: disable=W1514 # mode defaults to "rb" @@ -2364,9 +2370,12 @@ def _delete(self, entity, file): :param str entity: Desired entity type (ex. "objects", "metadata"). :param str file: Address ID or path of file. 
""" - realpath = self._resolve_path(entity, file) - if realpath is None: - return None + if entity == "objects": + return bool(self._get_hashstore_data_object_path(file)) + else: + realpath = self._resolve_path(entity, file) + if realpath is None: + return None try: os.remove(realpath) @@ -2437,16 +2446,7 @@ def _resolve_path(self, entity, file): :rtype: str """ # Check for relative path. - if entity == "objects": - rel_root = self.objects - relpath = os.path.join(rel_root, file) - if os.path.isfile(relpath): - return relpath - else: - abspath = self._build_path(entity, file) - if os.path.isfile(abspath): - return abspath - elif entity == "metadata": + if entity == "metadata": if os.path.isfile(file): return file rel_root = self.metadata @@ -2456,10 +2456,37 @@ def _resolve_path(self, entity, file): else: exception_string = ( "FileHashStore - _resolve_path: entity must be" - + " 'objects', 'metadata', 'cid' or 'pid" + + " 'objects', 'metadata', 'cid' or 'pid'. Supplied: " + + entity ) raise ValueError(exception_string) + def _get_hashstore_data_object_path(self, cid): + """Return the expected path to a hashstore data object that exists. + + :param str cid: Content identifier + + :return: Path to the data object referenced by the pid + :rtype: Path + """ + paths = self._shard(cid) + root_dir = self._get_store_path("objects") + absolute_path = os.path.join(root_dir, *paths) + + if os.path.isfile(absolute_path): + return absolute_path + else: + # Check the relative path, for usage convenience + rel_root = self.objects + relpath = os.path.join(rel_root, cid) + if os.path.isfile(relpath): + return relpath + else: + raise FileNotFoundError( + "FileHashStore - hashstore data object does not exist for cid: " + "" + cid + ) + def _get_hashstore_pid_refs_path(self, pid): """Return the expected path to a pid reference file. The path may or may not exist. 
From 232cd4d75f4ba0439c6acad80d8df588020b494e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 11 Sep 2024 11:35:26 -0700 Subject: [PATCH 316/420] Extract '_get_hashstore_metadata_path' from '_resolve_path' --- src/hashstore/filehashstore.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 75ffd663..bb7d1887 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2461,15 +2461,15 @@ def _resolve_path(self, entity, file): ) raise ValueError(exception_string) - def _get_hashstore_data_object_path(self, cid): + def _get_hashstore_data_object_path(self, cid_or_path): """Return the expected path to a hashstore data object that exists. - :param str cid: Content identifier + :param str cid_or_path: Content identifier :return: Path to the data object referenced by the pid :rtype: Path """ - paths = self._shard(cid) + paths = self._shard(cid_or_path) root_dir = self._get_store_path("objects") absolute_path = os.path.join(root_dir, *paths) @@ -2478,15 +2478,30 @@ def _get_hashstore_data_object_path(self, cid): else: # Check the relative path, for usage convenience rel_root = self.objects - relpath = os.path.join(rel_root, cid) + relpath = os.path.join(rel_root, cid_or_path) if os.path.isfile(relpath): return relpath else: raise FileNotFoundError( "FileHashStore - hashstore data object does not exist for cid: " - "" + cid + "" + cid_or_path ) + def _get_hashstore_metadata_path(self, metacat_cid_or_path): + """Return the expected metadata path to a hashstore metadata object that exists. 
+ + :param str cid: Metadata content identifier or path to check + + :return: Path to the data object referenced by the pid + :rtype: Path + """ + if os.path.isfile(metacat_cid_or_path): + return metacat_cid_or_path + rel_root = self.metadata + relpath = os.path.join(rel_root, metacat_cid_or_path) + if os.path.isfile(relpath): + return relpath + def _get_hashstore_pid_refs_path(self, pid): """Return the expected path to a pid reference file. The path may or may not exist. From 7ed22b463014754fb09954f1cdda0328a64f948b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 11 Sep 2024 12:39:51 -0700 Subject: [PATCH 317/420] Delete '_resolve_path' method and refactor affected code and pytests --- src/hashstore/filehashstore.py | 136 ++++++++++++-------------- tests/test_filehashstore.py | 32 +++--- tests/test_filehashstore_interface.py | 4 +- 3 files changed, 75 insertions(+), 97 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index bb7d1887..8f22bd24 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1176,7 +1176,6 @@ def delete_metadata(self, pid, format_id=None): logging.info(info_string) else: # Delete a specific metadata file - entity = "metadata" pid_doc = self._computehash(pid + checked_format_id) # Wait for the pid to release if it's in use sync_begin_debug_msg = ( @@ -1205,7 +1204,7 @@ def delete_metadata(self, pid, format_id=None): self.metadata_locked_docs.append(pid_doc) try: full_path_without_directory = rel_path + "/" + pid_doc - self._delete(entity, full_path_without_directory) + self._delete("metadata", full_path_without_directory) info_string = ( "FileHashStore - delete_metadata: Successfully deleted metadata for pid:" + f" {pid} for format_id: {format_id}" @@ -1496,9 +1495,8 @@ def _move_and_get_checksums( ) # Objects are stored with their content identifier based on the store algorithm - entity = "objects" object_cid = hex_digests.get(self.algorithm) - abs_file_path = 
self._build_path(entity, object_cid, extension) + abs_file_path = self._build_path("objects", object_cid, extension) # Only move file if it doesn't exist. We do not check before we create the tmp # file and calculate the hex digests because the given checksum could be incorrect. @@ -1508,7 +1506,7 @@ def _move_and_get_checksums( pid, checksum, checksum_algorithm, - entity, + "objects", hex_digests, tmp_file_name, tmp_file_size, @@ -1554,21 +1552,23 @@ def _move_and_get_checksums( + " not be created and/or tagged.", ) logging.debug(debug_msg) - self._delete(entity, abs_file_path) + self._delete("objects", abs_file_path) raise err - - logging.debug( - "FileHashStore - _move_and_get_checksums: Deleting temporary file: %s", - tmp_file_name, - ) - self._delete(entity, tmp_file_name) - err_msg = ( - f"Object has not been stored for pid: {pid} - an unexpected error has occurred" - + f" when moving tmp file to: {object_cid}. Reference files will not be" - + f" created and/or tagged. Error: {err}" - ) - logging.warning("FileHashStore - _move_and_get_checksums: %s", err_msg) - raise + else: + logging.debug( + "FileHashStore - _move_and_get_checksums: Deleting temporary file: %s", + tmp_file_name, + ) + self._delete("tmp", tmp_file_name) + err_msg = ( + f"Object has not been stored for pid: {pid} - an unexpected error has " + f"occurred when moving tmp file to: {object_cid}. Reference files will " + f"not be created and/or tagged. 
Error: {err}" + ) + logging.warning( + "FileHashStore - _move_and_get_checksums: %s", err_msg + ) + raise else: # If the data object already exists, do not move the file but attempt to verify it try: @@ -1576,7 +1576,7 @@ def _move_and_get_checksums( pid, checksum, checksum_algorithm, - entity, + "objects", hex_digests, tmp_file_name, tmp_file_size, @@ -1601,9 +1601,10 @@ def _move_and_get_checksums( logging.debug(exception_string) raise NonMatchingChecksum(exception_string) from nmce finally: - # Delete the temporary file, the data object already exists, so it is redundant - # No exception is thrown so 'store_object' can proceed to tag object - self._delete(entity, tmp_file_name) + # Ensure that the tmp file has been removed, the data object already exists, so it + # is redundant. No exception is thrown so 'store_object' can proceed to tag object + if os.path.exists(tmp_file_name): + self._delete("tmp", tmp_file_name) return object_cid, tmp_file_size, hex_digests @@ -2335,10 +2336,12 @@ def _exists(self, entity, file): :rtype: bool """ if entity == "objects": - return bool(self._get_hashstore_data_object_path(file)) - else: - file_exists = bool(self._resolve_path(entity, file)) - return file_exists + try: + return bool(self._get_hashstore_data_object_path(file)) + except FileNotFoundError: + return False + if entity == "metadata": + return bool(self._get_hashstore_metadata_path(file)) def _open(self, entity, file, mode="rb"): """Return open buffer object from given id or path. Caller is responsible @@ -2351,12 +2354,13 @@ def _open(self, entity, file, mode="rb"): :return: An `io` stream dependent on the `mode`. 
:rtype: io.BufferedReader """ + realpath = None if entity == "objects": - return bool(self._get_hashstore_data_object_path(file)) - else: - realpath = self._resolve_path(entity, file) - if realpath is None: - raise IOError(f"Could not locate file: {file}") + realpath = self._get_hashstore_data_object_path(file) + if entity == "metadata": + realpath = self._get_hashstore_metadata_path(file) + if realpath is None: + raise IOError(f"Could not locate file: {file}") # pylint: disable=W1514 # mode defaults to "rb" @@ -2370,21 +2374,27 @@ def _delete(self, entity, file): :param str entity: Desired entity type (ex. "objects", "metadata"). :param str file: Address ID or path of file. """ - if entity == "objects": - return bool(self._get_hashstore_data_object_path(file)) + if entity == "tmp": + realpath = file + elif entity == "objects": + realpath = self._get_hashstore_data_object_path(file) + elif entity == "metadata": + realpath = self._get_hashstore_metadata_path(file) + elif os.path.exists(file): + # Check if the given path is an absolute path + realpath = file else: - realpath = self._resolve_path(entity, file) - if realpath is None: - return None + raise IOError(f"FileHashStore - delete(): Could not locate file: {file}") - try: - os.remove(realpath) - except OSError as err: - exception_string = ( - f"FileHashStore - delete(): Unexpected {err=}, {type(err)=}" - ) - logging.error(exception_string) - raise err + if realpath is not None: + try: + os.remove(realpath) + except OSError as err: + exception_string = ( + f"FileHashStore - delete(): Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise err @staticmethod def _rename_path_for_deletion(path): @@ -2433,38 +2443,10 @@ def _build_path(self, entity, hash_id, extension=""): absolute_path = os.path.join(root_dir, *paths) + extension return absolute_path - def _resolve_path(self, entity, file): - """Attempt to determine the absolute path of a file ID or path through - successive checking of 
candidate paths - first by checking whether the 'file' - exists, followed by checking the entity type with respect to the file. - - :param str entity: Desired entity type ("objects", "metadata", "cid", "pid"), - where "cid" & "pid" represents resolving the path to the refs files. - :param str file: Name of the file. - - :return: Path to file - :rtype: str - """ - # Check for relative path. - if entity == "metadata": - if os.path.isfile(file): - return file - rel_root = self.metadata - relpath = os.path.join(rel_root, file) - if os.path.isfile(relpath): - return relpath - else: - exception_string = ( - "FileHashStore - _resolve_path: entity must be" - + " 'objects', 'metadata', 'cid' or 'pid'. Supplied: " - + entity - ) - raise ValueError(exception_string) - def _get_hashstore_data_object_path(self, cid_or_path): """Return the expected path to a hashstore data object that exists. - :param str cid_or_path: Content identifier + :param str cid_or_path: Content identifier or path to check :return: Path to the data object referenced by the pid :rtype: Path @@ -2484,13 +2466,13 @@ def _get_hashstore_data_object_path(self, cid_or_path): else: raise FileNotFoundError( "FileHashStore - hashstore data object does not exist for cid: " - "" + cid_or_path + + cid_or_path ) def _get_hashstore_metadata_path(self, metacat_cid_or_path): """Return the expected metadata path to a hashstore metadata object that exists. - :param str cid: Metadata content identifier or path to check + :param str metacat_cid_or_path: Metadata content identifier or path to check :return: Path to the data object referenced by the pid :rtype: Path @@ -2590,6 +2572,8 @@ def _count(self, entity): directory_to_count = self.pids elif entity == "cid": directory_to_count = self.cids + elif entity == "tmp": + directory_to_count = self.objects + "tmp" else: raise ValueError( f"entity: {entity} does not exist. Do you mean 'objects' or 'metadata'?" 
diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 8642d616..82a43300 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -754,7 +754,7 @@ def test_verify_object_information_incorrect_size_with_pid(pids, store): # pylint: disable=W0212 tmp_file = store._mktmpfile(objects_tmp_folder) assert os.path.isfile(tmp_file.name) - with pytest.raises(ValueError): + with pytest.raises(NonMatchingObjSize): store._verify_object_information( "Test_Pid", checksum, @@ -838,7 +838,7 @@ def test_find_object_refs_exist_but_obj_not_found(pids, store): store.store_object(pid, path) cid = store.find_object(pid).get("cid") - obj_path = store._resolve_path("objects", cid) + obj_path = store._get_hashstore_data_object_path(cid) os.remove(obj_path) with pytest.raises(RefsFileExistsButCidObjMissing): @@ -1077,8 +1077,9 @@ def test_get_real_path_file_does_not_exist(store): """Test get_real_path returns None when object does not exist.""" entity = "objects" test_path = "tests/testdata/helloworld.txt" - real_path_exists = store._resolve_path(entity, test_path) - assert real_path_exists is None + with pytest.raises(FileNotFoundError): + real_path_exists = store._get_hashstore_data_object_path(test_path) + # assert real_path_exists is None def test_get_real_path_with_object_id(store, pids): @@ -1088,7 +1089,7 @@ def test_get_real_path_with_object_id(store, pids): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store._store_and_validate_data(pid, path) - obj_abs_path = store._resolve_path(entity, object_metadata.cid) + obj_abs_path = store._get_hashstore_data_object_path(object_metadata.cid) assert os.path.exists(obj_abs_path) @@ -1101,7 +1102,7 @@ def test_get_real_path_with_object_id_sharded(pids, store): object_metadata = store._store_and_validate_data(pid, path) object_metadata_shard = store._shard(object_metadata.cid) object_metadata_shard_path = "/".join(object_metadata_shard) - obj_abs_path = 
store._resolve_path(entity, object_metadata_shard_path) + obj_abs_path = store._get_hashstore_data_object_path(object_metadata_shard_path) assert os.path.exists(obj_abs_path) @@ -1114,21 +1115,10 @@ def test_get_real_path_with_metadata_id(store, pids): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename metadata_cid = store.store_metadata(pid, syspath, format_id) - metadata_abs_path = store._resolve_path(entity, metadata_cid) + metadata_abs_path = store._get_hashstore_metadata_path(metadata_cid) assert os.path.exists(metadata_abs_path) -def test_get_real_path_with_bad_entity(store, pids): - """Test get_real_path returns absolute path given an object id.""" - test_dir = "tests/testdata/" - entity = "bad_entity" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store._store_and_validate_data(pid, path) - with pytest.raises(ValueError): - store._resolve_path(entity, object_metadata.cid) - - def test_build_path(store, pids): """Test build_abs_path builds the absolute file path.""" test_dir = "tests/testdata/" @@ -1167,7 +1157,7 @@ def test_resolve_path_objects(pids, store): object_metadata = store.store_object(pid, path) cid = object_metadata.cid - obj_resolved_path = store._resolve_path("objects", cid) + obj_resolved_path = store._get_hashstore_data_object_path(cid) calculated_obj_path = store.objects + "/" + "/".join(store._shard(cid)) assert calculated_obj_path == obj_resolved_path @@ -1187,7 +1177,9 @@ def test_resolve_path_metadata(pids, store): rel_path = "/".join(store._shard(metadata_directory)) full_path_without_dir = rel_path + "/" + metadata_document_name - metadata_resolved_path = store._resolve_path("metadata", full_path_without_dir) + metadata_resolved_path = store._get_hashstore_metadata_path( + full_path_without_dir + ) calculated_metadata_path = ( store.metadata + "/" + rel_path + "/" + metadata_document_name ) diff --git a/tests/test_filehashstore_interface.py 
b/tests/test_filehashstore_interface.py index e9a2b441..86cfb83e 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -440,6 +440,8 @@ def test_store_object_duplicate_raises_error_with_bad_validation_data(pids, stor pid, path, checksum="nonmatchingchecksum", checksum_algorithm="sha256" ) assert store._count(entity) == 1 + # Confirm tmp files created during this process was handled + assert store._count("tmp") == 0 assert store._exists(entity, pids[pid][store.algorithm]) @@ -866,7 +868,7 @@ def test_store_metadata_metadata_path(pids, store): syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) metadata_cid = store.store_metadata(pid, syspath, format_id) - metadata_path = store._resolve_path("metadata", metadata_cid) + metadata_path = store._get_hashstore_metadata_path(metadata_cid) assert metadata_cid == metadata_path From aabb690d0826247e5c1fb8401b5998d5b5c37674 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 11 Sep 2024 12:43:39 -0700 Subject: [PATCH 318/420] Rename '_build_path' to '_build_hashstore_data_object_path' and update pytests --- src/hashstore/filehashstore.py | 66 +++++++++++++++++----------------- tests/test_filehashstore.py | 6 ++-- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 8f22bd24..611d9eac 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1496,7 +1496,7 @@ def _move_and_get_checksums( # Objects are stored with their content identifier based on the store algorithm object_cid = hex_digests.get(self.algorithm) - abs_file_path = self._build_path("objects", object_cid, extension) + abs_file_path = self._build_hashstore_data_object_path(object_cid, extension) # Only move file if it doesn't exist. We do not check before we create the tmp # file and calculate the hex digests because the given checksum could be incorrect. 
@@ -2326,6 +2326,35 @@ def compact(items): return hierarchical_list + def _count(self, entity): + """Return the count of the number of files in the `root` directory. + + :param str entity: Desired entity type (ex. "objects", "metadata"). + + :return: Number of files in the directory. + :rtype: int + """ + count = 0 + if entity == "objects": + directory_to_count = self.objects + elif entity == "metadata": + directory_to_count = self.metadata + elif entity == "pid": + directory_to_count = self.pids + elif entity == "cid": + directory_to_count = self.cids + elif entity == "tmp": + directory_to_count = self.objects + "tmp" + else: + raise ValueError( + f"entity: {entity} does not exist. Do you mean 'objects' or 'metadata'?" + ) + + for _, _, files in os.walk(directory_to_count): + for _ in files: + count += 1 + return count + def _exists(self, entity, file): """Check whether a given file id or path exists on disk. @@ -2422,8 +2451,8 @@ def _create_path(self, path): except FileExistsError: assert os.path.isdir(path), f"expected {path} to be a directory" - def _build_path(self, entity, hash_id, extension=""): - """Build the absolute file path for a given hash ID with an optional file extension. + def _build_hashstore_data_object_path(self, hash_id, extension=""): + """Build the absolute file path for a given content identifier :param str entity: Desired entity type (ex. "objects", "metadata"). :param str hash_id: A hash ID to build a file path for. @@ -2433,7 +2462,7 @@ def _build_path(self, entity, hash_id, extension=""): :rtype: str """ paths = self._shard(hash_id) - root_dir = self._get_store_path(entity) + root_dir = self._get_store_path("objects") if extension and not extension.startswith(os.extsep): extension = os.extsep + extension @@ -2555,35 +2584,6 @@ def _get_file_paths(directory): else: return None - def _count(self, entity): - """Return the count of the number of files in the `root` directory. - - :param str entity: Desired entity type (ex. 
"objects", "metadata"). - - :return: Number of files in the directory. - :rtype: int - """ - count = 0 - if entity == "objects": - directory_to_count = self.objects - elif entity == "metadata": - directory_to_count = self.metadata - elif entity == "pid": - directory_to_count = self.pids - elif entity == "cid": - directory_to_count = self.cids - elif entity == "tmp": - directory_to_count = self.objects + "tmp" - else: - raise ValueError( - f"entity: {entity} does not exist. Do you mean 'objects' or 'metadata'?" - ) - - for _, _, files in os.walk(directory_to_count): - for _ in files: - count += 1 - return count - # Other Static Methods @staticmethod diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 82a43300..2a013481 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1119,15 +1119,15 @@ def test_get_real_path_with_metadata_id(store, pids): assert os.path.exists(metadata_abs_path) -def test_build_path(store, pids): - """Test build_abs_path builds the absolute file path.""" +def test_build_hashstore_data_object_path(store, pids): + """Test _build_hashstore_data_object_path builds the hashstore data object file path.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") _ = store._store_and_validate_data(pid, path) # pylint: disable=W0212 - abs_path = store._build_path(entity, pids[pid][store.algorithm]) + abs_path = store._build_hashstore_data_object_path(pids[pid][store.algorithm]) assert os.path.exists(abs_path) From 2321b6f51efc77f648f5bdaaa73eb9cc8a31796c Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 11 Sep 2024 12:46:56 -0700 Subject: [PATCH 319/420] Refactor '_build_hashstore_data_object_path' by removing redundant signature value --- src/hashstore/filehashstore.py | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 611d9eac..1fcc5288 
100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1496,7 +1496,7 @@ def _move_and_get_checksums( # Objects are stored with their content identifier based on the store algorithm object_cid = hex_digests.get(self.algorithm) - abs_file_path = self._build_hashstore_data_object_path(object_cid, extension) + abs_file_path = self._build_hashstore_data_object_path(object_cid) # Only move file if it doesn't exist. We do not check before we create the tmp # file and calculate the hex digests because the given checksum could be incorrect. @@ -2451,25 +2451,17 @@ def _create_path(self, path): except FileExistsError: assert os.path.isdir(path), f"expected {path} to be a directory" - def _build_hashstore_data_object_path(self, hash_id, extension=""): + def _build_hashstore_data_object_path(self, hash_id): """Build the absolute file path for a given content identifier - :param str entity: Desired entity type (ex. "objects", "metadata"). :param str hash_id: A hash ID to build a file path for. - :param str extension: An optional file extension to append to the file path. :return: An absolute file path for the specified hash ID. 
:rtype: str """ paths = self._shard(hash_id) root_dir = self._get_store_path("objects") - - if extension and not extension.startswith(os.extsep): - extension = os.extsep + extension - elif not extension: - extension = "" - - absolute_path = os.path.join(root_dir, *paths) + extension + absolute_path = os.path.join(root_dir, *paths) return absolute_path def _get_hashstore_data_object_path(self, cid_or_path): @@ -2480,12 +2472,10 @@ def _get_hashstore_data_object_path(self, cid_or_path): :return: Path to the data object referenced by the pid :rtype: Path """ - paths = self._shard(cid_or_path) - root_dir = self._get_store_path("objects") - absolute_path = os.path.join(root_dir, *paths) + expected_abs_path = self._build_hashstore_data_object_path(cid_or_path) - if os.path.isfile(absolute_path): - return absolute_path + if os.path.isfile(expected_abs_path): + return expected_abs_path else: # Check the relative path, for usage convenience rel_root = self.objects From 5446ee3cc68804d3564c6828f69b3332ef9b5512 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 11 Sep 2024 13:26:04 -0700 Subject: [PATCH 320/420] Clean up '_get_hashstore_data_object_path' and '_get_hashstore_metadata_path' for clarity --- src/hashstore/filehashstore.py | 120 ++++++++++++++++---------- tests/test_filehashstore_interface.py | 10 +-- 2 files changed, 78 insertions(+), 52 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 1fcc5288..c36b42ed 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -821,7 +821,7 @@ def store_metadata(self, pid, metadata, format_id=None): + f" pid: {pid} with format_id: {checked_format_id}" ) logging.info(info_msg) - return metadata_cid + return str(metadata_cid) finally: # Release pid end_sync_debug_msg = ( @@ -1203,7 +1203,10 @@ def delete_metadata(self, pid, format_id=None): logging.debug(sync_begin_debug_msg) self.metadata_locked_docs.append(pid_doc) try: - full_path_without_directory = 
rel_path + "/" + pid_doc + full_path_without_directory = ( + self.metadata + "/" + rel_path + "/" + pid_doc + ) + print("DOU full_path_without_directory: " + full_path_without_directory) self._delete("metadata", full_path_without_directory) info_string = ( "FileHashStore - delete_metadata: Successfully deleted metadata for pid:" @@ -1846,7 +1849,7 @@ def _put_metadata(self, metadata, pid, metadata_doc_name): :param str metadata_doc_name: Metadata document name :return: Address of the metadata document. - :rtype: str + :rtype: Path """ logging.debug( "FileHashStore - _put_metadata: Request to put metadata for pid: %s", pid @@ -2370,7 +2373,10 @@ def _exists(self, entity, file): except FileNotFoundError: return False if entity == "metadata": - return bool(self._get_hashstore_metadata_path(file)) + try: + return bool(self._get_hashstore_metadata_path(file)) + except FileNotFoundError: + return False def _open(self, entity, file, mode="rb"): """Return open buffer object from given id or path. Caller is responsible @@ -2403,27 +2409,32 @@ def _delete(self, entity, file): :param str entity: Desired entity type (ex. "objects", "metadata"). :param str file: Address ID or path of file. 
""" - if entity == "tmp": - realpath = file - elif entity == "objects": - realpath = self._get_hashstore_data_object_path(file) - elif entity == "metadata": - realpath = self._get_hashstore_metadata_path(file) - elif os.path.exists(file): - # Check if the given path is an absolute path - realpath = file - else: - raise IOError(f"FileHashStore - delete(): Could not locate file: {file}") + try: + if entity == "tmp": + realpath = file + elif entity == "objects": + realpath = self._get_hashstore_data_object_path(file) + elif entity == "metadata": + realpath = self._get_hashstore_metadata_path(file) + elif os.path.exists(file): + # Check if the given path is an absolute path + realpath = file + else: + raise IOError( + f"FileHashStore - delete(): Could not locate file: {file}" + ) + except FileNotFoundError: + realpath = None - if realpath is not None: - try: + try: + if realpath is not None: os.remove(realpath) - except OSError as err: - exception_string = ( - f"FileHashStore - delete(): Unexpected {err=}, {type(err)=}" - ) - logging.error(exception_string) - raise err + except OSError as err: + exception_string = ( + f"FileHashStore - delete(): Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise err @staticmethod def _rename_path_for_deletion(path): @@ -2464,44 +2475,57 @@ def _build_hashstore_data_object_path(self, hash_id): absolute_path = os.path.join(root_dir, *paths) return absolute_path - def _get_hashstore_data_object_path(self, cid_or_path): - """Return the expected path to a hashstore data object that exists. + def _get_hashstore_data_object_path(self, cid_or_relative_path): + """Get the expected path to a hashstore data object that exists using a content identifier. 
- :param str cid_or_path: Content identifier or path to check + :param str cid_or_relative_path: Content identifier :return: Path to the data object referenced by the pid :rtype: Path """ - expected_abs_path = self._build_hashstore_data_object_path(cid_or_path) - - if os.path.isfile(expected_abs_path): - return expected_abs_path + expected_abs_data_obj_path = self._build_hashstore_data_object_path( + cid_or_relative_path + ) + if os.path.isfile(expected_abs_data_obj_path): + return expected_abs_data_obj_path else: - # Check the relative path, for usage convenience - rel_root = self.objects - relpath = os.path.join(rel_root, cid_or_path) - if os.path.isfile(relpath): - return relpath + if os.path.isfile(cid_or_relative_path): + # Check whether the supplied arg is an abs path that exists or not for convenience + return cid_or_relative_path else: - raise FileNotFoundError( - "FileHashStore - hashstore data object does not exist for cid: " - + cid_or_path - ) + # Check the relative path + relpath = os.path.join(self.objects, cid_or_relative_path) + if os.path.isfile(relpath): + return relpath + else: + raise FileNotFoundError( + "FileHashStore - _get_hashstore_data_object_path: could not locate a " + + "data object in '/objects' for the supplied cid_or_relative_path: " + + cid_or_relative_path + ) - def _get_hashstore_metadata_path(self, metacat_cid_or_path): + def _get_hashstore_metadata_path(self, metadata_relative_path): """Return the expected metadata path to a hashstore metadata object that exists. 
- :param str metacat_cid_or_path: Metadata content identifier or path to check + :param str metadata_relative_path: Metadata path to check :return: Path to the data object referenced by the pid :rtype: Path """ - if os.path.isfile(metacat_cid_or_path): - return metacat_cid_or_path - rel_root = self.metadata - relpath = os.path.join(rel_root, metacat_cid_or_path) - if os.path.isfile(relpath): - return relpath + # Form the absolute path to the metadata file + expected_abs_metadata_path = os.path.join(self.metadata, metadata_relative_path) + if os.path.isfile(expected_abs_metadata_path): + return expected_abs_metadata_path + else: + if os.path.isfile(metadata_relative_path): + # Check whether the supplied arg is an abs path that exists or not for convenience + return metadata_relative_path + else: + raise FileNotFoundError( + "FileHashStore - _get_hashstore_metadata_path: could not locate a " + + "metadata object in '/metadata' for the supplied metadata_relative_path: " + + metadata_relative_path + ) def _get_hashstore_pid_refs_path(self, pid): """Return the expected path to a pid reference file. The path may or may not exist. 
@@ -2511,6 +2535,7 @@ def _get_hashstore_pid_refs_path(self, pid): :return: Path to pid reference file :rtype: Path """ + # The pid refs file is named after the hash of the pid using the store's algorithm hash_id = self._computehash(pid, self.algorithm) root_dir = self._get_store_path("pid") directories_and_path = self._shard(hash_id) @@ -2526,6 +2551,7 @@ def _get_hashstore_cid_refs_path(self, cid): :rtype: Path """ root_dir = self._get_store_path("cid") + # The content identifier is to be split into directories as is supplied directories_and_path = self._shard(cid) cid_ref_file_abs_path = os.path.join(root_dir, *directories_and_path) return cid_ref_file_abs_path diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 86cfb83e..60f6eae9 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -731,7 +731,7 @@ def test_store_metadata(pids, store): full_path = ( store._get_store_path("metadata") / rel_path / metadata_document_name ) - assert metadata_cid == full_path + assert metadata_cid == str(full_path) assert store._count(entity) == 3 @@ -756,9 +756,9 @@ def test_store_metadata_one_pid_multiple_docs_correct_location(store): full_path = store._get_store_path("metadata") / rel_path / metadata_document_name full_path3 = store._get_store_path("metadata") / rel_path / metadata_document_name3 full_path4 = store._get_store_path("metadata") / rel_path / metadata_document_name4 - assert metadata_cid == full_path - assert metadata_cid3 == full_path3 - assert metadata_cid4 == full_path4 + assert metadata_cid == str(full_path) + assert metadata_cid3 == str(full_path3) + assert metadata_cid4 == str(full_path4) assert store._count(entity) == 3 @@ -777,7 +777,7 @@ def test_store_metadata_default_format_id(pids, store): full_path = ( store._get_store_path("metadata") / rel_path / metadata_document_name ) - assert metadata_cid == full_path + assert metadata_cid == str(full_path) def 
test_store_metadata_files_string(pids, store): From 4218560a915de23fb0bb6e569dd34afb76f9ae79 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 11 Sep 2024 15:31:24 -0700 Subject: [PATCH 321/420] Add context and revise '_shard' docstring so that it is easier to understand --- src/hashstore/filehashstore.py | 37 +++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index c36b42ed..72ddb59e 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1206,7 +1206,6 @@ def delete_metadata(self, pid, format_id=None): full_path_without_directory = ( self.metadata + "/" + rel_path + "/" + pid_doc ) - print("DOU full_path_without_directory: " + full_path_without_directory) self._delete("metadata", full_path_without_directory) info_string = ( "FileHashStore - delete_metadata: Successfully deleted metadata for pid:" @@ -1264,7 +1263,6 @@ def _store_and_validate_data( self, pid, file, - extension=None, additional_algorithm=None, checksum=None, checksum_algorithm=None, @@ -1275,7 +1273,6 @@ def _store_and_validate_data( :param str pid: Authority-based identifier. :param mixed file: Readable object or path to file. - :param str extension: Optional extension to append to file when saving. :param str additional_algorithm: Optional algorithm value to include when returning hex digests. 
:param str checksum: Optional checksum to validate object against hex digest before moving @@ -1299,7 +1296,6 @@ def _store_and_validate_data( ) = self._move_and_get_checksums( pid, stream, - extension, additional_algorithm, checksum, checksum_algorithm, @@ -1349,7 +1345,7 @@ def find_object(self, pid): if not self._exists("objects", pid_refs_cid): err_msg = ( f"FileHashStore - find_object: Refs file found for pid ({pid}) at" - + pid_ref_abs_path + + str(pid_ref_abs_path) + f", but object referenced does not exist, cid: {pid_refs_cid}" ) logging.error(err_msg) @@ -1396,7 +1392,7 @@ def find_object(self, pid): else: err_msg = ( f"FileHashStore - find_object: pid refs file not found for pid ({pid}): " - + pid_ref_abs_path + + str(pid_ref_abs_path) ) logging.error(err_msg) raise PidRefsDoesNotExist(err_msg) @@ -1453,7 +1449,6 @@ def _move_and_get_checksums( self, pid, stream, - extension=None, additional_algorithm=None, checksum=None, checksum_algorithm=None, @@ -1469,7 +1464,6 @@ def _move_and_get_checksums( :param str pid: Authority-based identifier. :param Stream stream: Object stream. - :param str extension: Optional extension to append to the file when saving. :param str additional_algorithm: Optional algorithm value to include when returning hex digests. @@ -2015,7 +2009,7 @@ def _verify_object_information( # Delete the object cid = hex_digests[self.algorithm] cid_abs_path = self._get_hashstore_cid_refs_path(cid) - self._delete(entity, cid_abs_path) + self._delete(entity, str(cid_abs_path)) logging.debug(exception_string) raise NonMatchingChecksum(exception_string) @@ -2303,28 +2297,39 @@ def _computehash(self, stream, algorithm=None): hex_digest = hashobj.hexdigest() return hex_digest - def _shard(self, digest): - """Generates a list given a digest of `self.depth` number of tokens with width - `self.width` from the first part of the digest plus the remainder. 
+ def _shard(self, checksum): + """Splits the given checksum into a list of tokens of length `self.width`, followed by + the remainder. + + This method divides the checksum into `self.depth` number of tokens, each with a fixed + width of `self.width`, taken from the beginning of the checksum. Any leftover characters + are added as the final element in the list. Example: + For a checksum of '0d555ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e', + the result may be: ['0d', '55', '5e', 'd77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e'] - :param str digest: The string to be divided into tokens. + :param str checksum: The checksum string to be split into tokens. - :return: A list containing the tokens of fixed width. + :return: A list where each element is a token of fixed width, with any leftover characters as the last element. :rtype: list """ def compact(items): """Return only truthy elements of `items`.""" + # truthy_items = [] + # for item in items: + # if item: + # truthy_items.append(item) + # return truthy_items return [item for item in items if item] # This creates a list of `depth` number of tokens with width # `width` from the first part of the id plus the remainder. 
hierarchical_list = compact( - [digest[i * self.width : self.width * (i + 1)] for i in range(self.depth)] - + [digest[self.depth * self.width :]] + [checksum[i * self.width : self.width * (i + 1)] for i in range(self.depth)] + + [checksum[self.depth * self.width :]] ) return hierarchical_list From 5215cc4ea4861c53cf0053f1be35d1653eabd478 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 11 Sep 2024 16:11:43 -0700 Subject: [PATCH 322/420] Fix docstring length --- src/hashstore/filehashstore.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 72ddb59e..2375e162 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2312,7 +2312,8 @@ def _shard(self, checksum): :param str checksum: The checksum string to be split into tokens. - :return: A list where each element is a token of fixed width, with any leftover characters as the last element. + :return: A list where each element is a token of fixed width, with any leftover + characters as the last element. :rtype: list """ From d1644d0d2d420e2115382cd7d68e0325198c5671 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 11 Sep 2024 16:49:59 -0700 Subject: [PATCH 323/420] Rename 'hashstore' public APi method 'verify_object' to 'delete_invalid_object', revise docstrings, update filehashstore and revise/add new pytests --- README.md | 16 +-- src/hashstore/filehashstore.py | 34 +++--- src/hashstore/hashstore.py | 28 ++--- tests/test_filehashstore_interface.py | 137 +++++++++++++++++++++++++ tests/test_filehashstore_references.py | 108 ------------------- 5 files changed, 178 insertions(+), 145 deletions(-) diff --git a/README.md b/README.md index 9203f171..241990d1 100644 --- a/README.md +++ b/README.md @@ -164,7 +164,7 @@ store and tag an object simultaneously if the relevant data is available. In the identifier (ex. persistent identifier (pid)), `store_object` can be called to solely store an object. 
The client is then expected to call `verify_object` when the relevant metadata is available to confirm that the object has been stored as expected. The client is then expected to call -`delete_if_invalid_object` when the relevant metadata is available to confirm that the object is +`delete_invalid_object` when the relevant metadata is available to confirm that the object is what is expected. And to finalize the process (to make the object discoverable), the client calls `tagObject``. In summary, there are two expected paths to store an object: @@ -177,11 +177,11 @@ hashstore_factory = HashStoreFactory() # Create a properties dictionary with the required fields properties = { - "store_path": "/path/to/your/store", - "store_depth": 3, - "store_width": 2, - "store_algorithm": "SHA-256", - "store_metadata_namespace": "https://ns.dataone.org/service/types/v2.0#SystemMetadata", + "store_path": "/path/to/your/store", + "store_depth": 3, + "store_width": 2, + "store_algorithm": "SHA-256", + "store_metadata_namespace": "https://ns.dataone.org/service/types/v2.0#SystemMetadata", } # Get HashStore from factory @@ -203,8 +203,8 @@ obj_info_allinone = hashstore.store_object(input_stream, pid, additional_algo, c # Manual Process # Store object obj_info_manual = hashstore.store_object(input_stream) -# Validate object, throws exceptions if there is a mismatch and deletes the associated file -hashstore.verify_object(obj_info_manual, checksum, checksum_algo, obj_size) +# Validate object with expected values when available +hashstore.delete_invalid_object(obj_info_manual, checksum, checksum_algo, obj_size) # Tag object, makes the object discoverable (find, retrieve, delete) hashstore.tag_object(pid, obj_info_manual.cid) ``` diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 2375e162..c8315798 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -580,7 +580,7 @@ def store_object( return object_metadata - def verify_object( 
+ def delete_invalid_object( self, object_metadata, checksum, checksum_algorithm, expected_file_size ): self._check_string(checksum, "checksum") @@ -603,16 +603,24 @@ def verify_object( checksum_algorithm_checked = self._clean_algorithm(checksum_algorithm) # Throws exceptions if there's an issue - self._verify_object_information( - pid=None, - checksum=checksum, - checksum_algorithm=checksum_algorithm_checked, - entity="objects", - hex_digests=object_metadata_hex_digests, - tmp_file_name=None, - tmp_file_size=object_metadata_file_size, - file_size_to_validate=expected_file_size, - ) + try: + self._verify_object_information( + pid=None, + checksum=checksum, + checksum_algorithm=checksum_algorithm_checked, + entity="objects", + hex_digests=object_metadata_hex_digests, + tmp_file_name=None, + tmp_file_size=object_metadata_file_size, + file_size_to_validate=expected_file_size, + ) + except NonMatchingObjSize as nmose: + self.delete_object_only(object_metadata.cid) + logging.error(nmose) + raise nmose + except NonMatchingChecksum as mmce: + self.delete_object_only(object_metadata.cid) + raise mmce logging.info( "FileHashStore - verify_object: object has been validated for cid: %s", object_metadata.cid, @@ -2006,10 +2014,6 @@ def _verify_object_information( logging.debug(exception_string_for_pid) raise NonMatchingChecksum(exception_string_for_pid) else: - # Delete the object - cid = hex_digests[self.algorithm] - cid_abs_path = self._get_hashstore_cid_refs_path(cid) - self._delete(entity, str(cid_abs_path)) logging.debug(exception_string) raise NonMatchingChecksum(exception_string) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 386d38ea..13119e09 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -77,20 +77,6 @@ def tag_object(self, pid, cid): """ raise NotImplementedError() - @abstractmethod - def verify_object( - self, object_metadata, checksum, checksum_algorithm, expected_file_size - ): - """Confirm equality of 
content in an ObjectMetadata. The `verify_object` method verifies - that the content in the provided `object_metadata` matches the specified values. - - :param ObjectMetadata object_metadata: ObjectMetadata object. - :param str checksum: Value of the checksum. - :param str checksum_algorithm: Algorithm of the checksum. - :param int expected_file_size: Size of the temporary file. - """ - raise NotImplementedError() - @abstractmethod def store_metadata(self, pid, metadata, format_id): """Add or update metadata, such as `sysmeta`, to disk using the given path/stream. The @@ -149,6 +135,20 @@ def delete_object(self, pid): """ raise NotImplementedError() + @abstractmethod + def delete_invalid_object( + self, object_metadata, checksum, checksum_algorithm, expected_file_size + ): + """Confirm equality of content in an ObjectMetadata. The `delete_invalid_object` method + will delete a data object if the object_metadata does not match the specified values. + + :param ObjectMetadata object_metadata: ObjectMetadata object. + :param str checksum: Value of the checksum. + :param str checksum_algorithm: Algorithm of the checksum. + :param int expected_file_size: Size of the temporary file. + """ + raise NotImplementedError() + @abstractmethod def delete_metadata(self, pid, format_id): """Deletes a metadata document (ex. 
`sysmeta`) permanently from HashStore using a given diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 60f6eae9..71ec548e 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -1100,6 +1100,143 @@ def test_delete_object_pid_none(store): store.delete_object(pid) +def test_delete_invalid_object(pids, store): + """Test delete_invalid_object does not throw exception given good arguments.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + store.delete_invalid_object( + object_metadata, checksum, checksum_algorithm, expected_file_size + ) + assert store._exists("objects", object_metadata.cid) + + +def test_delete_invalid_object_supported_other_algo_not_in_default(pids, store): + """Test delete_invalid_object does not throw exception when supported add algo is supplied.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + supported_algo = "sha224" + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum = pids[pid][supported_algo] + expected_file_size = object_metadata.obj_size + store.delete_invalid_object( + object_metadata, checksum, supported_algo, expected_file_size + ) + assert store._exists("objects", object_metadata.cid) + + +def test_delete_invalid_object_exception_incorrect_object_metadata_type(pids, store): + """Test delete_invalid_object throws exception when incorrect class type is given to + object_metadata arg.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + 
expected_file_size = object_metadata.obj_size + with pytest.raises(ValueError): + store.delete_invalid_object( + "not_object_metadata", checksum, checksum_algorithm, expected_file_size + ) + + +def test_delete_invalid_object_exception_incorrect_size(pids, store): + """Test delete_invalid_object throws exception when incorrect size is supplied and that data + object is deleted as we are storing without a pid.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + + with pytest.raises(NonMatchingObjSize): + store.delete_invalid_object( + object_metadata, checksum, checksum_algorithm, 1000 + ) + + assert not store._exists("objects", object_metadata.cid) + + +def test_delete_invalid_object_exception_incorrect_size_object_exists(pids, store): + """Test delete_invalid_object throws exception when incorrect size is supplied and that data + object is not deleted since it already exists (a cid refs file is present).""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + store.store_object(pid, data=path) + # Store again without pid and wrong object size + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + + with pytest.raises(NonMatchingObjSize): + store.delete_invalid_object( + object_metadata, checksum, checksum_algorithm, 1000 + ) + + assert store._exists("objects", object_metadata.cid) + assert store._count("tmp") == 0 + + +def test_delete_invalid_object_exception_incorrect_checksum(pids, store): + """Test delete_invalid_object throws exception when incorrect checksum is supplied.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + 
pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + + with pytest.raises(NonMatchingChecksum): + store.delete_invalid_object( + object_metadata, "abc123", checksum_algorithm, expected_file_size + ) + + assert not store._exists("objects", object_metadata.cid) + + +def test_delete_invalid_object_exception_incorrect_checksum_algo(pids, store): + """Test delete_invalid_object throws exception when unsupported algorithm is supplied.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum = object_metadata.hex_digests.get(store.algorithm) + expected_file_size = object_metadata.obj_size + with pytest.raises(UnsupportedAlgorithm): + store.delete_invalid_object( + object_metadata, checksum, "md2", expected_file_size + ) + + assert store._exists("objects", object_metadata.cid) + assert store._count("tmp") == 0 + + +def test_delete_invalid_object_exception_supported_other_algo_bad_checksum(pids, store): + """Test delete_invalid_object throws exception when incorrect checksum is supplied.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum = object_metadata.hex_digests.get(store.algorithm) + expected_file_size = object_metadata.obj_size + with pytest.raises(NonMatchingChecksum): + store.delete_invalid_object( + object_metadata, checksum, "sha224", expected_file_size + ) + + assert not store._exists("objects", object_metadata.cid) + + def test_delete_metadata(pids, store): """Test delete_metadata successfully deletes metadata.""" test_dir = "tests/testdata/" diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 9d67993c..30040292 100644 --- a/tests/test_filehashstore_references.py +++ 
b/tests/test_filehashstore_references.py @@ -157,114 +157,6 @@ def test_tag_object_pid_refs_not_found_cid_refs_found(store): assert store._count("cid") == 1 -def test_verify_object(pids, store): - """Test verify_object does not throw exception given good arguments.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - checksum = object_metadata.hex_digests.get(store.algorithm) - checksum_algorithm = store.algorithm - expected_file_size = object_metadata.obj_size - store.verify_object( - object_metadata, checksum, checksum_algorithm, expected_file_size - ) - - -def test_verify_object_supported_other_algo_not_in_default(pids, store): - """Test verify_object throws exception when incorrect algorithm is supplied.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - supported_algo = "sha224" - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - checksum = pids[pid][supported_algo] - expected_file_size = object_metadata.obj_size - store.verify_object( - object_metadata, checksum, supported_algo, expected_file_size - ) - - -def test_verify_object_exception_incorrect_object_metadata_type(pids, store): - """Test verify_object throws exception when incorrect class type is given to - object_metadata arg.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - checksum = object_metadata.hex_digests.get(store.algorithm) - checksum_algorithm = store.algorithm - expected_file_size = object_metadata.obj_size - with pytest.raises(ValueError): - store.verify_object( - "bad_type", checksum, checksum_algorithm, expected_file_size - ) - - -def test_verify_object_exception_incorrect_size(pids, store): - """Test verify_object throws exception when incorrect size is supplied.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = 
test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - checksum = object_metadata.hex_digests.get(store.algorithm) - checksum_algorithm = store.algorithm - - with pytest.raises(NonMatchingObjSize): - store.verify_object(object_metadata, checksum, checksum_algorithm, 1000) - - cid = object_metadata.cid - cid = object_metadata.hex_digests[store.algorithm] - cid_abs_path = store._get_hashstore_cid_refs_path(cid) - assert not os.path.exists(cid_abs_path) - - -def test_verify_object_exception_incorrect_checksum(pids, store): - """Test verify_object throws exception when incorrect checksum is supplied.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - cid = object_metadata.cid - store.tag_object(pid, cid) - checksum_algorithm = store.algorithm - expected_file_size = object_metadata.obj_size - - with pytest.raises(NonMatchingChecksum): - store.verify_object( - object_metadata, "abc123", checksum_algorithm, expected_file_size - ) - - cid = object_metadata.cid - cid = object_metadata.hex_digests[store.algorithm] - cid_abs_path = store._get_hashstore_cid_refs_path(cid) - assert not os.path.exists(cid_abs_path) - - -def test_verify_object_exception_incorrect_checksum_algo(pids, store): - """Test verify_object throws exception when unsupported algorithm is supplied.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - checksum = object_metadata.hex_digests.get(store.algorithm) - expected_file_size = object_metadata.obj_size - with pytest.raises(UnsupportedAlgorithm): - store.verify_object(object_metadata, checksum, "md2", expected_file_size) - - -def test_verify_object_exception_supported_other_algo_bad_checksum(pids, store): - """Test verify_object throws exception when incorrect checksum is supplied.""" - test_dir = "tests/testdata/" - for pid 
in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - checksum = object_metadata.hex_digests.get(store.algorithm) - expected_file_size = object_metadata.obj_size - with pytest.raises(NonMatchingChecksum): - store.verify_object(object_metadata, checksum, "sha224", expected_file_size) - - def test_write_refs_file_ref_type_cid(store): """Test that write_refs_file writes a reference file.""" tmp_root_path = store._get_store_path("refs") / "tmp" From 75359496ee4e149a4823e8bcfc9f70934da1087a Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 11 Sep 2024 16:51:57 -0700 Subject: [PATCH 324/420] Make 'delete_object_only' a private method by renaming it to '_delete_object_only' and update pytests --- src/hashstore/filehashstore.py | 6 +++--- tests/test_filehashstore.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index c8315798..63c5e77a 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -615,11 +615,11 @@ def delete_invalid_object( file_size_to_validate=expected_file_size, ) except NonMatchingObjSize as nmose: - self.delete_object_only(object_metadata.cid) + self._delete_object_only(object_metadata.cid) logging.error(nmose) raise nmose except NonMatchingChecksum as mmce: - self.delete_object_only(object_metadata.cid) + self._delete_object_only(object_metadata.cid) raise mmce logging.info( "FileHashStore - verify_object: object has been validated for cid: %s", @@ -2084,7 +2084,7 @@ def _verify_hashstore_references( logging.error(exception_string) raise CidRefsContentError(exception_string) - def delete_object_only(self, cid): + def _delete_object_only(self, cid): """Attempt to delete an object based on the given content identifier (cid). If the object has any pids references and/or a cid refs file exists, the object will not be deleted. 
diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 2a013481..b0998482 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1024,18 +1024,18 @@ def test_open_objects(pids, store): def test_delete_object_only(pids, store): - """Test delete_object successfully deletes only object.""" + """Test _delete_object successfully deletes only object.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(pid=None, data=path) - store.delete_object_only(object_metadata.cid) + store._delete_object_only(object_metadata.cid) assert store._count(entity) == 0 def test_delete_object_only_cid_refs_file_exists(pids, store): - """Test delete_object does not delete object if a cid refs file still exists.""" + """Test _delete_object does not delete object if a cid refs file still exists.""" test_dir = "tests/testdata/" entity = "objects" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" @@ -1045,7 +1045,7 @@ def test_delete_object_only_cid_refs_file_exists(pids, store): syspath = Path(test_dir) / filename object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) - store.delete_object_only(object_metadata.cid) + store._delete_object_only(object_metadata.cid) assert store._count(entity) == 3 assert store._count("pid") == 3 assert store._count("cid") == 3 From ca3f1b8456ea5680ca68f52e14f9e8da0840ed96 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 11 Sep 2024 16:54:09 -0700 Subject: [PATCH 325/420] Make 'find_object' a private method by renaming it to '_find_object' and update pytests --- src/hashstore/filehashstore.py | 8 ++++---- tests/test_filehashstore.py | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 63c5e77a..9bebddf9 100644 --- 
a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -854,7 +854,7 @@ def retrieve_object(self, pid): ) self._check_string(pid, "pid") - object_info_dict = self.find_object(pid) + object_info_dict = self._find_object(pid) object_cid = object_info_dict.get("cid") entity = "objects" @@ -946,7 +946,7 @@ def delete_object(self, pid): # `find_object` which will throw custom exceptions if there is an issue with # the reference files, which help us determine the path to proceed with. try: - object_info_dict = self.find_object(pid) + object_info_dict = self._find_object(pid) cid = object_info_dict.get("cid") # Proceed with next steps - cid has been retrieved without any issues @@ -1248,7 +1248,7 @@ def get_hex_digest(self, pid, algorithm): entity = "objects" algorithm = self._clean_algorithm(algorithm) - object_cid = self.find_object(pid).get("cid") + object_cid = self._find_object(pid).get("cid") if not self._exists(entity, object_cid): exception_string = ( f"FileHashStore - get_hex_digest: No object found for pid: {pid}" @@ -1319,7 +1319,7 @@ def _store_and_validate_data( ) return object_metadata - def find_object(self, pid): + def _find_object(self, pid): """Check if an object referenced by a pid exists and retrieve its content identifier. The `find_object` method validates the existence of an object based on the provided pid and returns the associated content identifier. 
diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index b0998482..1eca2fe3 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -826,7 +826,7 @@ def test_find_object(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(pid, path) - obj_info_dict = store.find_object(pid) + obj_info_dict = store._find_object(pid) assert obj_info_dict.get("cid") == object_metadata.hex_digests.get("sha256") @@ -837,12 +837,12 @@ def test_find_object_refs_exist_but_obj_not_found(pids, store): path = test_dir + pid.replace("/", "_") store.store_object(pid, path) - cid = store.find_object(pid).get("cid") + cid = store._find_object(pid).get("cid") obj_path = store._get_hashstore_data_object_path(cid) os.remove(obj_path) with pytest.raises(RefsFileExistsButCidObjMissing): - store.find_object(pid) + store._find_object(pid) def test_find_object_cid_refs_not_found(pids, store): @@ -861,7 +861,7 @@ def test_find_object_cid_refs_not_found(pids, store): pid_ref_file.truncate() with pytest.raises(CidRefsDoesNotExist): - store.find_object(pid) + store._find_object(pid) def test_find_object_cid_refs_does_not_contain_pid(pids, store): @@ -879,25 +879,25 @@ def test_find_object_cid_refs_does_not_contain_pid(pids, store): store._update_refs_file(cid_ref_abs_path, pid, "remove") with pytest.raises(PidNotFoundInCidRefsFile): - store.find_object(pid) + store._find_object(pid) def test_find_object_pid_refs_not_found(store): """Test find object throws exception when object doesn't exist.""" with pytest.raises(PidRefsDoesNotExist): - store.find_object("dou.test.1") + store._find_object("dou.test.1") def test_find_object_pid_none(store): """Test find object throws exception when pid is None.""" with pytest.raises(ValueError): - store.find_object(None) + store._find_object(None) def test_find_object_pid_empty(store): """Test find object throws exception when pid is empty.""" with 
pytest.raises(ValueError): - store.find_object("") + store._find_object("") def test_clean_algorithm(store): From 93d81ebb66a0cc2a519da6e1c9b0b1559bf0ee1d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 11 Sep 2024 16:59:39 -0700 Subject: [PATCH 326/420] Revise docstrings to resolve linting warnings --- src/hashstore/filehashstore.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 9bebddf9..6dccac31 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1772,7 +1772,7 @@ def _write_refs_file(self, path, ref_id, ref_type): def _update_refs_file(self, refs_file_path, ref_id, update_type): """Add or remove an existing ref from a refs file. - :param str refs_file_path: Absolute path to the refs file. + :param path refs_file_path: Absolute path to the refs file. :param str ref_id: Authority-based or persistent identifier of the object. :param str update_type: 'add' or 'remove' """ @@ -1829,7 +1829,7 @@ def _is_string_in_refs_file(ref_id, refs_file_path): """Check a reference file for a ref_id (`cid` or `pid`). :param str ref_id: Authority-based, persistent identifier or content identifier - :param str refs_file_path: Path to the refs file + :param path refs_file_path: Path to the refs file :return: pid_found :rtype: boolean @@ -2030,8 +2030,8 @@ def _verify_hashstore_references( :param str pid: Authority-based or persistent identifier. :param str cid: Content identifier. 
- :param str pid_refs_path: Path to pid refs file - :param str cid_refs_path: Path to cid refs file + :param path pid_refs_path: Path to pid refs file + :param path cid_refs_path: Path to cid refs file :param str additional_log_string: String to append to exception statement """ debug_msg = ( From 79ab4f08dd2d27a3ee892da163ec530acd736af2 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 11 Sep 2024 17:02:32 -0700 Subject: [PATCH 327/420] Cleanup pytests for unused and redundant code --- tests/test_filehashstore.py | 8 +------- tests/test_filehashstore_references.py | 5 +---- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 1eca2fe3..6782e15e 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1075,17 +1075,14 @@ def test_create_path(pids, store): def test_get_real_path_file_does_not_exist(store): """Test get_real_path returns None when object does not exist.""" - entity = "objects" test_path = "tests/testdata/helloworld.txt" with pytest.raises(FileNotFoundError): - real_path_exists = store._get_hashstore_data_object_path(test_path) - # assert real_path_exists is None + store._get_hashstore_data_object_path(test_path) def test_get_real_path_with_object_id(store, pids): """Test get_real_path returns absolute path given an object id.""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store._store_and_validate_data(pid, path) @@ -1096,7 +1093,6 @@ def test_get_real_path_with_object_id(store, pids): def test_get_real_path_with_object_id_sharded(pids, store): """Test exists method with a sharded path (relative path).""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store._store_and_validate_data(pid, path) @@ -1108,7 +1104,6 @@ def test_get_real_path_with_object_id_sharded(pids, store): def 
test_get_real_path_with_metadata_id(store, pids): """Test get_real_path returns absolute path given a metadata id.""" - entity = "metadata" test_dir = "tests/testdata/" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): @@ -1122,7 +1117,6 @@ def test_get_real_path_with_metadata_id(store, pids): def test_build_hashstore_data_object_path(store, pids): """Test _build_hashstore_data_object_path builds the hashstore data object file path.""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") _ = store._store_and_validate_data(pid, path) diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 30040292..fa117d70 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -8,12 +8,9 @@ CidRefsContentError, CidRefsFileNotFound, HashStoreRefsAlreadyExists, - NonMatchingChecksum, - NonMatchingObjSize, PidAlreadyExistsError, PidRefsContentError, PidRefsFileNotFound, - UnsupportedAlgorithm, ) # pylint: disable=W0212 @@ -79,7 +76,7 @@ def test_tag_object_cid_refs_file_content(pids, store): def test_tag_object_pid_refs_found_cid_refs_found(pids, store): - """Test tag_object does not throws exception when the refs files already exist + """Test tag_object does not throw an exception when any refs file already exists and verifies the content, and does not double tag the cid refs file.""" test_dir = "tests/testdata/" for pid in pids.keys(): From 791c17ea754123106b7756932682311450f28d5c Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 11 Sep 2024 17:38:45 -0700 Subject: [PATCH 328/420] Clean up methods '_store_data_only', '_find_object' and '_store_and_validate_data' and review/revise/add pytests --- src/hashstore/filehashstore.py | 18 ++++--- tests/test_filehashstore.py | 94 ++++++++++++++++++---------------- 2 files changed, 60 insertions(+), 52 deletions(-) diff --git 
a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 6dccac31..ad0bb3cf 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1385,8 +1385,8 @@ def _find_object(self, pid): # If not, it is an orphan pid refs file err_msg = ( "FileHashStore - find_object: pid refs file exists with cid: " - + f"{pid_refs_cid} for pid: {pid}" - + f", but is missing from cid refs file: {cid_ref_abs_path}" + + f"{pid_refs_cid} for pid: {pid} but is missing from cid refs file:" + + str(cid_ref_abs_path) ) logging.error(err_msg) raise PidNotFoundInCidRefsFile(err_msg) @@ -1406,9 +1406,10 @@ def _find_object(self, pid): raise PidRefsDoesNotExist(err_msg) def _store_data_only(self, data): - """Store an object to HashStore and return the ID and a hex digest - dictionary of the default algorithms. This method does not validate the - object and writes directly to `/objects` after the hex digests are calculated. + """Store an object to HashStore and return the a metadata object containing the content + identifier, object file size and hex digests dictionary of the default algorithms. This + method does not validate the object and writes directly to `/objects` after the hex + digests are calculated. :param mixed data: String or path to object. 
@@ -1429,13 +1430,16 @@ def _store_data_only(self, data): # Get the hex digest dictionary with closing(stream): ( - object_ref_pid_location, + object_cid, obj_file_size, hex_digest_dict, ) = self._move_and_get_checksums(None, stream) object_metadata = ObjectMetadata( - None, object_ref_pid_location, obj_file_size, hex_digest_dict + "HashStoreNoPid", + object_cid, + obj_file_size, + hex_digest_dict, ) # The permanent address of the data stored is based on the data's checksum cid = hex_digest_dict.get(self.algorithm) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 6782e15e..8931b02c 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -227,29 +227,27 @@ def test_set_default_algorithms_missing_yaml(store, pids): def test_store_and_validate_data_files_path(pids, store): - """Test _store_and_validate_data with path object for the path arg.""" + """Test _store_and_validate_data accepts path object for the path arg.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = Path(test_dir) / pid.replace("/", "_") object_metadata = store._store_and_validate_data(pid, path) - object_metadata_id = object_metadata.cid - assert store._exists(entity, object_metadata_id) + assert store._exists(entity, object_metadata.cid) def test_store_and_validate_data_files_string(pids, store): - """Test _store_and_validate_data with string for the path arg.""" + """Test _store_and_validate_data accepts string for the path arg.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store._store_and_validate_data(pid, path) - object_metadata_id = object_metadata.cid - assert store._exists(entity, object_metadata_id) + assert store._exists(entity, object_metadata.cid) def test_store_and_validate_data_files_stream(pids, store): - """Test _store_and_validate_data with stream for the path arg.""" + """Test _store_and_validate_data accepts stream 
for the path arg.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -257,19 +255,17 @@ def test_store_and_validate_data_files_stream(pids, store): input_stream = io.open(path, "rb") object_metadata = store._store_and_validate_data(pid, input_stream) input_stream.close() - object_metadata_id = object_metadata.cid - assert store._exists(entity, object_metadata_id) + assert store._exists(entity, object_metadata.cid) assert store._count(entity) == 3 def test_store_and_validate_data_cid(pids, store): - """Check _store_and_validate_data returns correct id.""" + """Check _store_and_validate_data returns the expected content identifier""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store._store_and_validate_data(pid, path) - object_metadata_id = object_metadata.cid - assert object_metadata_id == pids[pid][store.algorithm] + assert object_metadata.cid == pids[pid][store.algorithm] def test_store_and_validate_data_file_size(pids, store): @@ -278,8 +274,7 @@ def test_store_and_validate_data_file_size(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store._store_and_validate_data(pid, path) - object_size = object_metadata.obj_size - assert object_size == pids[pid]["file_size_bytes"] + assert object_metadata.obj_size == pids[pid]["file_size_bytes"] def test_store_and_validate_data_hex_digests(pids, store): @@ -288,17 +283,16 @@ def test_store_and_validate_data_hex_digests(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store._store_and_validate_data(pid, path) - object_metadata_hex_digests = object_metadata.hex_digests - assert object_metadata_hex_digests.get("md5") == pids[pid]["md5"] - assert object_metadata_hex_digests.get("sha1") == pids[pid]["sha1"] - assert object_metadata_hex_digests.get("sha256") == pids[pid]["sha256"] - assert object_metadata_hex_digests.get("sha384") == 
pids[pid]["sha384"] - assert object_metadata_hex_digests.get("sha512") == pids[pid]["sha512"] + assert object_metadata.hex_digests.get("md5") == pids[pid]["md5"] + assert object_metadata.hex_digests.get("sha1") == pids[pid]["sha1"] + assert object_metadata.hex_digests.get("sha256") == pids[pid]["sha256"] + assert object_metadata.hex_digests.get("sha384") == pids[pid]["sha384"] + assert object_metadata.hex_digests.get("sha512") == pids[pid]["sha512"] def test_store_and_validate_data_additional_algorithm(pids, store): - """Check _store_and_validate_data returns additional algorithm in hex digests - when provided an additional algo value.""" + """Check _store_and_validate_data returns an additional algorithm in hex digests + when provided with an additional algo value.""" test_dir = "tests/testdata/" for pid in pids.keys(): algo = "sha224" @@ -306,13 +300,13 @@ def test_store_and_validate_data_additional_algorithm(pids, store): object_metadata = store._store_and_validate_data( pid, path, additional_algorithm=algo ) - hex_digests = object_metadata.hex_digests - sha224_hash = hex_digests.get(algo) + sha224_hash = object_metadata.hex_digests.get(algo) assert sha224_hash == pids[pid][algo] def test_store_and_validate_data_with_correct_checksums(pids, store): - """Check _store_and_validate_data with valid checksum and checksum algorithm supplied.""" + """Check _store_and_validate_data stores a data object when a valid checksum and checksum + algorithm is supplied.""" test_dir = "tests/testdata/" for pid in pids.keys(): algo = "sha224" @@ -325,7 +319,7 @@ def test_store_and_validate_data_with_correct_checksums(pids, store): def test_store_and_validate_data_with_incorrect_checksum(pids, store): - """Check _store_and_validate_data fails when a bad checksum supplied.""" + """Check _store_and_validate_data does not store data objects when a bad checksum supplied.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -345,8 +339,7 @@ def 
test_store_data_only_cid(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store._store_data_only(path) - object_metadata_id = object_metadata.cid - assert object_metadata_id == pids[pid][store.algorithm] + assert object_metadata.cid == pids[pid][store.algorithm] def test_store_data_only_file_size(pids, store): @@ -355,8 +348,7 @@ def test_store_data_only_file_size(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store._store_data_only(path) - object_size = object_metadata.obj_size - assert object_size == pids[pid]["file_size_bytes"] + assert object_metadata.obj_size == pids[pid]["file_size_bytes"] def test_store_data_only_hex_digests(pids, store): @@ -365,12 +357,11 @@ def test_store_data_only_hex_digests(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store._store_data_only(path) - object_metadata_hex_digests = object_metadata.hex_digests - assert object_metadata_hex_digests.get("md5") == pids[pid]["md5"] - assert object_metadata_hex_digests.get("sha1") == pids[pid]["sha1"] - assert object_metadata_hex_digests.get("sha256") == pids[pid]["sha256"] - assert object_metadata_hex_digests.get("sha384") == pids[pid]["sha384"] - assert object_metadata_hex_digests.get("sha512") == pids[pid]["sha512"] + assert object_metadata.hex_digests.get("md5") == pids[pid]["md5"] + assert object_metadata.hex_digests.get("sha1") == pids[pid]["sha1"] + assert object_metadata.hex_digests.get("sha256") == pids[pid]["sha256"] + assert object_metadata.hex_digests.get("sha384") == pids[pid]["sha384"] + assert object_metadata.hex_digests.get("sha512") == pids[pid]["sha512"] def test_move_and_get_checksums_id(pids, store): @@ -821,17 +812,30 @@ def test_verify_object_information_missing_key_in_hex_digests_supported_algo( def test_find_object(pids, store): - """Test find_object returns the correct content identifier (cid).""" + """Test _find_object returns 
the correct content.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(pid, path) obj_info_dict = store._find_object(pid) - assert obj_info_dict.get("cid") == object_metadata.hex_digests.get("sha256") + retrieved_cid = obj_info_dict["cid"] + + assert retrieved_cid == object_metadata.hex_digests.get("sha256") + + data_object_path = store._get_hashstore_data_object_path(retrieved_cid) + assert data_object_path == obj_info_dict["cid_object_path"] + + cid_refs_path = store._get_hashstore_cid_refs_path(retrieved_cid) + assert cid_refs_path == obj_info_dict["cid_refs_path"] + + pid_refs_path = store._get_hashstore_pid_refs_path(pid) + assert pid_refs_path == obj_info_dict["pid_refs_path"] + + assert obj_info_dict["sysmeta_path"] == "Does not exist." def test_find_object_refs_exist_but_obj_not_found(pids, store): - """Test find_object throws exception when refs file exist but the object does not.""" + """Test _find_object throws exception when refs file exist but the object does not.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -846,8 +850,8 @@ def test_find_object_refs_exist_but_obj_not_found(pids, store): def test_find_object_cid_refs_not_found(pids, store): - """Test find_object throws exception when pid refs file is found with a cid - but the cid does not exist.""" + """Test _find_object throws exception when pid refs file is found (and contains a cid) + but the cid refs file does not exist.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -865,7 +869,7 @@ def test_find_object_cid_refs_not_found(pids, store): def test_find_object_cid_refs_does_not_contain_pid(pids, store): - """Test find_object throws exception when pid refs file is found with a cid + """Test _find_object throws exception when pid refs file is found (and contains a cid) but the cid refs file does not contain the 
pid.""" test_dir = "tests/testdata/" for pid in pids.keys(): @@ -883,19 +887,19 @@ def test_find_object_cid_refs_does_not_contain_pid(pids, store): def test_find_object_pid_refs_not_found(store): - """Test find object throws exception when object doesn't exist.""" + """Test _find_object throws exception when a pid refs file does not exist.""" with pytest.raises(PidRefsDoesNotExist): store._find_object("dou.test.1") def test_find_object_pid_none(store): - """Test find object throws exception when pid is None.""" + """Test _find_object throws exception when pid is None.""" with pytest.raises(ValueError): store._find_object(None) def test_find_object_pid_empty(store): - """Test find object throws exception when pid is empty.""" + """Test _find_object throws exception when pid is empty.""" with pytest.raises(ValueError): store._find_object("") From 1c7b988b70e01ac5ee6378d7c7e88b0671ed7420 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 12 Sep 2024 15:27:49 -0700 Subject: [PATCH 329/420] Add new custom exception 'StoreObjectForPidAlreadyInProgress', refactor 'store_object' to reject duplicate calls to store objects for a pid already in progress and update pytest --- src/hashstore/filehashstore.py | 49 ++++++++++++++--------- src/hashstore/filehashstore_exceptions.py | 10 +++++ tests/test_filehashstore_interface.py | 5 ++- 3 files changed, 44 insertions(+), 20 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index ad0bb3cf..97ba1461 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -30,6 +30,7 @@ PidRefsFileNotFound, RefsFileExistsButCidObjMissing, UnsupportedAlgorithm, + StoreObjectForPidAlreadyInProgress, ) @@ -510,23 +511,24 @@ def store_object( sync_begin_debug_msg = ( f"FileHashStore - store_object: Adding pid ({pid}) to locked list." ) - sync_wait_msg = ( - f"FileHashStore - store_object: Pid ({pid}) is locked. Waiting." 
+ err_msg = ( + f"FileHashStore - store_object: Duplicate object request encountered for pid: " + f"{pid}" + ". Already in progress." ) if self.use_multiprocessing: with self.object_condition_mp: # Wait for the pid to release if it's in use - while pid in self.object_locked_pids_mp: - logging.debug(sync_wait_msg) - self.object_condition_mp.wait() + if pid in self.object_locked_pids_mp: + logging.error(err_msg) + raise StoreObjectForPidAlreadyInProgress(err_msg) # Modify object_locked_pids consecutively logging.debug(sync_begin_debug_msg) self.object_locked_pids_mp.append(pid) else: with self.object_condition: - while pid in self.object_locked_pids: - logging.debug(sync_wait_msg) - self.object_condition.wait() + if pid in self.object_locked_pids: + logging.error(err_msg) + raise StoreObjectForPidAlreadyInProgress(err_msg) logging.debug(sync_begin_debug_msg) self.object_locked_pids.append(pid) try: @@ -566,17 +568,8 @@ def store_object( f"FileHashStore - store_object: Releasing pid ({pid})" + " from locked list" ) - if self.use_multiprocessing: - with self.object_condition_mp: - logging.debug(end_sync_debug_msg) - self.object_locked_pids_mp.remove(pid) - self.object_condition_mp.notify() - else: - # Release pid - with self.object_condition: - logging.debug(end_sync_debug_msg) - self.object_locked_pids.remove(pid) - self.object_condition.notify() + self.release_object_locked_pids(pid) + logging.debug(end_sync_debug_msg) return object_metadata @@ -2594,6 +2587,24 @@ def _get_store_path(self, entity): f"entity: {entity} does not exist. Do you mean 'objects', 'metadata' or 'refs'?" ) + # Synchronization Methods + + def release_object_locked_pids(self, pid): + """Remove the given persistent identifier from 'object_locked_pids' and notify other + waiting threads or processes. 
+ + :param str pid: Persistent or authority-based identifier + """ + if self.use_multiprocessing: + with self.object_condition_mp: + self.object_locked_pids_mp.remove(pid) + self.object_condition_mp.notify() + else: + # Release pid + with self.object_condition: + self.object_locked_pids.remove(pid) + self.object_condition.notify() + @staticmethod def _get_file_paths(directory): """Get the file paths of a given directory if it exists diff --git a/src/hashstore/filehashstore_exceptions.py b/src/hashstore/filehashstore_exceptions.py index 7556c3f4..8da80fb2 100644 --- a/src/hashstore/filehashstore_exceptions.py +++ b/src/hashstore/filehashstore_exceptions.py @@ -1,6 +1,16 @@ """FileHashStore custom exception module.""" +class StoreObjectForPidAlreadyInProgress(Exception): + """Custom exception thrown when called to store a data object for a pid that is already + progress. A pid can only ever reference one data object/content identifier so duplicate + requests are rejected immediately.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors + + class CidRefsContentError(Exception): """Custom exception thrown when verifying reference files and a cid refs file does not have a pid that is expected to be found.""" diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 71ec548e..8bf35ab8 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -501,7 +501,10 @@ def store_object_wrapper(obj_pid, obj_path): store.store_object(obj_pid, obj_path) # Call store_object inside the thread # pylint: disable=W0718 except Exception as e: - assert type(e).__name__ == "HashStoreRefsAlreadyExists" + assert ( + type(e).__name__ == "HashStoreRefsAlreadyExists" + or type(e).__name__ == "StoreObjectForPidAlreadyInProgress" + ) thread1 = Thread(target=store_object_wrapper, args=(pid, path)) thread2 = Thread(target=store_object_wrapper, args=(pid, path)) From 
4a18ea8ee3727444df5247e826d9942c6e9cd647 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 12 Sep 2024 15:47:32 -0700 Subject: [PATCH 330/420] Refactor 'tag_object' scenario when both refs files exist to also include additional debug info if there is an issue --- src/hashstore/filehashstore.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 97ba1461..954f6add 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -658,19 +658,28 @@ def tag_object(self, pid, cid): self._create_path(Path(os.path.dirname(cid_refs_path))) if os.path.exists(pid_refs_path) and os.path.exists(cid_refs_path): - self._verify_hashstore_references( - pid, - cid, - pid_refs_path, - cid_refs_path, - "Refs file already exists, verifying.", - ) - error_msg = ( + # If both reference files exist, we confirm that reference files are where they + # are expected to be and throw an exception to inform the client that everything + # is in place - and include other issues for context + err_msg = ( f"FileHashStore - tag_object: Object with cid: {cid}" - + f" already exists and is tagged with pid: {pid}" + + f" already exists and is tagged with pid: {pid}." 
) - logging.error(error_msg) - raise HashStoreRefsAlreadyExists(error_msg) + try: + self._verify_hashstore_references( + pid, + cid, + pid_refs_path, + cid_refs_path, + "Refs file already exists, verifying.", + ) + logging.error(err_msg) + raise HashStoreRefsAlreadyExists(err_msg) + except Exception as e: + rev_msg = err_msg + " " + str(e) + logging.error(rev_msg) + raise HashStoreRefsAlreadyExists(err_msg) + elif os.path.exists(pid_refs_path) and not os.path.exists(cid_refs_path): debug_msg = ( f"FileHashStore - tag_object: pid refs file exists ({pid_refs_path})" From aba0b74e49046762a6da01954779f75ac11566d9 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 12 Sep 2024 15:55:51 -0700 Subject: [PATCH 331/420] Refactor 'tag_object' scenario when pid refs exists to throw exception immediately, rename that exception from 'PidAlreadyExistsError' to 'PidRefsAlreadyExistsError' and update pytests --- src/hashstore/filehashstore.py | 57 ++++------------------- src/hashstore/filehashstore_exceptions.py | 2 +- tests/test_filehashstore_references.py | 23 +-------- 3 files changed, 11 insertions(+), 71 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 954f6add..ded2225c 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -23,7 +23,7 @@ HashStoreRefsAlreadyExists, NonMatchingChecksum, NonMatchingObjSize, - PidAlreadyExistsError, + PidRefsAlreadyExistsError, PidNotFoundInCidRefsFile, PidRefsContentError, PidRefsDoesNotExist, @@ -681,56 +681,15 @@ def tag_object(self, pid, cid): raise HashStoreRefsAlreadyExists(err_msg) elif os.path.exists(pid_refs_path) and not os.path.exists(cid_refs_path): - debug_msg = ( - f"FileHashStore - tag_object: pid refs file exists ({pid_refs_path})" - + f" for pid: {pid}, but cid refs file doesn't at: {cid_refs_path}" - + f" for cid: {cid}" + # If pid refs exists, the pid has already been claimed and cannot be tagged we + # throw an exception immediately + error_msg = ( + 
f"FileHashStore - tag_object: Pid refs file already exists for pid: {pid}." + + " A pid can only reference one cid. " ) - logging.debug(debug_msg) - # A pid reference file can only contain and reference one cid - # First, confirm that the expected cid refs file exists by getting the cid - with open(pid_refs_path, "r", encoding="utf8") as pid_ref_file: - pid_refs_cid = pid_ref_file.read() + logging.error(error_msg) + raise PidRefsAlreadyExistsError(error_msg) - if self._is_string_in_refs_file(cid, pid_refs_path): - # The pid correctly references the given cid, but the cid refs file is missing - cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") - shutil.move(cid_tmp_file_path, cid_refs_path) - self._verify_hashstore_references( - pid, - cid, - pid_refs_path, - cid_refs_path, - "Created missing cid refs file", - ) - info_msg = ( - f"FileHashStore - tag_object: pid refs file exists for pid: {pid}" - + f", with the expected cid: {cid} - but cid refs file is missing." - + " Cid refs file created, tagged and verified." - ) - logging.info(info_msg) - return True - else: - # Check if the retrieved cid refs file exists and pid is referenced - retrieved_cid_refs_path = self._get_hashstore_cid_refs_path( - pid_refs_cid - ) - if os.path.exists( - retrieved_cid_refs_path - ) and self._is_string_in_refs_file(pid, retrieved_cid_refs_path): - # Throw exception, this pid is accounted for - error_msg = ( - "FileHashStore - tag_object: Pid refs file exists with valid pid" - + f" and cid reference files for pid: {pid} with cid: {cid}." - ) - logging.error(error_msg) - raise PidAlreadyExistsError(error_msg) - else: - debug_msg = ( - f"FileHashStore - tag_object: Orphan pid refs file found for {pid}." - + f" Cid ({cid}) reference file does not contain the pid. Proceeding." 
- ) - logging.debug(debug_msg) elif not os.path.exists(pid_refs_path) and os.path.exists(cid_refs_path): debug_msg = ( f"FileHashStore - tag_object: pid refs file does not exist for pid {pid}" diff --git a/src/hashstore/filehashstore_exceptions.py b/src/hashstore/filehashstore_exceptions.py index 8da80fb2..65e52139 100644 --- a/src/hashstore/filehashstore_exceptions.py +++ b/src/hashstore/filehashstore_exceptions.py @@ -55,7 +55,7 @@ def __init__(self, message, errors=None): self.errors = errors -class PidAlreadyExistsError(Exception): +class PidRefsAlreadyExistsError(Exception): """Custom exception thrown when a client calls 'tag_object' and the pid that is being tagged is already accounted for (has a pid refs file and is found in the cid refs file).""" diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index fa117d70..0c9821d3 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -8,7 +8,7 @@ CidRefsContentError, CidRefsFileNotFound, HashStoreRefsAlreadyExists, - PidAlreadyExistsError, + PidRefsAlreadyExistsError, PidRefsContentError, PidRefsFileNotFound, ) @@ -97,25 +97,6 @@ def test_tag_object_pid_refs_found_cid_refs_found(pids, store): def test_tag_object_pid_refs_found_cid_refs_not_found(store): - """Test that tag_object creates a missing cid refs file when called to tag a cid - with a pid whose associated pid refs file contains the given cid.""" - test_dir = "tests/testdata/" - pid = "jtao.1700.1" - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - cid = object_metadata.cid - - # Manually delete the cid refs file, creating an orphaned pid - cid_ref_abs_path = store._get_hashstore_cid_refs_path(cid) - os.remove(cid_ref_abs_path) - assert store._count("cid") == 0 - - store.tag_object(pid, cid) - assert store._count("pid") == 1 - assert store._count("cid") == 1 - - -def 
test_tag_object_pid_refs_found_cid_refs_not_found_different_cid_retrieved(store): """Test that tag_object throws an exception when pid refs file exists, contains a different cid, and is correctly referenced in the associated cid refs file""" test_dir = "tests/testdata/" @@ -123,7 +104,7 @@ def test_tag_object_pid_refs_found_cid_refs_not_found_different_cid_retrieved(st path = test_dir + pid.replace("/", "_") _object_metadata = store.store_object(pid, path) - with pytest.raises(PidAlreadyExistsError): + with pytest.raises(PidRefsAlreadyExistsError): store.tag_object(pid, "another_cid_value_that_is_not_found") From ddc179303f671469db3f8cb5f3ceb12a72ddbaaa Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 12 Sep 2024 16:15:52 -0700 Subject: [PATCH 332/420] Remove return value from 'tag_object' in hashstore interface, refactor 'tag_object', update pytests and add TODO item --- src/hashstore/filehashstore.py | 3 +-- src/hashstore/hashstore.py | 2 -- tests/test_filehashstore_references.py | 7 ++++--- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index ded2225c..1fb47fd7 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -714,7 +714,7 @@ def tag_object(self, pid, cid): cid, pid, ) - return True + return # Move both files after checking the existing status of refs files pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") @@ -730,7 +730,6 @@ def tag_object(self, pid, cid): cid, pid, ) - return True finally: # Release cid end_sync_debug_msg = ( diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 13119e09..f09974ac 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -72,8 +72,6 @@ def tag_object(self, pid, cid): :param str pid: Authority-based or persistent identifier of the object. :param str cid: Content identifier of the object. - - :return: bool - `True` upon successful tagging. 
""" raise NotImplementedError() diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 0c9821d3..d1178ba6 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -17,13 +17,14 @@ def test_tag_object(pids, store): - """Test tag_object returns true boolean when successful.""" + """Test tag_object does not throw exception when successful.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) - object_tagged = store.tag_object(pid, object_metadata.cid) - assert object_tagged + store.tag_object(pid, object_metadata.cid) + assert store._count("pid") == 3 + assert store._count("cid") == 3 def test_tag_object_pid_refs_file_exists(pids, store): From 4bc31cf22408bb86cffa36903813191c135ba61f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 12 Sep 2024 16:37:26 -0700 Subject: [PATCH 333/420] Rename synchronization variables for accuracy --- src/hashstore/filehashstore.py | 207 +++++++++++++++++---------------- 1 file changed, 108 insertions(+), 99 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 1fb47fd7..ddfa957a 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -83,30 +83,38 @@ def __init__(self, properties=None): self.use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True" if self.use_multiprocessing == "True": # Create multiprocessing synchronization variables - self.object_lock_mp = multiprocessing.Lock() - self.object_condition_mp = multiprocessing.Condition(self.object_lock_mp) + # Synchronization values for object locked pids + self.object_pid_lock_mp = multiprocessing.Lock() + self.object_pid_condition_mp = multiprocessing.Condition( + self.object_pid_lock_mp + ) self.object_locked_pids_mp = multiprocessing.Manager().list() + # Synchronization values for object locked cids + 
self.object_cid_lock_mp = multiprocessing.Lock() + self.object_cid_condition_mp = multiprocessing.Condition( + self.object_cid_lock_mp + ) + self.object_locked_cids_mp = multiprocessing.Manager().list() + # Synchronization values for metadata locked documents self.metadata_lock_mp = multiprocessing.Lock() self.metadata_condition_mp = multiprocessing.Condition( self.metadata_lock_mp ) self.metadata_locked_docs_mp = multiprocessing.Manager().list() - self.reference_lock_mp = multiprocessing.Lock() - self.reference_condition_mp = multiprocessing.Condition( - self.reference_lock_mp - ) - self.reference_locked_cids_mp = multiprocessing.Manager().list() else: # Create threading synchronization variables - self.object_lock = threading.Lock() - self.object_condition = threading.Condition(self.object_lock) - self.object_locked_pids = [] - self.metadata_lock = threading.Lock() - self.metadata_condition = threading.Condition(self.metadata_lock) - self.metadata_locked_docs = [] - self.reference_lock = threading.Lock() - self.reference_condition = threading.Condition(self.reference_lock) - self.reference_locked_cids = [] + # Synchronization values for object locked pids + self.object_pid_lock_th = threading.Lock() + self.object_pid_condition_th = threading.Condition(self.object_pid_lock_th) + self.object_locked_pids_th = [] + # Synchronization values for object locked cids + self.object_cid_lock_th = threading.Lock() + self.object_cid_condition_th = threading.Condition(self.object_cid_lock_th) + self.object_locked_cids_th = [] + # Synchronization values for metadata locked documents + self.metadata_lock_th = threading.Lock() + self.metadata_condition_th = threading.Condition(self.metadata_lock_th) + self.metadata_locked_docs_th = [] # Now check properties if properties: # Validate properties against existing configuration if present @@ -516,7 +524,7 @@ def store_object( f"{pid}" + ". Already in progress." 
) if self.use_multiprocessing: - with self.object_condition_mp: + with self.object_pid_condition_mp: # Wait for the pid to release if it's in use if pid in self.object_locked_pids_mp: logging.error(err_msg) @@ -525,12 +533,12 @@ def store_object( logging.debug(sync_begin_debug_msg) self.object_locked_pids_mp.append(pid) else: - with self.object_condition: - if pid in self.object_locked_pids: + with self.object_pid_condition_th: + if pid in self.object_locked_pids_th: logging.error(err_msg) raise StoreObjectForPidAlreadyInProgress(err_msg) logging.debug(sync_begin_debug_msg) - self.object_locked_pids.append(pid) + self.object_locked_pids_th.append(pid) try: logging.debug( "FileHashStore - store_object: Attempting to store object for pid: %s", @@ -632,22 +640,23 @@ def tag_object(self, pid, cid): f"FileHashStore - tag_object: Adding cid ({pid}) to locked list." ) sync_wait_msg = f"FileHashStore - tag_object: Cid ({cid}) is locked. Waiting." + # TODO: The pid should also be locked to ensure thread safety if self.use_multiprocessing: - with self.reference_condition_mp: + with self.object_cid_condition_mp: # Wait for the cid to release if it's being tagged - while cid in self.reference_locked_cids_mp: + while cid in self.object_locked_cids_mp: logging.debug(sync_wait_msg) - self.reference_condition_mp.wait() + self.object_cid_condition_mp.wait() # Modify reference_locked_cids consecutively logging.debug(sync_begin_debug_msg) - self.reference_locked_cids_mp.append(cid) + self.object_locked_cids_mp.append(cid) else: - with self.reference_condition: - while cid in self.reference_locked_cids: + with self.object_cid_condition_th: + while cid in self.object_locked_cids_th: logging.debug(sync_wait_msg) - self.reference_condition.wait() + self.object_cid_condition_th.wait() logging.debug(sync_begin_debug_msg) - self.reference_locked_cids.append(cid) + self.object_locked_cids_th.append(cid) try: # Prepare files and paths tmp_root_path = self._get_store_path("refs") / "tmp" @@ 
-737,15 +746,15 @@ def tag_object(self, pid, cid): + " reference_locked_cids." ) if self.use_multiprocessing: - with self.reference_condition_mp: + with self.object_cid_condition_mp: logging.debug(end_sync_debug_msg) - self.reference_locked_cids_mp.remove(cid) - self.reference_condition_mp.notify() + self.object_locked_cids_mp.remove(cid) + self.object_cid_condition_mp.notify() else: - with self.reference_condition: + with self.object_cid_condition_th: logging.debug(end_sync_debug_msg) - self.reference_locked_cids.remove(cid) - self.reference_condition.notify() + self.object_locked_cids_th.remove(cid) + self.object_cid_condition_th.notify() def store_metadata(self, pid, metadata, format_id=None): logging.debug( @@ -775,12 +784,12 @@ def store_metadata(self, pid, metadata, format_id=None): logging.debug(sync_begin_debug_msg) self.metadata_locked_docs_mp.append(pid_doc) else: - with self.metadata_condition: - while pid_doc in self.metadata_locked_docs: + with self.metadata_condition_th: + while pid_doc in self.metadata_locked_docs_th: logging.debug(sync_wait_msg) - self.metadata_condition.wait() + self.metadata_condition_th.wait() logging.debug(sync_begin_debug_msg) - self.metadata_locked_docs.append(pid_doc) + self.metadata_locked_docs_th.append(pid_doc) try: metadata_cid = self._put_metadata(metadata, pid, pid_doc) @@ -802,10 +811,10 @@ def store_metadata(self, pid, metadata, format_id=None): self.metadata_locked_docs_mp.remove(pid_doc) self.metadata_condition_mp.notify() else: - with self.metadata_condition: + with self.metadata_condition_th: logging.debug(end_sync_debug_msg) - self.metadata_locked_docs.remove(pid_doc) - self.metadata_condition.notify() + self.metadata_locked_docs_th.remove(pid_doc) + self.metadata_condition_th.notify() def retrieve_object(self, pid): logging.debug( @@ -885,21 +894,21 @@ def delete_object(self, pid): f"FileHashStore - delete_object: Pid ({pid}) is locked. Waiting." 
) if self.use_multiprocessing: - with self.object_condition_mp: + with self.object_pid_condition_mp: # Wait for the pid to release if it's in use while pid in self.object_locked_pids_mp: logging.debug(sync_wait_msg) - self.object_condition_mp.wait() + self.object_pid_condition_mp.wait() # Modify object_locked_pids consecutively logging.debug(sync_begin_debug_msg) self.object_locked_pids_mp.append(pid) else: - with self.object_condition: - while pid in self.object_locked_pids: + with self.object_pid_condition_th: + while pid in self.object_locked_pids_th: logging.debug(sync_wait_msg) - self.object_condition.wait() + self.object_pid_condition_th.wait() logging.debug(sync_begin_debug_msg) - self.object_locked_pids.append(pid) + self.object_locked_pids_th.append(pid) try: # Before we begin deletion process, we look for the `cid` by calling @@ -920,21 +929,21 @@ def delete_object(self, pid): + " Waiting." ) if self.use_multiprocessing: - with self.reference_condition_mp: + with self.object_cid_condition_mp: # Wait for the cid to release if it's in use - while cid in self.reference_locked_cids_mp: + while cid in self.object_locked_cids_mp: logging.debug(sync_wait_msg) - self.reference_condition_mp.wait() + self.object_cid_condition_mp.wait() # Modify reference_locked_cids consecutively logging.debug(sync_begin_debug_msg) - self.reference_locked_cids_mp.append(cid) + self.object_locked_cids_mp.append(cid) else: - with self.reference_condition: - while cid in self.reference_locked_cids: + with self.object_cid_condition_th: + while cid in self.object_locked_cids_th: logging.debug(sync_wait_msg) - self.reference_condition.wait() + self.object_cid_condition_th.wait() logging.debug(sync_begin_debug_msg) - self.reference_locked_cids.append(cid) + self.object_locked_cids_th.append(cid) try: cid_ref_abs_path = object_info_dict.get("cid_refs_path") @@ -980,15 +989,15 @@ def delete_object(self, pid): + " from locked list" ) if self.use_multiprocessing: - with 
self.reference_condition_mp: + with self.object_cid_condition_mp: logging.debug(end_sync_debug_msg) - self.reference_locked_cids_mp.remove(cid) - self.reference_condition_mp.notify() + self.object_locked_cids_mp.remove(cid) + self.object_cid_condition_mp.notify() else: - with self.reference_condition: + with self.object_cid_condition_th: logging.debug(end_sync_debug_msg) - self.reference_locked_cids.remove(cid) - self.reference_condition.notify() + self.object_locked_cids_th.remove(cid) + self.object_cid_condition_th.notify() except PidRefsDoesNotExist: warn_msg = ( @@ -1056,16 +1065,16 @@ def delete_object(self, pid): + " from locked list" ) if self.use_multiprocessing: - with self.object_condition_mp: + with self.object_pid_condition_mp: logging.debug(end_sync_debug_msg) self.object_locked_pids_mp.remove(pid) - self.object_condition_mp.notify() + self.object_pid_condition_mp.notify() else: # Release pid - with self.object_condition: + with self.object_pid_condition_th: logging.debug(end_sync_debug_msg) - self.object_locked_pids.remove(pid) - self.object_condition.notify() + self.object_locked_pids_th.remove(pid) + self.object_pid_condition_th.notify() def delete_metadata(self, pid, format_id=None): logging.debug( @@ -1107,12 +1116,12 @@ def delete_metadata(self, pid, format_id=None): logging.debug(sync_begin_debug_msg) self.metadata_locked_docs_mp.append(pid_doc) else: - with self.metadata_condition: - while pid in self.metadata_locked_docs: + with self.metadata_condition_th: + while pid in self.metadata_locked_docs_th: logging.debug(sync_wait_msg) - self.metadata_condition.wait() + self.metadata_condition_th.wait() logging.debug(sync_begin_debug_msg) - self.metadata_locked_docs.append(pid_doc) + self.metadata_locked_docs_th.append(pid_doc) try: # Mark metadata doc for deletion objects_to_delete.append(self._rename_path_for_deletion(path)) @@ -1129,10 +1138,10 @@ def delete_metadata(self, pid, format_id=None): self.metadata_locked_docs_mp.remove(pid_doc) 
self.metadata_condition_mp.notify() else: - with self.metadata_condition: + with self.metadata_condition_th: logging.debug(end_sync_debug_msg) - self.metadata_locked_docs.remove(pid_doc) - self.metadata_condition.notify() + self.metadata_locked_docs_th.remove(pid_doc) + self.metadata_condition_th.notify() # Delete metadata objects for obj in objects_to_delete: @@ -1164,12 +1173,12 @@ def delete_metadata(self, pid, format_id=None): logging.debug(sync_begin_debug_msg) self.metadata_locked_docs_mp.append(pid_doc) else: - with self.metadata_condition: - while pid in self.metadata_locked_docs: + with self.metadata_condition_th: + while pid in self.metadata_locked_docs_th: logging.debug(sync_wait_msg) - self.metadata_condition.wait() + self.metadata_condition_th.wait() logging.debug(sync_begin_debug_msg) - self.metadata_locked_docs.append(pid_doc) + self.metadata_locked_docs_th.append(pid_doc) try: full_path_without_directory = ( self.metadata + "/" + rel_path + "/" + pid_doc @@ -1193,10 +1202,10 @@ def delete_metadata(self, pid, format_id=None): self.metadata_locked_docs_mp.remove(pid_doc) self.metadata_condition_mp.notify() else: - with self.metadata_condition: + with self.metadata_condition_th: logging.debug(end_sync_debug_msg) - self.metadata_locked_docs.remove(pid_doc) - self.metadata_condition.notify() + self.metadata_locked_docs_th.remove(pid_doc) + self.metadata_condition_th.notify() def get_hex_digest(self, pid, algorithm): logging.debug( @@ -2064,21 +2073,21 @@ def _delete_object_only(self, cid): f"FileHashStore - delete_object: Cid ({cid}) is locked. Waiting." 
) if self.use_multiprocessing: - with self.reference_condition_mp: + with self.object_cid_condition_mp: # Wait for the cid to release if it's in use - while cid in self.reference_locked_cids_mp: + while cid in self.object_locked_cids_mp: logging.debug(sync_wait_msg) - self.reference_condition_mp.wait() + self.object_cid_condition_mp.wait() # Modify reference_locked_cids consecutively logging.debug(sync_begin_debug_msg) - self.reference_locked_cids_mp.append(cid) + self.object_locked_cids_mp.append(cid) else: - with self.reference_condition: - while cid in self.reference_locked_cids: + with self.object_cid_condition_th: + while cid in self.object_locked_cids_th: logging.debug(sync_wait_msg) - self.reference_condition.wait() + self.object_cid_condition_th.wait() logging.debug(sync_begin_debug_msg) - self.reference_locked_cids.append(cid) + self.object_locked_cids_th.append(cid) try: self._delete("objects", cid) @@ -2089,15 +2098,15 @@ def _delete_object_only(self, cid): + " from locked list" ) if self.use_multiprocessing: - with self.reference_condition_mp: + with self.object_cid_condition_mp: logging.debug(end_sync_debug_msg) - self.reference_locked_cids_mp.remove(cid) - self.reference_condition_mp.notify() + self.object_locked_cids_mp.remove(cid) + self.object_cid_condition_mp.notify() else: - with self.reference_condition: + with self.object_cid_condition_th: logging.debug(end_sync_debug_msg) - self.reference_locked_cids.remove(cid) - self.reference_condition.notify() + self.object_locked_cids_th.remove(cid) + self.object_cid_condition_th.notify() @staticmethod def _check_arg_data(data): @@ -2563,14 +2572,14 @@ def release_object_locked_pids(self, pid): :param str pid: Persistent or authority-based identifier """ if self.use_multiprocessing: - with self.object_condition_mp: + with self.object_pid_condition_mp: self.object_locked_pids_mp.remove(pid) - self.object_condition_mp.notify() + self.object_pid_condition_mp.notify() else: # Release pid - with 
self.object_condition: - self.object_locked_pids.remove(pid) - self.object_condition.notify() + with self.object_pid_condition_th: + self.object_locked_pids_th.remove(pid) + self.object_pid_condition_th.notify() @staticmethod def _get_file_paths(directory): From 69ed1d2083a64859954b889b3ea63ac961f9dbeb Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 12 Sep 2024 16:47:44 -0700 Subject: [PATCH 334/420] Refactor 'tag_object' by extracting method '_synchronize_referenced_locked_cids' and rename existing sync method with '_' to be private --- src/hashstore/filehashstore.py | 53 +++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index ddfa957a..ddbbf80e 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -636,27 +636,9 @@ def tag_object(self, pid, cid): self._check_string(pid, "pid") self._check_string(cid, "cid") - sync_begin_debug_msg = ( - f"FileHashStore - tag_object: Adding cid ({pid}) to locked list." - ) - sync_wait_msg = f"FileHashStore - tag_object: Cid ({cid}) is locked. Waiting." 
# TODO: The pid should also be locked to ensure thread safety - if self.use_multiprocessing: - with self.object_cid_condition_mp: - # Wait for the cid to release if it's being tagged - while cid in self.object_locked_cids_mp: - logging.debug(sync_wait_msg) - self.object_cid_condition_mp.wait() - # Modify reference_locked_cids consecutively - logging.debug(sync_begin_debug_msg) - self.object_locked_cids_mp.append(cid) - else: - with self.object_cid_condition_th: - while cid in self.object_locked_cids_th: - logging.debug(sync_wait_msg) - self.object_cid_condition_th.wait() - logging.debug(sync_begin_debug_msg) - self.object_locked_cids_th.append(cid) + self.synchronize_referenced_locked_cids(cid) + try: # Prepare files and paths tmp_root_path = self._get_store_path("refs") / "tmp" @@ -2581,6 +2563,37 @@ def release_object_locked_pids(self, pid): self.object_locked_pids_th.remove(pid) self.object_pid_condition_th.notify() + def synchronize_referenced_locked_cids(self, cid): + """Multiple threads may access a data object via its 'cid' or the respective 'cid + reference file' (which contains a list of 'pid's that reference a 'cid') and this needs + to be coordinated.""" + if self.use_multiprocessing: + with self.object_cid_condition_mp: + # Wait for the cid to release if it's being tagged + while cid in self.object_locked_cids_mp: + logging.debug( + f"synchronize_referenced_locked_cids: Cid ({cid}) is locked. Waiting." + ) + self.object_cid_condition_mp.wait() + # Modify reference_locked_cids consecutively + self.object_locked_cids_mp.append(cid) + logging.debug( + f"synchronize_referenced_locked_cids: Synchronizing object_locked_cids_mp for" + + f" cid: {cid}" + ) + else: + with self.object_cid_condition_th: + while cid in self.object_locked_cids_th: + logging.debug( + f"synchronize_referenced_locked_cids: Cid ({cid}) is locked. Waiting." 
+ ) + self.object_cid_condition_th.wait() + self.object_locked_cids_th.append(cid) + logging.debug( + f"synchronize_referenced_locked_cids: Synchronizing object_locked_cids_th for" + + f" cid: {cid}" + ) + @staticmethod def _get_file_paths(directory): """Get the file paths of a given directory if it exists From 414657e6a428fca7c7a803cf28233e6fa30090d8 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 13 Sep 2024 09:04:22 -0700 Subject: [PATCH 335/420] Add new synchronization variables for pids to ensure thread safety for 'tag_object' --- src/hashstore/filehashstore.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index ddbbf80e..46e83a10 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -101,6 +101,12 @@ def __init__(self, properties=None): self.metadata_lock_mp ) self.metadata_locked_docs_mp = multiprocessing.Manager().list() + # Synchronization values for reference locked pids + self.reference_pid_lock_mp = multiprocessing.Lock() + self.reference_pid_condition_mp = multiprocessing.Condition( + self.reference_pid_lock_mp + ) + self.reference_locked_docs_mp = multiprocessing.Manager().list() else: # Create threading synchronization variables # Synchronization values for object locked pids @@ -115,6 +121,10 @@ def __init__(self, properties=None): self.metadata_lock_th = threading.Lock() self.metadata_condition_th = threading.Condition(self.metadata_lock_th) self.metadata_locked_docs_th = [] + # Synchronization values for reference locked pids + self.reference_pid_lock_th = threading.Lock() + self.reference_pid_condition_th = threading.Condition(self.metadata_lock_th) + self.reference_locked_docs_th = [] # Now check properties if properties: # Validate properties against existing configuration if present @@ -576,7 +586,7 @@ def store_object( f"FileHashStore - store_object: Releasing pid ({pid})" + " from locked list" ) - 
self.release_object_locked_pids(pid) + self._release_object_locked_pids(pid) logging.debug(end_sync_debug_msg) return object_metadata @@ -637,7 +647,7 @@ def tag_object(self, pid, cid): self._check_string(cid, "cid") # TODO: The pid should also be locked to ensure thread safety - self.synchronize_referenced_locked_cids(cid) + self._synchronize_referenced_locked_cids(cid) try: # Prepare files and paths @@ -2547,7 +2557,7 @@ def _get_store_path(self, entity): # Synchronization Methods - def release_object_locked_pids(self, pid): + def _release_object_locked_pids(self, pid): """Remove the given persistent identifier from 'object_locked_pids' and notify other waiting threads or processes. @@ -2563,7 +2573,7 @@ def release_object_locked_pids(self, pid): self.object_locked_pids_th.remove(pid) self.object_pid_condition_th.notify() - def synchronize_referenced_locked_cids(self, cid): + def _synchronize_referenced_locked_cids(self, cid): """Multiple threads may access a data object via its 'cid' or the respective 'cid reference file' (which contains a list of 'pid's that reference a 'cid') and this needs to be coordinated.""" From 66a003574c0279683d498071c334994231d87ad3 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 13 Sep 2024 09:44:28 -0700 Subject: [PATCH 336/420] Rename inaccurate sync variables, add missing docstring params and add new sync methods '_synchronize_referenced_locked_pids' & '_release_reference_locked_pids' --- src/hashstore/filehashstore.py | 60 +++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 4 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 46e83a10..1a643c09 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -106,7 +106,7 @@ def __init__(self, properties=None): self.reference_pid_condition_mp = multiprocessing.Condition( self.reference_pid_lock_mp ) - self.reference_locked_docs_mp = multiprocessing.Manager().list() + self.reference_locked_pids_mp = 
multiprocessing.Manager().list() else: # Create threading synchronization variables # Synchronization values for object locked pids @@ -124,7 +124,7 @@ def __init__(self, properties=None): # Synchronization values for reference locked pids self.reference_pid_lock_th = threading.Lock() self.reference_pid_condition_th = threading.Condition(self.metadata_lock_th) - self.reference_locked_docs_th = [] + self.reference_locked_pids_th = [] # Now check properties if properties: # Validate properties against existing configuration if present @@ -2575,8 +2575,11 @@ def _release_object_locked_pids(self, pid): def _synchronize_referenced_locked_cids(self, cid): """Multiple threads may access a data object via its 'cid' or the respective 'cid - reference file' (which contains a list of 'pid's that reference a 'cid') and this needs - to be coordinated.""" + reference file' (which contains a list of 'pid's that reference a 'cid') and this needs + to be coordinated. + + :param str cid: Content identifier + """ if self.use_multiprocessing: with self.object_cid_condition_mp: # Wait for the cid to release if it's being tagged @@ -2604,6 +2607,55 @@ def _synchronize_referenced_locked_cids(self, cid): + f" cid: {cid}" ) + def _synchronize_referenced_locked_pids(self, pid): + """Multiple threads may interact with a pid (to tag, untag, delete) and these actions + must be coordinated to prevent unexpected behaviour/race conditions that cause chaos. + + :param str pid: Persistent or authority-based identifier + """ + if self.use_multiprocessing: + with self.reference_pid_condition_mp: + # Wait for the pid to release if it's in use + while pid in self.reference_locked_pids_mp: + logging.debug( + f"_synchronize_referenced_locked_pids: Pid ({pid}) is locked. Waiting." 
+ ) + self.reference_pid_condition_mp.wait() + # Modify reference_locked_pids consecutively + self.reference_locked_pids_mp.append(pid) + logging.debug( + f"_synchronize_referenced_locked_pids: Synchronizing reference_locked_pids_mp" + + f" for pid: {pid}" + ) + else: + with self.reference_pid_condition_th: + while pid in self.reference_locked_pids_th: + logging.debug( + f"_synchronize_referenced_locked_pids: Pid ({pid}) is locked. Waiting." + ) + self.reference_pid_condition_th.wait() + self.reference_locked_pids_th.append(pid) + logging.debug( + f"_synchronize_referenced_locked_pids: Synchronizing reference_locked_pids_th" + + f" for pid: {pid}" + ) + + def _release_reference_locked_pids(self, pid): + """Remove the given persistent identifier from 'reference_locked_pids_' and notify other + waiting threads or processes. + + :param str pid: Persistent or authority-based identifier + """ + if self.use_multiprocessing: + with self.reference_pid_condition_mp: + self.reference_locked_pids_mp.remove(pid) + self.reference_pid_condition_mp.notify() + else: + # Release pid + with self.reference_pid_condition_th: + self.reference_locked_pids_th.remove(pid) + self.reference_pid_condition_th.notify() + @staticmethod def _get_file_paths(directory): """Get the file paths of a given directory if it exists From 723960062cd00e90623206dc37908ecb8eaa6c1b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 13 Sep 2024 09:49:00 -0700 Subject: [PATCH 337/420] Add pid synchronization to 'tag_object' and rename '_synchronize_reference_locked_cids' to '_synchronize_object_locked_cids' for accuracy --- src/hashstore/filehashstore.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 1a643c09..ad572cf2 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -646,8 +646,8 @@ def tag_object(self, pid, cid): self._check_string(pid, "pid") self._check_string(cid, "cid") - # 
TODO: The pid should also be locked to ensure thread safety - self._synchronize_referenced_locked_cids(cid) + self._synchronize_referenced_locked_pids(pid) + self._synchronize_object_locked_cids(cid) try: # Prepare files and paths @@ -737,6 +737,7 @@ def tag_object(self, pid, cid): f"FileHashStore - tag_object: Releasing cid ({cid}) from" + " reference_locked_cids." ) + self._release_reference_locked_pids(pid) if self.use_multiprocessing: with self.object_cid_condition_mp: logging.debug(end_sync_debug_msg) @@ -2573,7 +2574,7 @@ def _release_object_locked_pids(self, pid): self.object_locked_pids_th.remove(pid) self.object_pid_condition_th.notify() - def _synchronize_referenced_locked_cids(self, cid): + def _synchronize_object_locked_cids(self, cid): """Multiple threads may access a data object via its 'cid' or the respective 'cid reference file' (which contains a list of 'pid's that reference a 'cid') and this needs to be coordinated. From 50838cc8082adafae12ba2a086f5866d2b5feb6c Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 13 Sep 2024 09:56:39 -0700 Subject: [PATCH 338/420] Refactor 'tag_object' by extracting new method '_release_object_locked_cids' --- src/hashstore/filehashstore.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index ad572cf2..8ec8aa5e 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -737,17 +737,9 @@ def tag_object(self, pid, cid): f"FileHashStore - tag_object: Releasing cid ({cid}) from" + " reference_locked_cids." 
) + logging.debug(end_sync_debug_msg) + self._release_object_locked_cids(cid) self._release_reference_locked_pids(pid) - if self.use_multiprocessing: - with self.object_cid_condition_mp: - logging.debug(end_sync_debug_msg) - self.object_locked_cids_mp.remove(cid) - self.object_cid_condition_mp.notify() - else: - with self.object_cid_condition_th: - logging.debug(end_sync_debug_msg) - self.object_locked_cids_th.remove(cid) - self.object_cid_condition_th.notify() def store_metadata(self, pid, metadata, format_id=None): logging.debug( @@ -2608,6 +2600,21 @@ def _synchronize_object_locked_cids(self, cid): + f" cid: {cid}" ) + def _release_object_locked_cids(self, cid): + """Remove the given content identifier from 'object_locked_cids' and notify other + waiting threads or processes. + + :param str cid: Content identifier + """ + if self.use_multiprocessing: + with self.object_cid_condition_mp: + self.object_locked_cids_mp.remove(cid) + self.object_cid_condition_mp.notify() + else: + with self.object_cid_condition_th: + self.object_locked_cids_th.remove(cid) + self.object_cid_condition_th.notify() + def _synchronize_referenced_locked_pids(self, pid): """Multiple threads may interact with a pid (to tag, untag, delete) and these actions must be coordinated to prevent unexpected behaviour/race conditions that cause chaos. @@ -2642,7 +2649,7 @@ def _synchronize_referenced_locked_pids(self, pid): ) def _release_reference_locked_pids(self, pid): - """Remove the given persistent identifier from 'reference_locked_pids_' and notify other + """Remove the given persistent identifier from 'reference_locked_pids' and notify other waiting threads or processes. 
:param str pid: Persistent or authority-based identifier From 5d17f34849a7a9f4ac3da0bace9b69256023ef80 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 13 Sep 2024 17:26:41 -0700 Subject: [PATCH 339/420] Add debug logging to release sync methods and refactor affected methods accordingly --- src/hashstore/filehashstore.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 8ec8aa5e..8b87ecd4 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -582,12 +582,7 @@ def store_object( raise err finally: # Release pid - end_sync_debug_msg = ( - f"FileHashStore - store_object: Releasing pid ({pid})" - + " from locked list" - ) self._release_object_locked_pids(pid) - logging.debug(end_sync_debug_msg) return object_metadata @@ -733,11 +728,6 @@ def tag_object(self, pid, cid): ) finally: # Release cid - end_sync_debug_msg = ( - f"FileHashStore - tag_object: Releasing cid ({cid}) from" - + " reference_locked_cids." - ) - logging.debug(end_sync_debug_msg) self._release_object_locked_cids(cid) self._release_reference_locked_pids(pid) @@ -2610,10 +2600,20 @@ def _release_object_locked_cids(self, cid): with self.object_cid_condition_mp: self.object_locked_cids_mp.remove(cid) self.object_cid_condition_mp.notify() + end_sync_debug_msg = ( + f"FileHashStore - _release_object_locked_cids: Releasing cid ({cid}) from" + + " object_cid_condition_mp." + ) + logging.debug(end_sync_debug_msg) else: with self.object_cid_condition_th: self.object_locked_cids_th.remove(cid) self.object_cid_condition_th.notify() + end_sync_debug_msg = ( + f"FileHashStore - _release_object_locked_cids: Releasing cid ({cid}) from" + + " object_cid_condition_th." 
+ ) + logging.debug(end_sync_debug_msg) def _synchronize_referenced_locked_pids(self, pid): """Multiple threads may interact with a pid (to tag, untag, delete) and these actions @@ -2658,11 +2658,21 @@ def _release_reference_locked_pids(self, pid): with self.reference_pid_condition_mp: self.reference_locked_pids_mp.remove(pid) self.reference_pid_condition_mp.notify() + end_sync_debug_msg = ( + f"FileHashStore - _release_reference_locked_pids: Releasing pid ({pid}) from" + + " reference_locked_pids_mp." + ) + logging.debug(end_sync_debug_msg) else: # Release pid with self.reference_pid_condition_th: self.reference_locked_pids_th.remove(pid) self.reference_pid_condition_th.notify() + end_sync_debug_msg = ( + f"FileHashStore - _release_reference_locked_pids: Releasing pid ({pid}) from" + + " reference_locked_pids_th." + ) + logging.debug(end_sync_debug_msg) @staticmethod def _get_file_paths(directory): From c4124da61ef0fd1edb0597b7d5cc0ebcb1cbc479 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 13 Sep 2024 17:32:14 -0700 Subject: [PATCH 340/420] Rename 'delete_invalid_object' to 'delete_if_invalid_object' and update affected code --- README.md | 4 ++-- src/hashstore/filehashstore.py | 2 +- src/hashstore/hashstore.py | 2 +- tests/test_filehashstore_interface.py | 16 ++++++++-------- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 241990d1..d4feaf10 100644 --- a/README.md +++ b/README.md @@ -164,7 +164,7 @@ store and tag an object simultaneously if the relevant data is available. In the identifier (ex. persistent identifier (pid)), `store_object` can be called to solely store an object. The client is then expected to call `verify_object` when the relevant metadata is available to confirm that the object has been stored as expected. 
The client is then expected to call -`delete_invalid_object` when the relevant metadata is available to confirm that the object is +`delete_if_invalid_object` when the relevant metadata is available to confirm that the object is what is expected. And to finalize the process (to make the object discoverable), the client calls `tagObject``. In summary, there are two expected paths to store an object: @@ -204,7 +204,7 @@ obj_info_allinone = hashstore.store_object(input_stream, pid, additional_algo, c # Store object obj_info_manual = hashstore.store_object(input_stream) # Validate object with expected values when available -hashstore.delete_invalid_object(obj_info_manual, checksum, checksum_algo, obj_size) +hashstore.delete_if_invalid_object(obj_info_manual, checksum, checksum_algo, obj_size) # Tag object, makes the object discoverable (find, retrieve, delete) hashstore.tag_object(pid, obj_info_manual.cid) ``` diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 8b87ecd4..bfc921e1 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -586,7 +586,7 @@ def store_object( return object_metadata - def delete_invalid_object( + def delete_if_invalid_object( self, object_metadata, checksum, checksum_algorithm, expected_file_size ): self._check_string(checksum, "checksum") diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index f09974ac..20a93fd8 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -134,7 +134,7 @@ def delete_object(self, pid): raise NotImplementedError() @abstractmethod - def delete_invalid_object( + def delete_if_invalid_object( self, object_metadata, checksum, checksum_algorithm, expected_file_size ): """Confirm equality of content in an ObjectMetadata. 
The `delete_invalid_object` method diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 8bf35ab8..a2dba6dc 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -1112,7 +1112,7 @@ def test_delete_invalid_object(pids, store): checksum = object_metadata.hex_digests.get(store.algorithm) checksum_algorithm = store.algorithm expected_file_size = object_metadata.obj_size - store.delete_invalid_object( + store.delete_if_invalid_object( object_metadata, checksum, checksum_algorithm, expected_file_size ) assert store._exists("objects", object_metadata.cid) @@ -1127,7 +1127,7 @@ def test_delete_invalid_object_supported_other_algo_not_in_default(pids, store): object_metadata = store.store_object(data=path) checksum = pids[pid][supported_algo] expected_file_size = object_metadata.obj_size - store.delete_invalid_object( + store.delete_if_invalid_object( object_metadata, checksum, supported_algo, expected_file_size ) assert store._exists("objects", object_metadata.cid) @@ -1144,7 +1144,7 @@ def test_delete_invalid_object_exception_incorrect_object_metadata_type(pids, st checksum_algorithm = store.algorithm expected_file_size = object_metadata.obj_size with pytest.raises(ValueError): - store.delete_invalid_object( + store.delete_if_invalid_object( "not_object_metadata", checksum, checksum_algorithm, expected_file_size ) @@ -1160,7 +1160,7 @@ def test_delete_invalid_object_exception_incorrect_size(pids, store): checksum_algorithm = store.algorithm with pytest.raises(NonMatchingObjSize): - store.delete_invalid_object( + store.delete_if_invalid_object( object_metadata, checksum, checksum_algorithm, 1000 ) @@ -1182,7 +1182,7 @@ def test_delete_invalid_object_exception_incorrect_size_object_exists(pids, stor checksum_algorithm = store.algorithm with pytest.raises(NonMatchingObjSize): - store.delete_invalid_object( + store.delete_if_invalid_object( object_metadata, checksum, checksum_algorithm, 1000 
) @@ -1200,7 +1200,7 @@ def test_delete_invalid_object_exception_incorrect_checksum(pids, store): expected_file_size = object_metadata.obj_size with pytest.raises(NonMatchingChecksum): - store.delete_invalid_object( + store.delete_if_invalid_object( object_metadata, "abc123", checksum_algorithm, expected_file_size ) @@ -1216,7 +1216,7 @@ def test_delete_invalid_object_exception_incorrect_checksum_algo(pids, store): checksum = object_metadata.hex_digests.get(store.algorithm) expected_file_size = object_metadata.obj_size with pytest.raises(UnsupportedAlgorithm): - store.delete_invalid_object( + store.delete_if_invalid_object( object_metadata, checksum, "md2", expected_file_size ) @@ -1233,7 +1233,7 @@ def test_delete_invalid_object_exception_supported_other_algo_bad_checksum(pids, checksum = object_metadata.hex_digests.get(store.algorithm) expected_file_size = object_metadata.obj_size with pytest.raises(NonMatchingChecksum): - store.delete_invalid_object( + store.delete_if_invalid_object( object_metadata, checksum, "sha224", expected_file_size ) From e37aa17be632b9c27721de23ab334296035b31ce Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 13 Sep 2024 17:33:52 -0700 Subject: [PATCH 341/420] Refactor 'tag_object' by moving sync methods into try block --- src/hashstore/filehashstore.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index bfc921e1..90d8f79f 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -641,10 +641,10 @@ def tag_object(self, pid, cid): self._check_string(pid, "pid") self._check_string(cid, "cid") - self._synchronize_referenced_locked_pids(pid) - self._synchronize_object_locked_cids(cid) - try: + self._synchronize_referenced_locked_pids(pid) + self._synchronize_object_locked_cids(cid) + # Prepare files and paths tmp_root_path = self._get_store_path("refs") / "tmp" pid_refs_path = self._get_hashstore_pid_refs_path(pid) From 
a7691306e6d7fc176152ae46c45c4fe0bc447c5a Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 13 Sep 2024 17:46:37 -0700 Subject: [PATCH 342/420] Refactor 'tag_object' by extracing method 'store_hashstore_refs_files' and catching specific exceptions to improve clarity for expected possible exceptions --- src/hashstore/filehashstore.py | 194 ++++++++++++++++++--------------- 1 file changed, 108 insertions(+), 86 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 90d8f79f..5a75d881 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -642,94 +642,19 @@ def tag_object(self, pid, cid): self._check_string(cid, "cid") try: - self._synchronize_referenced_locked_pids(pid) - self._synchronize_object_locked_cids(cid) - - # Prepare files and paths - tmp_root_path = self._get_store_path("refs") / "tmp" - pid_refs_path = self._get_hashstore_pid_refs_path(pid) - cid_refs_path = self._get_hashstore_cid_refs_path(cid) - # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' - self._create_path(Path(os.path.dirname(pid_refs_path))) - self._create_path(Path(os.path.dirname(cid_refs_path))) - - if os.path.exists(pid_refs_path) and os.path.exists(cid_refs_path): - # If both reference files exist, we confirm that reference files are where they - # are expected to be and throw an exception to inform the client that everything - # is in place - and include other issues for context - err_msg = ( - f"FileHashStore - tag_object: Object with cid: {cid}" - + f" already exists and is tagged with pid: {pid}." 
- ) - try: - self._verify_hashstore_references( - pid, - cid, - pid_refs_path, - cid_refs_path, - "Refs file already exists, verifying.", - ) - logging.error(err_msg) - raise HashStoreRefsAlreadyExists(err_msg) - except Exception as e: - rev_msg = err_msg + " " + str(e) - logging.error(rev_msg) - raise HashStoreRefsAlreadyExists(err_msg) - - elif os.path.exists(pid_refs_path) and not os.path.exists(cid_refs_path): - # If pid refs exists, the pid has already been claimed and cannot be tagged we - # throw an exception immediately - error_msg = ( - f"FileHashStore - tag_object: Pid refs file already exists for pid: {pid}." - + " A pid can only reference one cid. " - ) - logging.error(error_msg) - raise PidRefsAlreadyExistsError(error_msg) - - elif not os.path.exists(pid_refs_path) and os.path.exists(cid_refs_path): - debug_msg = ( - f"FileHashStore - tag_object: pid refs file does not exist for pid {pid}" - + f" but cid refs file found at: {cid_refs_path} for cid: {cid}" - ) - logging.debug(debug_msg) - # Move the pid refs file - pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") - shutil.move(pid_tmp_file_path, pid_refs_path) - # Update cid ref files as it already exists - if not self._is_string_in_refs_file(pid, cid_refs_path): - self._update_refs_file(cid_refs_path, pid, "add") - self._verify_hashstore_references( - pid, - cid, - pid_refs_path, - cid_refs_path, - f"Updated existing cid refs file: {cid_refs_path} with pid: {pid}", - ) - logging.info( - "FileHashStore - tag_object: Successfully updated cid: %s with pid: %s", - cid, - pid, - ) - return - - # Move both files after checking the existing status of refs files - pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") - cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") - shutil.move(pid_tmp_file_path, pid_refs_path) - shutil.move(cid_tmp_file_path, cid_refs_path) - log_msg = "Reference files have been moved to their permanent location. Verifying refs." 
- self._verify_hashstore_references( - pid, cid, pid_refs_path, cid_refs_path, log_msg + self.store_hashstore_refs_files(cid, pid) + except HashStoreRefsAlreadyExists as hrae: + err_msg = ( + f"FileHashStore - tag_object: reference files for pid: {pid} and {cid} " + "already exist. " + str(hrae) ) - logging.info( - "FileHashStore - tag_object: Successfully tagged cid: %s with pid %s", - cid, - pid, + raise HashStoreRefsAlreadyExists(err_msg) + except PidRefsAlreadyExistsError as praee: + err_msg = ( + f"FileHashStore - tag_object: A pid can only reference one cid. " + + str(praee) ) - finally: - # Release cid - self._release_object_locked_cids(cid) - self._release_reference_locked_pids(pid) + raise PidRefsAlreadyExistsError(err_msg) def store_metadata(self, pid, metadata, format_id=None): logging.debug( @@ -1681,6 +1606,103 @@ def delete_tmp_file(): os.umask(oldmask) return tmp + def store_hashstore_refs_files(self, cid, pid): + """Create the pid refs file and create/update cid refs files in HashStore to establish + the relationship between a 'pid' and a 'cid'. + + :param str cid: Content identifier + :param str pid: Persistent or authority-based identifier. 
+ """ + try: + self._synchronize_referenced_locked_pids(pid) + self._synchronize_object_locked_cids(cid) + + # Prepare files and paths + tmp_root_path = self._get_store_path("refs") / "tmp" + pid_refs_path = self._get_hashstore_pid_refs_path(pid) + cid_refs_path = self._get_hashstore_cid_refs_path(cid) + # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' + self._create_path(Path(os.path.dirname(pid_refs_path))) + self._create_path(Path(os.path.dirname(cid_refs_path))) + + if os.path.exists(pid_refs_path) and os.path.exists(cid_refs_path): + # If both reference files exist, we confirm that reference files are where they + # are expected to be and throw an exception to inform the client that everything + # is in place - and include other issues for context + err_msg = ( + f"FileHashStore - store_hashstore_refs_files: Object with cid: {cid}" + f" already exists and is tagged with pid: {pid}." + ) + try: + self._verify_hashstore_references( + pid, + cid, + pid_refs_path, + cid_refs_path, + "Refs file already exists, verifying.", + ) + logging.error(err_msg) + raise HashStoreRefsAlreadyExists(err_msg) + except Exception as e: + rev_msg = err_msg + " " + str(e) + logging.error(rev_msg) + raise HashStoreRefsAlreadyExists(err_msg) + + elif os.path.exists(pid_refs_path) and not os.path.exists(cid_refs_path): + # If pid refs exists, the pid has already been claimed and cannot be tagged we + # throw an exception immediately + error_msg = ( + f"FileHashStore - store_hashstore_refs_files: Pid refs file already exists" + f" for pid: {pid}." 
+ ) + logging.error(error_msg) + raise PidRefsAlreadyExistsError(error_msg) + + elif not os.path.exists(pid_refs_path) and os.path.exists(cid_refs_path): + debug_msg = ( + f"FileHashStore - store_hashstore_refs_files: pid refs file does not exist" + f" for pid {pid} but cid refs file found at: {cid_refs_path} for cid: {cid}" + ) + logging.debug(debug_msg) + # Move the pid refs file + pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") + shutil.move(pid_tmp_file_path, pid_refs_path) + # Update cid ref files as it already exists + if not self._is_string_in_refs_file(pid, cid_refs_path): + self._update_refs_file(cid_refs_path, pid, "add") + self._verify_hashstore_references( + pid, + cid, + pid_refs_path, + cid_refs_path, + f"Updated existing cid refs file: {cid_refs_path} with pid: {pid}", + ) + info_msg = ( + "FileHashStore - store_hashstore_refs_files: Successfully updated " + f"cid: {cid} with pid: {pid}" + ) + logging.info(info_msg) + return + + # Move both files after checking the existing status of refs files + pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") + cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") + shutil.move(pid_tmp_file_path, pid_refs_path) + shutil.move(cid_tmp_file_path, cid_refs_path) + log_msg = "Reference files have been moved to their permanent location. Verifying refs." + self._verify_hashstore_references( + pid, cid, pid_refs_path, cid_refs_path, log_msg + ) + info_msg = ( + "FileHashStore - store_hashstore_refs_files: Successfully updated " + f"cid: {cid} with pid: {pid}" + ) + logging.info(info_msg) + finally: + # Release cid + self._release_object_locked_cids(cid) + self._release_reference_locked_pids(pid) + def _write_refs_file(self, path, ref_id, ref_type): """Write a reference file in the supplied path into a temporary file. 
All `pid` or `cid` reference files begin with a single identifier, with the From 333303d621885a2e94867c7dc278953a6d6d2c30 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 13 Sep 2024 17:51:34 -0700 Subject: [PATCH 343/420] Refactor 'store_hashstore_refs_files' to catch and re-raise specific exceptions, and to catch all other exceptions to begin untagging process (todo item) --- src/hashstore/filehashstore.py | 143 ++++++++++++++++++--------------- 1 file changed, 78 insertions(+), 65 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 5a75d881..6743f4f6 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1275,7 +1275,7 @@ def _find_object(self, pid): raise PidRefsDoesNotExist(err_msg) def _store_data_only(self, data): - """Store an object to HashStore and return the a metadata object containing the content + """Store an object to HashStore and return a metadata object containing the content identifier, object file size and hex digests dictionary of the default algorithms. This method does not validate the object and writes directly to `/objects` after the hex digests are calculated. 
@@ -1617,87 +1617,100 @@ def store_hashstore_refs_files(self, cid, pid): self._synchronize_referenced_locked_pids(pid) self._synchronize_object_locked_cids(cid) - # Prepare files and paths - tmp_root_path = self._get_store_path("refs") / "tmp" - pid_refs_path = self._get_hashstore_pid_refs_path(pid) - cid_refs_path = self._get_hashstore_cid_refs_path(cid) - # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' - self._create_path(Path(os.path.dirname(pid_refs_path))) - self._create_path(Path(os.path.dirname(cid_refs_path))) - - if os.path.exists(pid_refs_path) and os.path.exists(cid_refs_path): - # If both reference files exist, we confirm that reference files are where they - # are expected to be and throw an exception to inform the client that everything - # is in place - and include other issues for context - err_msg = ( - f"FileHashStore - store_hashstore_refs_files: Object with cid: {cid}" - f" already exists and is tagged with pid: {pid}." - ) - try: + try: + # Prepare files and paths + tmp_root_path = self._get_store_path("refs") / "tmp" + pid_refs_path = self._get_hashstore_pid_refs_path(pid) + cid_refs_path = self._get_hashstore_cid_refs_path(cid) + # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' + self._create_path(Path(os.path.dirname(pid_refs_path))) + self._create_path(Path(os.path.dirname(cid_refs_path))) + + if os.path.exists(pid_refs_path) and os.path.exists(cid_refs_path): + # If both reference files exist, we confirm that reference files are where they + # are expected to be and throw an exception to inform the client that everything + # is in place - and include other issues for context + err_msg = ( + f"FileHashStore - store_hashstore_refs_files: Object with cid: {cid}" + f" already exists and is tagged with pid: {pid}." 
+ ) + try: + self._verify_hashstore_references( + pid, + cid, + pid_refs_path, + cid_refs_path, + "Refs file already exists, verifying.", + ) + logging.error(err_msg) + raise HashStoreRefsAlreadyExists(err_msg) + except Exception as e: + rev_msg = err_msg + " " + str(e) + logging.error(rev_msg) + raise HashStoreRefsAlreadyExists(err_msg) + + elif os.path.exists(pid_refs_path) and not os.path.exists( + cid_refs_path + ): + # If pid refs exists, the pid has already been claimed and cannot be tagged we + # throw an exception immediately + error_msg = ( + f"FileHashStore - store_hashstore_refs_files: Pid refs file already exists" + f" for pid: {pid}." + ) + logging.error(error_msg) + raise PidRefsAlreadyExistsError(error_msg) + + elif not os.path.exists(pid_refs_path) and os.path.exists( + cid_refs_path + ): + debug_msg = ( + f"FileHashStore - store_hashstore_refs_files: pid refs file does not exist" + f" for pid {pid} but cid refs file found at: {cid_refs_path} for cid: {cid}" + ) + logging.debug(debug_msg) + # Move the pid refs file + pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") + shutil.move(pid_tmp_file_path, pid_refs_path) + # Update cid ref files as it already exists + if not self._is_string_in_refs_file(pid, cid_refs_path): + self._update_refs_file(cid_refs_path, pid, "add") self._verify_hashstore_references( pid, cid, pid_refs_path, cid_refs_path, - "Refs file already exists, verifying.", + f"Updated existing cid refs file: {cid_refs_path} with pid: {pid}", ) - logging.error(err_msg) - raise HashStoreRefsAlreadyExists(err_msg) - except Exception as e: - rev_msg = err_msg + " " + str(e) - logging.error(rev_msg) - raise HashStoreRefsAlreadyExists(err_msg) - - elif os.path.exists(pid_refs_path) and not os.path.exists(cid_refs_path): - # If pid refs exists, the pid has already been claimed and cannot be tagged we - # throw an exception immediately - error_msg = ( - f"FileHashStore - store_hashstore_refs_files: Pid refs file already exists" - 
f" for pid: {pid}." - ) - logging.error(error_msg) - raise PidRefsAlreadyExistsError(error_msg) + info_msg = ( + "FileHashStore - store_hashstore_refs_files: Successfully updated " + f"cid: {cid} with pid: {pid}" + ) + logging.info(info_msg) + return - elif not os.path.exists(pid_refs_path) and os.path.exists(cid_refs_path): - debug_msg = ( - f"FileHashStore - store_hashstore_refs_files: pid refs file does not exist" - f" for pid {pid} but cid refs file found at: {cid_refs_path} for cid: {cid}" - ) - logging.debug(debug_msg) - # Move the pid refs file + # Move both files after checking the existing status of refs files pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") + cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") shutil.move(pid_tmp_file_path, pid_refs_path) - # Update cid ref files as it already exists - if not self._is_string_in_refs_file(pid, cid_refs_path): - self._update_refs_file(cid_refs_path, pid, "add") + shutil.move(cid_tmp_file_path, cid_refs_path) + log_msg = "Refs files have been moved to their permanent location. Verifying refs." self._verify_hashstore_references( - pid, - cid, - pid_refs_path, - cid_refs_path, - f"Updated existing cid refs file: {cid_refs_path} with pid: {pid}", + pid, cid, pid_refs_path, cid_refs_path, log_msg ) info_msg = ( "FileHashStore - store_hashstore_refs_files: Successfully updated " f"cid: {cid} with pid: {pid}" ) logging.info(info_msg) - return - # Move both files after checking the existing status of refs files - pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") - cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") - shutil.move(pid_tmp_file_path, pid_refs_path) - shutil.move(cid_tmp_file_path, cid_refs_path) - log_msg = "Reference files have been moved to their permanent location. Verifying refs." 
- self._verify_hashstore_references( - pid, cid, pid_refs_path, cid_refs_path, log_msg - ) - info_msg = ( - "FileHashStore - store_hashstore_refs_files: Successfully updated " - f"cid: {cid} with pid: {pid}" - ) - logging.info(info_msg) + except HashStoreRefsAlreadyExists or PidRefsAlreadyExistsError as expected_exceptions: + raise expected_exceptions + + except Exception as unexpected_exception: + # TODO: Untagobject + raise unexpected_exception + finally: # Release cid self._release_object_locked_cids(cid) From c5891d0c4faf8cf25711e5e0cf89f13d422496a7 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 13 Sep 2024 17:57:38 -0700 Subject: [PATCH 344/420] Add todo item in pytests for 'tag_object' and 'store_hashstore_refs_files' --- tests/test_filehashstore_references.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index d1178ba6..8a7e3743 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -15,6 +15,8 @@ # pylint: disable=W0212 +# TODO: Review 'tag_object' tests and split them into relevant 'store_hashstore_refs_files' tests + def test_tag_object(pids, store): """Test tag_object does not throw exception when successful.""" From 61b320cee7a06eadcc7d1b4a63262e8c247c8dda Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 16 Sep 2024 09:11:27 -0700 Subject: [PATCH 345/420] Format & revise '_move_and_get_checksums' docstrings to resolve linting warning --- src/hashstore/filehashstore.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 6743f4f6..3ca09906 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1343,13 +1343,12 @@ def _move_and_get_checksums( validate the object (and delete the tmpFile if the hex digest stored does not match what is provided). - :param str pid: Authority-based identifier. 
- :param Stream stream: Object stream. - when saving. - :param str additional_algorithm: Optional algorithm value to include - when returning hex digests. - :param str checksum: Optional checksum to validate the object - against hex digest before moving to the permanent location. + :param Optional[str] pid: Authority-based identifier. + :param Stream stream: Object stream when saving. + :param str additional_algorithm: Optional algorithm value to include when returning hex + digests. + :param str checksum: Optional checksum to validate the object against hex digest before + moving to the permanent location. :param str checksum_algorithm: Algorithm value of the given checksum. :param int file_size_to_validate: Expected size of the object. From 576ee39c78d2bfc6bd50b4d14cde02ee87fd03ee Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 16 Sep 2024 09:59:07 -0700 Subject: [PATCH 346/420] Add new function '_untag_object' with docstring --- src/hashstore/filehashstore.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 3ca09906..b5e90589 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1708,6 +1708,8 @@ def store_hashstore_refs_files(self, cid, pid): except Exception as unexpected_exception: # TODO: Untagobject + # For all other unexpected exceptions, we are to revert the tagging process as + # much as possible. No exceptions from the reverting process will be thrown. raise unexpected_exception finally: @@ -1715,6 +1717,16 @@ def store_hashstore_refs_files(self, cid, pid): self._release_object_locked_cids(cid) self._release_reference_locked_pids(pid) + def _untag_object(self, pid, cid): + """Untags a data object in HashStore by deleting the 'pid reference file' and removing + the 'pid' from the 'cid reference file'. This method will never delete a data + object. 
_untag_object will attempt to proceed with as much of the untagging process as + possible and swallow relevant exceptions. + + :param str cid: Content identifier + :param str pid: Persistent or authority-based identifier. + """ + def _write_refs_file(self, path, ref_id, ref_type): """Write a reference file in the supplied path into a temporary file. All `pid` or `cid` reference files begin with a single identifier, with the From 551d154a9ec7febd676450fcb939c807d4bc76b9 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 16 Sep 2024 10:40:56 -0700 Subject: [PATCH 347/420] Add new custom exception 'IdentifierNotLocked', add new sync function '_check_reference_locked_pids' and update 'untag_object' --- src/hashstore/filehashstore.py | 26 +++++++++++++++++++++++ src/hashstore/filehashstore_exceptions.py | 9 ++++++++ 2 files changed, 35 insertions(+) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index b5e90589..a00b9706 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -31,6 +31,7 @@ RefsFileExistsButCidObjMissing, UnsupportedAlgorithm, StoreObjectForPidAlreadyInProgress, + IdentifierNotLocked, ) @@ -1726,6 +1727,14 @@ def _untag_object(self, pid, cid): :param str cid: Content identifier :param str pid: Persistent or authority-based identifier. """ + self._check_string(pid, "pid") + self._check_string(cid, "cid") + + delete_list = [] + + # To untag a pid, the pid must be found and currently locked + # The pid will not be released until this process is over + self._check_reference_locked_pids(pid) def _write_refs_file(self, path, ref_id, ref_type): """Write a reference file in the supplied path into a temporary file. @@ -2694,6 +2703,23 @@ def _synchronize_referenced_locked_pids(self, pid): + f" for pid: {pid}" ) + def _check_reference_locked_pids(self, pid): + """Check that a given persistent identifier is currently locked (found in + 'reference_locked_pids' array). If it is not, an exception will be thrown. 
+ + :param str pid: Persistent or authority-based identifier + """ + if self.use_multiprocessing: + if pid not in self.reference_locked_pids_mp: + err_msg = f"_check_reference_locked_pids: pid {pid} is not locked." + logging.error(err_msg) + raise IdentifierNotLocked(err_msg) + else: + if pid not in self.reference_locked_pids_th: + err_msg = f"_check_reference_locked_pids: pid {pid} is not locked." + logging.error(err_msg) + raise IdentifierNotLocked(err_msg) + def _release_reference_locked_pids(self, pid): """Remove the given persistent identifier from 'reference_locked_pids' and notify other waiting threads or processes. diff --git a/src/hashstore/filehashstore_exceptions.py b/src/hashstore/filehashstore_exceptions.py index 65e52139..1453430a 100644 --- a/src/hashstore/filehashstore_exceptions.py +++ b/src/hashstore/filehashstore_exceptions.py @@ -11,6 +11,15 @@ def __init__(self, message, errors=None): self.errors = errors +class IdentifierNotLocked(Exception): + """Custom exception thrown when an identifier (ex. 
'pid' or 'cid') is not locked, which is + required to ensure thread safety.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors + + class CidRefsContentError(Exception): """Custom exception thrown when verifying reference files and a cid refs file does not have a pid that is expected to be found.""" From f0881a4d59e6e0c9974efdb12d7e5b9ca692c799 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 16 Sep 2024 10:57:52 -0700 Subject: [PATCH 348/420] Rename 'CidRefsDoesNotExist' custom exception to 'OrphanPidRefsFileFound' for clarity, update pytests and add skeleton code to 'untag_object' --- src/hashstore/filehashstore.py | 30 ++++++++++++++++++++--- src/hashstore/filehashstore_exceptions.py | 2 +- tests/test_filehashstore.py | 4 +-- 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index a00b9706..94a7a310 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -18,7 +18,7 @@ from hashstore import HashStore from hashstore.filehashstore_exceptions import ( CidRefsContentError, - CidRefsDoesNotExist, + OrphanPidRefsFileFound, CidRefsFileNotFound, HashStoreRefsAlreadyExists, NonMatchingChecksum, @@ -915,7 +915,7 @@ def delete_object(self, pid): for obj in objects_to_delete: os.remove(obj) return - except CidRefsDoesNotExist: + except OrphanPidRefsFileFound: # Delete pid refs file pid_ref_abs_path = str(self._get_hashstore_pid_refs_path(pid)) objects_to_delete.append( @@ -1266,7 +1266,7 @@ def _find_object(self, pid): + f", but cid refs file not found: {cid_ref_abs_path} for pid: {pid}" ) logging.error(err_msg) - raise CidRefsDoesNotExist(err_msg) + raise OrphanPidRefsFileFound(err_msg) else: err_msg = ( f"FileHashStore - find_object: pid refs file not found for pid ({pid}): " @@ -1736,6 +1736,30 @@ def _untag_object(self, pid, cid): # The pid will not be released until this process is over 
self._check_reference_locked_pids(pid) + # Before we begin the untagging process, we look for the `cid` by calling `find_object` + # which will throw custom exceptions if there is an issue with the reference files, + # which help us determine the path to proceed with. + try: + # TODO: find_object + # Check and validate cid + # Remove pid refs + # Remove pid from cid refs + # delete files + return + except OrphanPidRefsFileFound as oprff: + # TODO: Handle orphan pid refs + return + except RefsFileExistsButCidObjMissing as rfebcom: + # TODO: Handle refs existing but data obj missing + return + except PidNotFoundInCidRefsFile as pnficrf: + # TODO: Handle refs exist but pid is not found in cid refs + return + except PidRefsDoesNotExist as prdne: + # TODO: Handle cid refs to ensure pid not found in it + return + + def _write_refs_file(self, path, ref_id, ref_type): """Write a reference file in the supplied path into a temporary file. All `pid` or `cid` reference files begin with a single identifier, with the diff --git a/src/hashstore/filehashstore_exceptions.py b/src/hashstore/filehashstore_exceptions.py index 1453430a..7acb77f8 100644 --- a/src/hashstore/filehashstore_exceptions.py +++ b/src/hashstore/filehashstore_exceptions.py @@ -38,7 +38,7 @@ def __init__(self, message, errors=None): self.errors = errors -class CidRefsDoesNotExist(Exception): +class OrphanPidRefsFileFound(Exception): """Custom exception thrown when a cid refs file does not exist.""" def __init__(self, message, errors=None): diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 8931b02c..51da400d 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -6,7 +6,7 @@ import pytest from hashstore.filehashstore import FileHashStore, ObjectMetadata from hashstore.filehashstore_exceptions import ( - CidRefsDoesNotExist, + OrphanPidRefsFileFound, NonMatchingChecksum, NonMatchingObjSize, PidNotFoundInCidRefsFile, @@ -864,7 +864,7 @@ def 
test_find_object_cid_refs_not_found(pids, store): pid_ref_file.write("intentionally.wrong.pid") pid_ref_file.truncate() - with pytest.raises(CidRefsDoesNotExist): + with pytest.raises(OrphanPidRefsFileFound): store._find_object(pid) From 7f74762493274cd6f6b0639f679aa16474fb6f22 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 16 Sep 2024 13:40:15 -0700 Subject: [PATCH 349/420] Add new functions '_check_object_locked_cids' and '_validate_and_check_cid_lock', update 'untag_object' and add new todo pytest items --- src/hashstore/filehashstore.py | 41 ++++++++++++++++++++++++++++++++-- tests/test_filehashstore.py | 5 +++++ 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 94a7a310..91532c23 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1741,7 +1741,9 @@ def _untag_object(self, pid, cid): # which help us determine the path to proceed with. try: # TODO: find_object - # Check and validate cid + obj_info_dict = self._find_object(pid) + cid_to_check = obj_info_dict["cid"] + self._validate_and_check_cid_lock(pid, cid, cid_to_check) # Remove pid refs # Remove pid from cid refs # delete files @@ -1759,6 +1761,24 @@ def _untag_object(self, pid, cid): # TODO: Handle cid refs to ensure pid not found in it return + def _validate_and_check_cid_lock(self, pid, cid, cid_to_check): + """Confirm that the two content identifiers provided are equal and is locked to ensure + thread safety. 
+ + :param str pid: Persistent identifier + :param str cid: Content identifier + :param str cid_to_check: Cid that was retrieved or read + """ + self._check_string(cid, "cid") + self._check_string(cid_to_check, "cid_to_check") + + if cid is not cid_to_check: + err_msg = ( + f"_validate_and_check_cid_lock: cid provided: {cid_to_check} does not " + f"match untag request for cid: {cid} and pid: {pid}" + ) + raise ValueError(err_msg) + self._check_object_locked_cids(cid) def _write_refs_file(self, path, ref_id, ref_type): """Write a reference file in the supplied path into a temporary file. @@ -2669,6 +2689,23 @@ def _synchronize_object_locked_cids(self, cid): + f" cid: {cid}" ) + def _check_object_locked_cids(self, cid): + """Check that a given content identifier is currently locked (found in the + 'object_locked_cids' array). If it is not, an exception will be thrown. + + :param str cid: Content identifier + """ + if self.use_multiprocessing: + if cid not in self.object_locked_cids_mp: + err_msg = f"_check_object_locked_cids: cid {cid} is not locked." + logging.error(err_msg) + raise IdentifierNotLocked(err_msg) + else: + if cid not in self.object_locked_cids_th: + err_msg = f"_check_object_locked_cids: cid {cid} is not locked." + logging.error(err_msg) + raise IdentifierNotLocked(err_msg) + def _release_object_locked_cids(self, cid): """Remove the given content identifier from 'object_locked_cids' and notify other waiting threads or processes. @@ -2728,7 +2765,7 @@ def _synchronize_referenced_locked_pids(self, pid): ) def _check_reference_locked_pids(self, pid): - """Check that a given persistent identifier is currently locked (found in + """Check that a given persistent identifier is currently locked (found in the 'reference_locked_pids' array). If it is not, an exception will be thrown. 
:param str pid: Persistent or authority-based identifier diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 51da400d..7bd17af2 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1067,6 +1067,11 @@ def test_delete_with_object_metadata_id(pids, store): assert store._count(entity) == 0 +# TODO: Add untag pytest for pid and cid successfully untagged +# TODO: Add untag pytest for exception thrown when pid is not locked +# TODO: Add untag pytest for exception thrown when cid is not locked + + def test_create_path(pids, store): """Test makepath creates folder successfully.""" for pid in pids: From 0c8138eab86b6ef20ea6ea5d31779353bcf40c77 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 16 Sep 2024 14:41:59 -0700 Subject: [PATCH 350/420] Make 'store_hashstore_refs_files' private and fill out main flow of 'untag_object' function --- src/hashstore/filehashstore.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 91532c23..990ac6b3 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -643,7 +643,7 @@ def tag_object(self, pid, cid): self._check_string(cid, "cid") try: - self.store_hashstore_refs_files(cid, pid) + self._store_hashstore_refs_files(cid, pid) except HashStoreRefsAlreadyExists as hrae: err_msg = ( f"FileHashStore - tag_object: reference files for pid: {pid} and {cid} " @@ -1606,7 +1606,7 @@ def delete_tmp_file(): os.umask(oldmask) return tmp - def store_hashstore_refs_files(self, cid, pid): + def _store_hashstore_refs_files(self, cid, pid): """Create the pid refs file and create/update cid refs files in HashStore to establish the relationship between a 'pid' and a 'cid'. 
@@ -1730,7 +1730,7 @@ def _untag_object(self, pid, cid): self._check_string(pid, "pid") self._check_string(cid, "cid") - delete_list = [] + untag_obj_delete_list = [] # To untag a pid, the pid must be found and currently locked # The pid will not be released until this process is over @@ -1740,13 +1740,19 @@ def _untag_object(self, pid, cid): # which will throw custom exceptions if there is an issue with the reference files, # which help us determine the path to proceed with. try: - # TODO: find_object obj_info_dict = self._find_object(pid) cid_to_check = obj_info_dict["cid"] self._validate_and_check_cid_lock(pid, cid, cid_to_check) # Remove pid refs + pid_refs_path_str = obj_info_dict.get("pid_refs_path") + untag_obj_delete_list.append( + self._rename_path_for_deletion(pid_refs_path_str) + ) # Remove pid from cid refs - # delete files + # TODO + # Remove all files confirmed for deletion + for obj in untag_obj_delete_list: + os.remove(obj) return except OrphanPidRefsFileFound as oprff: # TODO: Handle orphan pid refs From 070194d74b5b859e845437ca4b33338fab34e369 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 16 Sep 2024 15:04:42 -0700 Subject: [PATCH 351/420] Revise '_store_hashstore_refs_files' function signature order & update code, add pytests and resolve todo items in 'references' test module --- src/hashstore/filehashstore.py | 6 +- tests/test_filehashstore.py | 106 +++++++++++++++++++++++++ tests/test_filehashstore_references.py | 90 +-------------------- 3 files changed, 113 insertions(+), 89 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 990ac6b3..aa39261a 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -643,7 +643,7 @@ def tag_object(self, pid, cid): self._check_string(cid, "cid") try: - self._store_hashstore_refs_files(cid, pid) + self._store_hashstore_refs_files(pid, cid) except HashStoreRefsAlreadyExists as hrae: err_msg = ( f"FileHashStore - tag_object: reference 
files for pid: {pid} and {cid} " @@ -1606,12 +1606,12 @@ def delete_tmp_file(): os.umask(oldmask) return tmp - def _store_hashstore_refs_files(self, cid, pid): + def _store_hashstore_refs_files(self, pid, cid): """Create the pid refs file and create/update cid refs files in HashStore to establish the relationship between a 'pid' and a 'cid'. - :param str cid: Content identifier :param str pid: Persistent or authority-based identifier. + :param str cid: Content identifier """ try: self._synchronize_referenced_locked_pids(pid) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 7bd17af2..9f36bebc 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -13,6 +13,8 @@ PidRefsDoesNotExist, RefsFileExistsButCidObjMissing, UnsupportedAlgorithm, + HashStoreRefsAlreadyExists, + PidRefsAlreadyExistsError, ) @@ -622,6 +624,110 @@ def test_mktmpfile(store): assert os.path.exists(tmp.name) +def test_store_hashstore_refs_files_(pids, store): + """Test _store_hashstore_refs_files does not throw exception when successful.""" + for pid in pids.keys(): + cid = pids[pid][store.algorithm] + store._store_hashstore_refs_files(pid, cid) + assert store._count("pid") == 3 + assert store._count("cid") == 3 + + +def test_store_hashstore_refs_files_pid_refs_file_exists(pids, store): + """Test _store_hashstore_refs_file creates the expected pid reference file.""" + for pid in pids.keys(): + cid = pids[pid][store.algorithm] + store._store_hashstore_refs_files(pid, cid) + pid_refs_file_path = store._get_hashstore_pid_refs_path(pid) + assert os.path.exists(pid_refs_file_path) + + +def test_store_hashstore_refs_file_cid_refs_file_exists(pids, store): + """Test _store_hashstore_refs_file creates the cid reference file.""" + for pid in pids.keys(): + cid = pids[pid][store.algorithm] + store._store_hashstore_refs_files(pid, cid) + cid_refs_file_path = store._get_hashstore_cid_refs_path(cid) + assert os.path.exists(cid_refs_file_path) + + +def 
test_store_hashstore_refs_file_pid_refs_file_content(pids, store): + """Test _store_hashstore_refs_file created the pid reference file with the expected cid.""" + for pid in pids.keys(): + cid = pids[pid][store.algorithm] + store._store_hashstore_refs_files(pid, cid) + pid_refs_file_path = store._get_hashstore_pid_refs_path(pid) + with open(pid_refs_file_path, "r", encoding="utf8") as f: + pid_refs_cid = f.read() + assert pid_refs_cid == cid + + +def test_store_hashstore_refs_file_cid_refs_file_content(pids, store): + """Test _store_hashstore_refs_file creates the cid reference file successfully with pid + tagged.""" + for pid in pids.keys(): + cid = pids[pid][store.algorithm] + store._store_hashstore_refs_files(pid, cid) + cid_refs_file_path = store._get_hashstore_cid_refs_path(cid) + with open(cid_refs_file_path, "r", encoding="utf8") as f: + pid_refs_cid = f.read().strip() + assert pid_refs_cid == pid + + +def test_store_hashstore_refs_file_pid_refs_found_cid_refs_found(pids, store): + """Test _store_hashstore_refs_file does not throw an exception when any refs file already exists + and verifies the content, and does not double tag the cid refs file.""" + for pid in pids.keys(): + cid = pids[pid][store.algorithm] + store._store_hashstore_refs_files(pid, cid) + + with pytest.raises(HashStoreRefsAlreadyExists): + store.tag_object(pid, cid) + + cid_refs_file_path = store._get_hashstore_cid_refs_path(cid) + line_count = 0 + with open(cid_refs_file_path, "r", encoding="utf8") as ref_file: + for _line in ref_file: + line_count += 1 + assert line_count == 1 + + +def test_store_hashstore_refs_files_pid_refs_found_cid_refs_not_found(store, pids): + """Test that _store_hashstore_refs_files throws an exception when pid refs file exists, + contains a different cid, and is correctly referenced in the associated cid refs file""" + for pid in pids.keys(): + cid = pids[pid][store.algorithm] + store._store_hashstore_refs_files(pid, cid) + + with 
pytest.raises(PidRefsAlreadyExistsError): + store._store_hashstore_refs_files( + pid, "another_cid_value_that_is_not_found" + ) + + +def test_store_hashstore_refs_files_refs_not_found_cid_refs_found(store): + """Test _store_hashstore_refs_files updates a cid reference file that already exists.""" + pid = "jtao.1700.1" + cid = "94f9b6c88f1f458e410c30c351c6384ea42ac1b5ee1f8430d3e365e43b78a38a" + # Tag object + store._store_hashstore_refs_files(pid, cid) + # Tag the cid with another pid + additional_pid = "dou.test.1" + store._store_hashstore_refs_files(additional_pid, cid) + + # Read cid file to confirm cid refs file contains the additional pid + line_count = 0 + cid_ref_abs_path = store._get_hashstore_cid_refs_path(cid) + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + value = line.strip() + line_count += 1 + assert value == pid or value == additional_pid + assert line_count == 2 + assert store._count("pid") == 2 + assert store._count("cid") == 1 + + def test_put_metadata_with_path(pids, store): """Test _put_metadata with path object for the path arg.""" entity = "metadata" diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 8a7e3743..7319758d 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -7,16 +7,12 @@ from hashstore.filehashstore_exceptions import ( CidRefsContentError, CidRefsFileNotFound, - HashStoreRefsAlreadyExists, - PidRefsAlreadyExistsError, PidRefsContentError, PidRefsFileNotFound, ) # pylint: disable=W0212 -# TODO: Review 'tag_object' tests and split them into relevant 'store_hashstore_refs_files' tests - def test_tag_object(pids, store): """Test tag_object does not throw exception when successful.""" @@ -29,88 +25,6 @@ def test_tag_object(pids, store): assert store._count("cid") == 3 -def test_tag_object_pid_refs_file_exists(pids, store): - """Test tag_object creates the expected pid reference file.""" - 
test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(None, path) - store.tag_object(pid, object_metadata.cid) - pid_refs_file_path = store._get_hashstore_pid_refs_path(pid) - assert os.path.exists(pid_refs_file_path) - - -def test_tag_object_cid_refs_file_exists(pids, store): - """Test tag_object creates the cid reference file.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(None, path) - cid = object_metadata.cid - store.tag_object(pid, object_metadata.cid) - cid_refs_file_path = store._get_hashstore_cid_refs_path(cid) - assert os.path.exists(cid_refs_file_path) - - -def test_tag_object_pid_refs_file_content(pids, store): - """Test tag_object created the pid reference file with the expected cid.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(None, path) - store.tag_object(pid, object_metadata.cid) - pid_refs_file_path = store._get_hashstore_pid_refs_path(pid) - with open(pid_refs_file_path, "r", encoding="utf8") as f: - pid_refs_cid = f.read() - assert pid_refs_cid == object_metadata.cid - - -def test_tag_object_cid_refs_file_content(pids, store): - """Test tag_object creates the cid reference file successfully with pid tagged.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(None, path) - store.tag_object(pid, object_metadata.cid) - cid_refs_file_path = store._get_hashstore_cid_refs_path(object_metadata.cid) - with open(cid_refs_file_path, "r", encoding="utf8") as f: - pid_refs_cid = f.read().strip() - assert pid_refs_cid == pid - - -def test_tag_object_pid_refs_found_cid_refs_found(pids, store): - """Test tag_object does not throw an exception when any refs file already exists - and verifies the content, 
and does not double tag the cid refs file.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(None, path) - cid = object_metadata.cid - store.tag_object(pid, cid) - - with pytest.raises(HashStoreRefsAlreadyExists): - store.tag_object(pid, cid) - - cid_refs_file_path = store._get_hashstore_cid_refs_path(object_metadata.cid) - line_count = 0 - with open(cid_refs_file_path, "r", encoding="utf8") as ref_file: - for _line in ref_file: - line_count += 1 - assert line_count == 1 - - -def test_tag_object_pid_refs_found_cid_refs_not_found(store): - """Test that tag_object throws an exception when pid refs file exists, contains a - different cid, and is correctly referenced in the associated cid refs file""" - test_dir = "tests/testdata/" - pid = "jtao.1700.1" - path = test_dir + pid.replace("/", "_") - _object_metadata = store.store_object(pid, path) - - with pytest.raises(PidRefsAlreadyExistsError): - store.tag_object(pid, "another_cid_value_that_is_not_found") - - def test_tag_object_pid_refs_not_found_cid_refs_found(store): """Test tag_object updates a cid reference file that already exists.""" test_dir = "tests/testdata/" @@ -138,6 +52,10 @@ def test_tag_object_pid_refs_not_found_cid_refs_found(store): assert store._count("cid") == 1 +# TODO: Add tag_ojbect test for HashStoreRefsAlreadyExists +# TODO: Add tag_ojbect test for PidRefsAlreadyExistsError + + def test_write_refs_file_ref_type_cid(store): """Test that write_refs_file writes a reference file.""" tmp_root_path = store._get_store_path("refs") / "tmp" From da2e82ced8f61ff2e0a0b742cc64cfbe09d6f4ee Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 16 Sep 2024 15:05:54 -0700 Subject: [PATCH 352/420] Extract 'tag_object' pytests and relevant todo items from 'references' test module into 'interface' test module --- tests/test_filehashstore_interface.py | 42 ++++++++++++++++++++++++++ tests/test_filehashstore_references.py | 42 
-------------------------- 2 files changed, 42 insertions(+), 42 deletions(-) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index a2dba6dc..3ada26eb 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -718,6 +718,48 @@ def test_store_object_sparse_large_file(store): assert object_metadata_id == object_metadata.hex_digests.get("sha256") +def test_tag_object(pids, store): + """Test tag_object does not throw exception when successful.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(None, path) + store.tag_object(pid, object_metadata.cid) + assert store._count("pid") == 3 + assert store._count("cid") == 3 + + +def test_tag_object_pid_refs_not_found_cid_refs_found(store): + """Test tag_object updates a cid reference file that already exists.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid.replace("/", "_") + # Store data only + object_metadata = store.store_object(None, path) + cid = object_metadata.cid + # Tag object + store.tag_object(pid, cid) + # Tag the cid with another pid + additional_pid = "dou.test.1" + store.tag_object(additional_pid, cid) + + # Read cid file to confirm cid refs file contains the additional pid + line_count = 0 + cid_ref_abs_path = store._get_hashstore_cid_refs_path(cid) + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + value = line.strip() + line_count += 1 + assert value == pid or value == additional_pid + assert line_count == 2 + assert store._count("pid") == 2 + assert store._count("cid") == 1 + + +# TODO: Add tag_ojbect test for HashStoreRefsAlreadyExists +# TODO: Add tag_ojbect test for PidRefsAlreadyExistsError + + def test_store_metadata(pids, store): """Test store metadata.""" test_dir = "tests/testdata/" diff --git a/tests/test_filehashstore_references.py 
b/tests/test_filehashstore_references.py index 7319758d..4dd9c419 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -14,48 +14,6 @@ # pylint: disable=W0212 -def test_tag_object(pids, store): - """Test tag_object does not throw exception when successful.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(None, path) - store.tag_object(pid, object_metadata.cid) - assert store._count("pid") == 3 - assert store._count("cid") == 3 - - -def test_tag_object_pid_refs_not_found_cid_refs_found(store): - """Test tag_object updates a cid reference file that already exists.""" - test_dir = "tests/testdata/" - pid = "jtao.1700.1" - path = test_dir + pid.replace("/", "_") - # Store data only - object_metadata = store.store_object(None, path) - cid = object_metadata.cid - # Tag object - store.tag_object(pid, cid) - # Tag the cid with another pid - additional_pid = "dou.test.1" - store.tag_object(additional_pid, cid) - - # Read cid file to confirm cid refs file contains the additional pid - line_count = 0 - cid_ref_abs_path = store._get_hashstore_cid_refs_path(cid) - with open(cid_ref_abs_path, "r", encoding="utf8") as f: - for _, line in enumerate(f, start=1): - value = line.strip() - line_count += 1 - assert value == pid or value == additional_pid - assert line_count == 2 - assert store._count("pid") == 2 - assert store._count("cid") == 1 - - -# TODO: Add tag_ojbect test for HashStoreRefsAlreadyExists -# TODO: Add tag_ojbect test for PidRefsAlreadyExistsError - - def test_write_refs_file_ref_type_cid(store): """Test that write_refs_file writes a reference file.""" tmp_root_path = store._get_store_path("refs") / "tmp" From a9c2f17d8fd3b050c060eff00bb56ae2de798a76 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 16 Sep 2024 15:10:04 -0700 Subject: [PATCH 353/420] Extract remaining pytests from 'references' test module into 'filehashstore' test 
module and delete redundant 'references' test module --- tests/test_filehashstore.py | 228 ++++++++++++++++++++++++ tests/test_filehashstore_references.py | 237 ------------------------- 2 files changed, 228 insertions(+), 237 deletions(-) delete mode 100644 tests/test_filehashstore_references.py diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 9f36bebc..0bd1d67c 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -2,6 +2,7 @@ import io import os +import shutil from pathlib import Path import pytest from hashstore.filehashstore import FileHashStore, ObjectMetadata @@ -15,6 +16,10 @@ UnsupportedAlgorithm, HashStoreRefsAlreadyExists, PidRefsAlreadyExistsError, + CidRefsContentError, + CidRefsFileNotFound, + PidRefsContentError, + PidRefsFileNotFound, ) @@ -728,6 +733,133 @@ def test_store_hashstore_refs_files_refs_not_found_cid_refs_found(store): assert store._count("cid") == 1 +def test_write_refs_file_ref_type_cid(store): + """Test that write_refs_file writes a reference file.""" + tmp_root_path = store._get_store_path("refs") / "tmp" + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, "test_pid", "cid") + assert os.path.exists(tmp_cid_refs_file) + + +def test_write_refs_file_ref_type_cid_content(pids, store): + """Test that write_refs_file writes the expected content.""" + for pid in pids.keys(): + tmp_root_path = store._get_store_path("refs") / "tmp" + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") + with open(tmp_cid_refs_file, "r", encoding="utf8") as f: + cid_ref_file_pid = f.read() + + assert pid == cid_ref_file_pid.strip() + + +def test_write_refs_file_ref_type_pid(pids, store): + """Test that write_pid_refs_file writes a reference file.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + tmp_root_path = store._get_store_path("refs") / "tmp" + tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") + assert os.path.exists(tmp_pid_refs_file) + + +def 
test_write_refs_file_ref_type_content_pid(pids, store): + """Test that write_pid_refs_file writes the expected content.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + tmp_root_path = store._get_store_path("refs") / "tmp" + tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") + with open(tmp_pid_refs_file, "r", encoding="utf8") as f: + pid_refs_cid = f.read() + + assert cid == pid_refs_cid + + +def test_update_refs_file_content(pids, store): + """Test that update_refs_file updates the ref file as expected.""" + for pid in pids.keys(): + tmp_root_path = store._get_store_path("refs") / "tmp" + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") + pid_other = "dou.test.1" + store._update_refs_file(tmp_cid_refs_file, pid_other, "add") + + with open(tmp_cid_refs_file, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + value = line.strip() + assert value == pid or value == pid_other + + +def test_update_refs_file_content_multiple(pids, store): + """Test that _update_refs_file adds multiple references successfully.""" + for pid in pids.keys(): + tmp_root_path = store._get_store_path("refs") / "tmp" + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") + + cid_reference_list = [pid] + for i in range(0, 5): + store._update_refs_file(tmp_cid_refs_file, f"dou.test.{i}", "add") + cid_reference_list.append(f"dou.test.{i}") + + line_count = 0 + with open(tmp_cid_refs_file, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + line_count += 1 + value = line.strip() + assert value in cid_reference_list + + assert line_count == 6 + + +def test_update_refs_file_content_pid_exists(pids, store): + """Test that _update_refs_file does add a pid to a refs file that already + contains the pid.""" + for pid in pids.keys(): + tmp_root_path = store._get_store_path("refs") / "tmp" + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") + # Exception should not be thrown + 
store._update_refs_file(tmp_cid_refs_file, pid, "add") + + line_count = 0 + with open(tmp_cid_refs_file, "r", encoding="utf8") as ref_file: + for _line in ref_file: + line_count += 1 + assert line_count == 1 + + +def test_update_refs_file_content_cid_refs_does_not_exist(pids, store): + """Test that _update_refs_file throws exception if refs file doesn't exist.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store._get_hashstore_cid_refs_path(cid) + with pytest.raises(FileNotFoundError): + store._update_refs_file(cid_ref_abs_path, pid, "add") + + +def test_update_refs_file_remove(pids, store): + """Test that _update_refs_file deletes the given pid from the ref file.""" + for pid in pids.keys(): + tmp_root_path = store._get_store_path("refs") / "tmp" + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") + + pid_other = "dou.test.1" + store._update_refs_file(tmp_cid_refs_file, pid_other, "add") + store._update_refs_file(tmp_cid_refs_file, pid, "remove") + + with open(tmp_cid_refs_file, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + value = line.strip() + assert value == pid_other + + +def test_update_refs_file_empty_file(pids, store): + """Test that _update_refs_file leaves a file empty when removing the last pid.""" + for pid in pids.keys(): + tmp_root_path = store._get_store_path("refs") / "tmp" + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") + # First remove the pid + store._update_refs_file(tmp_cid_refs_file, pid, "remove") + + assert os.path.exists(tmp_cid_refs_file) + assert os.path.getsize(tmp_cid_refs_file) == 0 + + def test_put_metadata_with_path(pids, store): """Test _put_metadata with path object for the path arg.""" entity = "metadata" @@ -917,6 +1049,102 @@ def test_verify_object_information_missing_key_in_hex_digests_supported_algo( ) +def test_verify_hashstore_references_pid_refs_file_missing(pids, store): + """Test _verify_hashstore_references throws 
exception when pid refs file is missing.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + with pytest.raises(PidRefsFileNotFound): + store._verify_hashstore_references(pid, cid) + + +def test_verify_hashstore_references_pid_refs_incorrect_cid(pids, store): + """Test _verify_hashstore_references throws exception when pid refs file cid is incorrect.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + # Write the cid refs file and move it where it needs to be + tmp_root_path = store._get_store_path("refs") / "tmp" + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") + cid_ref_abs_path = store._get_hashstore_cid_refs_path(cid) + print(cid_ref_abs_path) + store._create_path(os.path.dirname(cid_ref_abs_path)) + shutil.move(tmp_cid_refs_file, cid_ref_abs_path) + # Write the pid refs file and move it where it needs to be with a bad cid + pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid) + print(pid_ref_abs_path) + store._create_path(os.path.dirname(pid_ref_abs_path)) + tmp_root_path = store._get_store_path("refs") / "tmp" + tmp_pid_refs_file = store._write_refs_file(tmp_root_path, "bad_cid", "pid") + shutil.move(tmp_pid_refs_file, pid_ref_abs_path) + + with pytest.raises(PidRefsContentError): + store._verify_hashstore_references(pid, cid) + + +def test_verify_hashstore_references_cid_refs_file_missing(pids, store): + """Test _verify_hashstore_references throws exception when cid refs file is missing.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid) + store._create_path(os.path.dirname(pid_ref_abs_path)) + tmp_root_path = store._get_store_path("refs") / "tmp" + tmp_pid_refs_file = store._write_refs_file(tmp_root_path, "bad_cid", "pid") + shutil.move(tmp_pid_refs_file, pid_ref_abs_path) + + with pytest.raises(CidRefsFileNotFound): + store._verify_hashstore_references(pid, cid) + + +def test_verify_hashstore_references_cid_refs_file_missing_pid(pids, store): + 
"""Test _verify_hashstore_references throws exception when cid refs file does not contain + the expected pid.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + # Get a tmp cid refs file and write the wrong pid into it + tmp_root_path = store._get_store_path("refs") / "tmp" + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, "bad pid", "cid") + cid_ref_abs_path = store._get_hashstore_cid_refs_path(cid) + store._create_path(os.path.dirname(cid_ref_abs_path)) + shutil.move(tmp_cid_refs_file, cid_ref_abs_path) + # Now write the pid refs file, both cid and pid refs must be present + pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid) + store._create_path(os.path.dirname(pid_ref_abs_path)) + tmp_root_path = store._get_store_path("refs") / "tmp" + tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") + shutil.move(tmp_pid_refs_file, pid_ref_abs_path) + + with pytest.raises(CidRefsContentError): + store._verify_hashstore_references(pid, cid) + + +def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pid( + pids, store +): + """Test _verify_hashstore_references throws exception when cid refs file with multiple + references does not contain the expected pid.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + # Write the wrong pid into a cid refs file and move it where it needs to be + tmp_root_path = store._get_store_path("refs") / "tmp" + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, "bad pid", "cid") + cid_ref_abs_path = store._get_hashstore_cid_refs_path(cid) + store._create_path(os.path.dirname(cid_ref_abs_path)) + shutil.move(tmp_cid_refs_file, cid_ref_abs_path) + # Now write the pid refs with expected values + pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid) + store._create_path(os.path.dirname(pid_ref_abs_path)) + tmp_root_path = store._get_store_path("refs") / "tmp" + tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") + shutil.move(tmp_pid_refs_file, 
pid_ref_abs_path) + + cid_reference_list = [pid] + for i in range(0, 5): + store._update_refs_file(cid_ref_abs_path, f"dou.test.{i}", "add") + cid_reference_list.append(f"dou.test.{i}") + + with pytest.raises(CidRefsContentError): + store._verify_hashstore_references(pid, cid) + + def test_find_object(pids, store): """Test _find_object returns the correct content.""" test_dir = "tests/testdata/" diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py deleted file mode 100644 index 4dd9c419..00000000 --- a/tests/test_filehashstore_references.py +++ /dev/null @@ -1,237 +0,0 @@ -"""Test module for FileHashStore's reference system to tag stored objects.""" - -import os -import shutil -import pytest - -from hashstore.filehashstore_exceptions import ( - CidRefsContentError, - CidRefsFileNotFound, - PidRefsContentError, - PidRefsFileNotFound, -) - -# pylint: disable=W0212 - - -def test_write_refs_file_ref_type_cid(store): - """Test that write_refs_file writes a reference file.""" - tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_refs_file(tmp_root_path, "test_pid", "cid") - assert os.path.exists(tmp_cid_refs_file) - - -def test_write_refs_file_ref_type_cid_content(pids, store): - """Test that write_refs_file writes the expected content.""" - for pid in pids.keys(): - tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") - with open(tmp_cid_refs_file, "r", encoding="utf8") as f: - cid_ref_file_pid = f.read() - - assert pid == cid_ref_file_pid.strip() - - -def test_update_refs_file_content(pids, store): - """Test that update_refs_file updates the ref file as expected.""" - for pid in pids.keys(): - tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") - pid_other = "dou.test.1" - store._update_refs_file(tmp_cid_refs_file, pid_other, "add") - - with 
open(tmp_cid_refs_file, "r", encoding="utf8") as f: - for _, line in enumerate(f, start=1): - value = line.strip() - assert value == pid or value == pid_other - - -def test_update_refs_file_content_multiple(pids, store): - """Test that _update_refs_file adds multiple references successfully.""" - for pid in pids.keys(): - tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") - - cid_reference_list = [pid] - for i in range(0, 5): - store._update_refs_file(tmp_cid_refs_file, f"dou.test.{i}", "add") - cid_reference_list.append(f"dou.test.{i}") - - line_count = 0 - with open(tmp_cid_refs_file, "r", encoding="utf8") as f: - for _, line in enumerate(f, start=1): - line_count += 1 - value = line.strip() - assert value in cid_reference_list - - assert line_count == 6 - - -def test_update_refs_file_content_pid_exists(pids, store): - """Test that _update_refs_file does add a pid to a refs file that already - contains the pid.""" - for pid in pids.keys(): - tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") - # Exception should not be thrown - store._update_refs_file(tmp_cid_refs_file, pid, "add") - - line_count = 0 - with open(tmp_cid_refs_file, "r", encoding="utf8") as ref_file: - for _line in ref_file: - line_count += 1 - assert line_count == 1 - - -def test_update_refs_file_content_cid_refs_does_not_exist(pids, store): - """Test that _update_refs_file throws exception if refs file doesn't exist.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store._get_hashstore_cid_refs_path(cid) - with pytest.raises(FileNotFoundError): - store._update_refs_file(cid_ref_abs_path, pid, "add") - - -def test_update_refs_file_remove(pids, store): - """Test that _update_refs_file deletes the given pid from the ref file.""" - for pid in pids.keys(): - tmp_root_path = store._get_store_path("refs") / "tmp" - 
tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") - - pid_other = "dou.test.1" - store._update_refs_file(tmp_cid_refs_file, pid_other, "add") - store._update_refs_file(tmp_cid_refs_file, pid, "remove") - - with open(tmp_cid_refs_file, "r", encoding="utf8") as f: - for _, line in enumerate(f, start=1): - value = line.strip() - assert value == pid_other - - -def test_update_refs_file_empty_file(pids, store): - """Test that _update_refs_file leaves a file empty when removing the last pid.""" - for pid in pids.keys(): - tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") - # First remove the pid - store._update_refs_file(tmp_cid_refs_file, pid, "remove") - - assert os.path.exists(tmp_cid_refs_file) - assert os.path.getsize(tmp_cid_refs_file) == 0 - - -def test_write_refs_file_ref_type_pid(pids, store): - """Test that write_pid_refs_file writes a reference file.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") - assert os.path.exists(tmp_pid_refs_file) - - -def test_write_refs_file_ref_type_content_pid(pids, store): - """Test that write_pid_refs_file writes the expected content.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") - with open(tmp_pid_refs_file, "r", encoding="utf8") as f: - pid_refs_cid = f.read() - - assert cid == pid_refs_cid - - -def test_verify_hashstore_references_pid_refs_file_missing(pids, store): - """Test _verify_hashstore_references throws exception when pid refs file is missing.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - with pytest.raises(PidRefsFileNotFound): - store._verify_hashstore_references(pid, cid) - - -def 
test_verify_hashstore_references_pid_refs_incorrect_cid(pids, store): - """Test _verify_hashstore_references throws exception when pid refs file cid is incorrect.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - # Write the cid refs file and move it where it needs to be - tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") - cid_ref_abs_path = store._get_hashstore_cid_refs_path(cid) - print(cid_ref_abs_path) - store._create_path(os.path.dirname(cid_ref_abs_path)) - shutil.move(tmp_cid_refs_file, cid_ref_abs_path) - # Write the pid refs file and move it where it needs to be with a bad cid - pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid) - print(pid_ref_abs_path) - store._create_path(os.path.dirname(pid_ref_abs_path)) - tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_refs_file(tmp_root_path, "bad_cid", "pid") - shutil.move(tmp_pid_refs_file, pid_ref_abs_path) - - with pytest.raises(PidRefsContentError): - store._verify_hashstore_references(pid, cid) - - -def test_verify_hashstore_references_cid_refs_file_missing(pids, store): - """Test _verify_hashstore_references throws exception when cid refs file is missing.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid) - store._create_path(os.path.dirname(pid_ref_abs_path)) - tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_refs_file(tmp_root_path, "bad_cid", "pid") - shutil.move(tmp_pid_refs_file, pid_ref_abs_path) - - with pytest.raises(CidRefsFileNotFound): - store._verify_hashstore_references(pid, cid) - - -def test_verify_hashstore_references_cid_refs_file_missing_pid(pids, store): - """Test _verify_hashstore_references throws exception when cid refs file does not contain - the expected pid.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - # Get a tmp cid refs file and 
write the wrong pid into it - tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_refs_file(tmp_root_path, "bad pid", "cid") - cid_ref_abs_path = store._get_hashstore_cid_refs_path(cid) - store._create_path(os.path.dirname(cid_ref_abs_path)) - shutil.move(tmp_cid_refs_file, cid_ref_abs_path) - # Now write the pid refs file, both cid and pid refs must be present - pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid) - store._create_path(os.path.dirname(pid_ref_abs_path)) - tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") - shutil.move(tmp_pid_refs_file, pid_ref_abs_path) - - with pytest.raises(CidRefsContentError): - store._verify_hashstore_references(pid, cid) - - -def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pid( - pids, store -): - """Test _verify_hashstore_references throws exception when cid refs file with multiple - references does not contain the expected pid.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - # Write the wrong pid into a cid refs file and move it where it needs to be - tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_refs_file(tmp_root_path, "bad pid", "cid") - cid_ref_abs_path = store._get_hashstore_cid_refs_path(cid) - store._create_path(os.path.dirname(cid_ref_abs_path)) - shutil.move(tmp_cid_refs_file, cid_ref_abs_path) - # Now write the pid refs with expected values - pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid) - store._create_path(os.path.dirname(pid_ref_abs_path)) - tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") - shutil.move(tmp_pid_refs_file, pid_ref_abs_path) - - cid_reference_list = [pid] - for i in range(0, 5): - store._update_refs_file(cid_ref_abs_path, f"dou.test.{i}", "add") - cid_reference_list.append(f"dou.test.{i}") - - with 
pytest.raises(CidRefsContentError): - store._verify_hashstore_references(pid, cid) From 341417a6382e6fb39de4ec504ed1c5832ff8b380 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 16 Sep 2024 15:12:51 -0700 Subject: [PATCH 354/420] Extract pytests from 'stream' test module into 'filehashstore' test module and delete redundant 'stream' test module --- tests/test_filehashstore.py | 53 +++++++++++++++++++++++++++- tests/test_filehashstore_stream.py | 56 ------------------------------ 2 files changed, 52 insertions(+), 57 deletions(-) delete mode 100644 tests/test_filehashstore_stream.py diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 0bd1d67c..a45a0201 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -2,10 +2,11 @@ import io import os +import hashlib import shutil from pathlib import Path import pytest -from hashstore.filehashstore import FileHashStore, ObjectMetadata +from hashstore.filehashstore import FileHashStore, ObjectMetadata, Stream from hashstore.filehashstore_exceptions import ( OrphanPidRefsFileFound, NonMatchingChecksum, @@ -1578,6 +1579,56 @@ def test_check_string(store): store._check_string(tab_line, "tab_line") +def test_stream_reads_file(pids): + """Test that a stream can read a file and yield its contents.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path_string = test_dir + pid.replace("/", "_") + obj_stream = Stream(path_string) + hashobj = hashlib.new("sha256") + for data in obj_stream: + hashobj.update(data) + obj_stream.close() + hex_digest = hashobj.hexdigest() + assert pids[pid]["sha256"] == hex_digest + + +def test_stream_reads_path_object(pids): + """Test that a stream can read a file-like object and yield its contents.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = Path(test_dir + pid.replace("/", "_")) + obj_stream = Stream(path) + hashobj = hashlib.new("sha256") + for data in obj_stream: + hashobj.update(data) + obj_stream.close() + hex_digest 
= hashobj.hexdigest() + assert pids[pid]["sha256"] == hex_digest + + +def test_stream_returns_to_original_position_on_close(pids): + """Test that a stream returns to its original position after closing the file.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path_string = test_dir + pid.replace("/", "_") + input_stream = io.open(path_string, "rb") + input_stream.seek(5) + hashobj = hashlib.new("sha256") + obj_stream = Stream(input_stream) + for data in obj_stream: + hashobj.update(data) + obj_stream.close() + assert input_stream.tell() == 5 + input_stream.close() + + +def test_stream_raises_error_for_invalid_object(): + """Test that a stream raises ValueError for an invalid input object.""" + with pytest.raises(ValueError): + Stream(1234) + + def test_objectmetadata(): """Test ObjectMetadata class returns correct values via dot notation.""" pid = "hashstore" diff --git a/tests/test_filehashstore_stream.py b/tests/test_filehashstore_stream.py deleted file mode 100644 index 29fa4d20..00000000 --- a/tests/test_filehashstore_stream.py +++ /dev/null @@ -1,56 +0,0 @@ -"""Test module for FileHashStore's Stream class.""" -import hashlib -import io -from pathlib import Path -import pytest -from hashstore.filehashstore import Stream - - -def test_stream_reads_file(pids): - """Test that a stream can read a file and yield its contents.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path_string = test_dir + pid.replace("/", "_") - obj_stream = Stream(path_string) - hashobj = hashlib.new("sha256") - for data in obj_stream: - hashobj.update(data) - obj_stream.close() - hex_digest = hashobj.hexdigest() - assert pids[pid]["sha256"] == hex_digest - - -def test_stream_reads_path_object(pids): - """Test that a stream can read a file-like object and yield its contents.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = Path(test_dir + pid.replace("/", "_")) - obj_stream = Stream(path) - hashobj = hashlib.new("sha256") - for data in 
obj_stream: - hashobj.update(data) - obj_stream.close() - hex_digest = hashobj.hexdigest() - assert pids[pid]["sha256"] == hex_digest - - -def test_stream_returns_to_original_position_on_close(pids): - """Test that a stream returns to its original position after closing the file.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path_string = test_dir + pid.replace("/", "_") - input_stream = io.open(path_string, "rb") - input_stream.seek(5) - hashobj = hashlib.new("sha256") - obj_stream = Stream(input_stream) - for data in obj_stream: - hashobj.update(data) - obj_stream.close() - assert input_stream.tell() == 5 - input_stream.close() - - -def test_stream_raises_error_for_invalid_object(): - """Test that a stream raises ValueError for an invalid input object.""" - with pytest.raises(ValueError): - Stream(1234) From 3e482fcc8e70346b5c41dfce4b5d7c9b951e425b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 16 Sep 2024 15:29:28 -0700 Subject: [PATCH 355/420] Rename pytests related to retrieving hashstore entity paths for accuracy and revise docstrings for accuracy --- src/hashstore/filehashstore.py | 5 +++-- tests/test_filehashstore.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index aa39261a..77360f54 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2541,7 +2541,7 @@ def _build_hashstore_data_object_path(self, hash_id): def _get_hashstore_data_object_path(self, cid_or_relative_path): """Get the expected path to a hashstore data object that exists using a content identifier. 
- :param str cid_or_relative_path: Content identifier + :param str cid_or_relative_path: Content identifier or relative path in '/objects' to check :return: Path to the data object referenced by the pid :rtype: Path @@ -2570,7 +2570,8 @@ def _get_hashstore_data_object_path(self, cid_or_relative_path): def _get_hashstore_metadata_path(self, metadata_relative_path): """Return the expected metadata path to a hashstore metadata object that exists. - :param str metadata_relative_path: Metadata path to check + :param str metadata_relative_path: Metadata path to check or relative path in + '/metadata' to check :return: Path to the data object referenced by the pid :rtype: Path diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index a45a0201..7a27c781 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1487,7 +1487,7 @@ def test_cast_to_bytes(store): assert isinstance(string_bytes, bytes) -def test_resolve_path_objects(pids, store): +def test_get_hashstore_data_object_path(pids, store): """Confirm resolve path returns correct object path""" test_dir = "tests/testdata/" for pid in pids.keys(): @@ -1501,7 +1501,7 @@ def test_resolve_path_objects(pids, store): assert calculated_obj_path == obj_resolved_path -def test_resolve_path_metadata(pids, store): +def test_get_hashstore_metadata_path_metadata(pids, store): """Confirm resolve path returns correct metadata path.""" test_dir = "tests/testdata/" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" From efa0989449c42e87e9183bbdfe948d2599e8bb4c Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 16 Sep 2024 15:48:22 -0700 Subject: [PATCH 356/420] Resolve todo item in '_store_hashstore_refs_files' to call '_untag_object' --- src/hashstore/filehashstore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 77360f54..29d82d84 100644 --- a/src/hashstore/filehashstore.py +++ 
b/src/hashstore/filehashstore.py @@ -1708,9 +1708,9 @@ def _store_hashstore_refs_files(self, pid, cid): raise expected_exceptions except Exception as unexpected_exception: - # TODO: Untagobject # For all other unexpected exceptions, we are to revert the tagging process as # much as possible. No exceptions from the reverting process will be thrown. + self._untag_object(pid, cid) raise unexpected_exception finally: From 440b3efb2a2335302af969237f335d50bcc8dcb1 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 16 Sep 2024 15:48:58 -0700 Subject: [PATCH 357/420] Add pytests for '_validate_and_check_cid_lock' function --- tests/test_filehashstore.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 7a27c781..353a23c6 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -21,6 +21,7 @@ CidRefsFileNotFound, PidRefsContentError, PidRefsFileNotFound, + IdentifierNotLocked, ) @@ -734,6 +735,25 @@ def test_store_hashstore_refs_files_refs_not_found_cid_refs_found(store): assert store._count("cid") == 1 +def test_validate_and_check_cid_lock_non_matching_cid(store): + """Test that _validate_and_check_cid_lock throws exception when cid is different""" + pid = "dou.test.1" + cid = "thegoodcid" + cid_to_check = "thebadcid" + + with pytest.raises(ValueError): + store._validate_and_check_cid_lock(pid, cid, cid_to_check) + + +def test_validate_and_check_cid_lock_identifier_not_locked(store): + """Test that _validate_and_check_cid_lock throws exception when cid is not locked""" + pid = "dou.test.1" + cid = "thegoodcid" + + with pytest.raises(IdentifierNotLocked): + store._validate_and_check_cid_lock(pid, cid, cid) + + def test_write_refs_file_ref_type_cid(store): """Test that write_refs_file writes a reference file.""" tmp_root_path = store._get_store_path("refs") / "tmp" From be27311196472f51fe4112519799fba82b967ab8 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 16 Sep 2024 
16:16:50 -0700 Subject: [PATCH 358/420] Extract '_mark_pid_refs_file_for_deletion' function (which swallows all exceptions when attempting to rename a pid refs file) from '_untag_object' and add pytest --- src/hashstore/filehashstore.py | 22 +++++++++++++++++++--- tests/test_filehashstore.py | 15 +++++++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 29d82d84..47ddf97f 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1743,13 +1743,13 @@ def _untag_object(self, pid, cid): obj_info_dict = self._find_object(pid) cid_to_check = obj_info_dict["cid"] self._validate_and_check_cid_lock(pid, cid, cid_to_check) + # Remove pid refs pid_refs_path_str = obj_info_dict.get("pid_refs_path") - untag_obj_delete_list.append( - self._rename_path_for_deletion(pid_refs_path_str) + self._mark_pid_refs_file_for_deletion( + pid, untag_obj_delete_list, pid_refs_path_str ) # Remove pid from cid refs - # TODO # Remove all files confirmed for deletion for obj in untag_obj_delete_list: os.remove(obj) @@ -1767,6 +1767,22 @@ def _untag_object(self, pid, cid): # TODO: Handle cid refs to ensure pid not found in it return + def _mark_pid_refs_file_for_deletion(self, pid, delete_list, pid_refs_path): + """Attempt to rename a pid refs file and add the renamed file to a provided list. + + :param str pid: Persistent or authority-based identifier. + :param list delete_list: List to add the renamed pid refs file marked for deletion to + :param str pid_refs_path: Path to the pid reference file + """ + try: + delete_list.append(self._rename_path_for_deletion(pid_refs_path)) + except Exception as e: + err_msg = ( + f"Unable to delete pid refs file: {pid_refs_path} for pid: {pid}. 
" + + str(e) + ) + logging.error(err_msg) + def _validate_and_check_cid_lock(self, pid, cid, cid_to_check): """Confirm that the two content identifiers provided are equal and is locked to ensure thread safety. diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 353a23c6..b38edb2a 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -735,6 +735,21 @@ def test_store_hashstore_refs_files_refs_not_found_cid_refs_found(store): assert store._count("cid") == 1 +def test_mark_pid_refs_file_for_deletion(store): + """Test _mark_pid_refs_file_for_deletion renames a given path for deletion (adds '_delete' to + the path name) and adds it to the given list.""" + pid = "dou.test.1" + cid = "agoodcid" + list_to_check = [] + store._store_hashstore_refs_files(pid, cid) + pid_refs_path = store._get_hashstore_pid_refs_path(pid) + + store._mark_pid_refs_file_for_deletion(pid, list_to_check, pid_refs_path) + + assert len(list_to_check) == 1 + assert "_delete" in str(list_to_check[0]) + + def test_validate_and_check_cid_lock_non_matching_cid(store): """Test that _validate_and_check_cid_lock throws exception when cid is different""" pid = "dou.test.1" From 5ba1228ec5b506fff49742932eee139abe58b280 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 16 Sep 2024 16:49:30 -0700 Subject: [PATCH 359/420] Add new function '_remove_pid_and_handle_cid_refs_deletion' with pytests, and update '_untag_object' --- src/hashstore/filehashstore.py | 33 +++++++++++++++++++++++++++++--- tests/test_filehashstore.py | 35 ++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 3 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 47ddf97f..b1a030c5 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1745,11 +1745,15 @@ def _untag_object(self, pid, cid): self._validate_and_check_cid_lock(pid, cid, cid_to_check) # Remove pid refs - pid_refs_path_str = 
obj_info_dict.get("pid_refs_path") + pid_refs_path = self._get_hashstore_pid_refs_path(pid) self._mark_pid_refs_file_for_deletion( - pid, untag_obj_delete_list, pid_refs_path_str + pid, untag_obj_delete_list, pid_refs_path ) # Remove pid from cid refs + cid_refs_path = self._get_hashstore_cid_refs_path(cid) + self._remove_pid_and_handle_cid_refs_deletion( + pid, untag_obj_delete_list, cid_refs_path + ) # Remove all files confirmed for deletion for obj in untag_obj_delete_list: os.remove(obj) @@ -1772,7 +1776,7 @@ def _mark_pid_refs_file_for_deletion(self, pid, delete_list, pid_refs_path): :param str pid: Persistent or authority-based identifier. :param list delete_list: List to add the renamed pid refs file marked for deletion to - :param str pid_refs_path: Path to the pid reference file + :param path pid_refs_path: Path to the pid reference file """ try: delete_list.append(self._rename_path_for_deletion(pid_refs_path)) @@ -1783,6 +1787,29 @@ def _mark_pid_refs_file_for_deletion(self, pid, delete_list, pid_refs_path): ) logging.error(err_msg) + def _remove_pid_and_handle_cid_refs_deletion(self, pid, delete_list, cid_refs_path): + """Attempt to remove a pid from a 'cid refs file' and add the 'cid refs file' to the + delete list if it is empty. + + :param str pid: Persistent or authority-based identifier. + :param list delete_list: List to add the renamed pid refs file marked for deletion to + :param path cid_refs_path: Path to the pid reference file + """ + try: + # Remove pid from cid reference file + self._update_refs_file(cid_refs_path, pid, "remove") + # Delete cid reference file and object only if the cid refs file is empty + if os.path.getsize(cid_refs_path) == 0: + delete_list.append(self._rename_path_for_deletion(cid_refs_path)) + return + except Exception as e: + err_msg = ( + f"Unable to delete remove pid from cid refs file: {cid_refs_path} for pid:" + f" {pid}. 
" + str(e) + ) + logging.error(err_msg) + return + def _validate_and_check_cid_lock(self, pid, cid, cid_to_check): """Confirm that the two content identifiers provided are equal and is locked to ensure thread safety. diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index b38edb2a..4006ad7d 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -750,6 +750,41 @@ def test_mark_pid_refs_file_for_deletion(store): assert "_delete" in str(list_to_check[0]) +def test_remove_pid_and_handle_cid_refs_deletion_multiple_cid_refs_contains_multi_pids( + store, +): + """Test _remove_pid_and_handle_cid_refs_deletion removes a pid from the cid refs file.""" + pid = "dou.test.1" + pid_two = "dou.test.2" + cid = "agoodcid" + list_to_check = [] + store._store_hashstore_refs_files(pid, cid) + store._store_hashstore_refs_files(pid_two, cid) + + cid_refs_path = store._get_hashstore_cid_refs_path(cid) + store._remove_pid_and_handle_cid_refs_deletion(pid, list_to_check, cid_refs_path) + + assert store._is_string_in_refs_file(pid, cid_refs_path) is False + assert store._count("cid") == 1 + assert len(list_to_check) == 0 + + +def test_remove_pid_and_handle_cid_refs_deletion_cid_refs_empty(store): + """Test _remove_pid_and_handle_cid_refs_deletion removes a pid from the cid refs file and + deletes it when it is empty after removal.""" + pid = "dou.test.1" + cid = "agoodcid" + list_to_check = [] + store._store_hashstore_refs_files(pid, cid) + + cid_refs_path = store._get_hashstore_cid_refs_path(cid) + store._remove_pid_and_handle_cid_refs_deletion(pid, list_to_check, cid_refs_path) + + assert not os.path.exists(cid_refs_path) + assert os.path.exists(cid_refs_path + "_delete") + assert len(list_to_check) == 1 + + def test_validate_and_check_cid_lock_non_matching_cid(store): """Test that _validate_and_check_cid_lock throws exception when cid is different""" pid = "dou.test.1" From 0c28e4e2d455ee4be486e63b930fb5d553fcc2a6 Mon Sep 17 00:00:00 2001 From: 
Dou Mok Date: Mon, 16 Sep 2024 16:55:40 -0700 Subject: [PATCH 360/420] Remove redundant return statements --- src/hashstore/filehashstore.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index b1a030c5..2919f839 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1780,6 +1780,7 @@ def _mark_pid_refs_file_for_deletion(self, pid, delete_list, pid_refs_path): """ try: delete_list.append(self._rename_path_for_deletion(pid_refs_path)) + except Exception as e: err_msg = ( f"Unable to delete pid refs file: {pid_refs_path} for pid: {pid}. " @@ -1801,14 +1802,13 @@ def _remove_pid_and_handle_cid_refs_deletion(self, pid, delete_list, cid_refs_pa # Delete cid reference file and object only if the cid refs file is empty if os.path.getsize(cid_refs_path) == 0: delete_list.append(self._rename_path_for_deletion(cid_refs_path)) - return + except Exception as e: err_msg = ( f"Unable to delete remove pid from cid refs file: {cid_refs_path} for pid:" f" {pid}. 
" + str(e) ) logging.error(err_msg) - return def _validate_and_check_cid_lock(self, pid, cid, cid_to_check): """Confirm that the two content identifiers provided are equal and is locked to ensure From 766d21d1d0dcb4666b25a8c079083230fc545e8b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 17 Sep 2024 09:04:43 -0700 Subject: [PATCH 361/420] Create new function '_delete_marked_files', refactor FileHashStore relevant code and add new pytests --- src/hashstore/filehashstore.py | 41 +++++++++++++++++++++------------- tests/test_filehashstore.py | 34 ++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 16 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 2919f839..bea66334 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -870,8 +870,7 @@ def delete_object(self, pid): self._rename_path_for_deletion(obj_real_path) ) # Remove all files confirmed for deletion - for obj in objects_to_delete: - os.remove(obj) + self._delete_marked_files(objects_to_delete) # Remove metadata files if they exist self.delete_metadata(pid) @@ -912,8 +911,7 @@ def delete_object(self, pid): self.delete_metadata(pid) # Remove all files confirmed for deletion - for obj in objects_to_delete: - os.remove(obj) + self._delete_marked_files(objects_to_delete) return except OrphanPidRefsFileFound: # Delete pid refs file @@ -924,8 +922,7 @@ def delete_object(self, pid): # Remove metadata files if they exist self.delete_metadata(pid) # Remove all files confirmed for deletion - for obj in objects_to_delete: - os.remove(obj) + self._delete_marked_files(objects_to_delete) return except RefsFileExistsButCidObjMissing: # Add pid refs file to be permanently deleted @@ -944,8 +941,7 @@ def delete_object(self, pid): # Remove metadata files if they exist self.delete_metadata(pid) # Remove all files confirmed for deletion - for obj in objects_to_delete: - os.remove(obj) + self._delete_marked_files(objects_to_delete) return 
except PidNotFoundInCidRefsFile: # Add pid refs file to be permanently deleted @@ -956,8 +952,7 @@ def delete_object(self, pid): # Remove metadata files if they exist self.delete_metadata(pid) # Remove all files confirmed for deletion - for obj in objects_to_delete: - os.remove(obj) + self._delete_marked_files(objects_to_delete) return finally: # Release pid @@ -1045,8 +1040,7 @@ def delete_metadata(self, pid, format_id=None): self.metadata_condition_th.notify() # Delete metadata objects - for obj in objects_to_delete: - os.remove(obj) + self._delete_marked_files(objects_to_delete) info_string = ( "FileHashStore - delete_metadata: Successfully deleted all metadata" + f"for pid: {pid}", @@ -1721,7 +1715,7 @@ def _store_hashstore_refs_files(self, pid, cid): def _untag_object(self, pid, cid): """Untags a data object in HashStore by deleting the 'pid reference file' and removing the 'pid' from the 'cid reference file'. This method will never delete a data - object. _untag_object will attempt to proceed with as much of the untagging process as + object. `_untag_object` will attempt to proceed with as much of the untagging process as possible and swallow relevant exceptions. :param str cid: Content identifier @@ -1755,9 +1749,8 @@ def _untag_object(self, pid, cid): pid, untag_obj_delete_list, cid_refs_path ) # Remove all files confirmed for deletion - for obj in untag_obj_delete_list: - os.remove(obj) - return + self._delete_marked_files(untag_obj_delete_list) + except OrphanPidRefsFileFound as oprff: # TODO: Handle orphan pid refs return @@ -1771,6 +1764,22 @@ def _untag_object(self, pid, cid): # TODO: Handle cid refs to ensure pid not found in it return + @staticmethod + def _delete_marked_files(delete_list): + """Delete all the file paths in a given delete list. + + :param list delete_list: Persistent or authority-based identifier. 
+ """ + if delete_list is not None: + for obj in delete_list: + try: + os.remove(obj) + except Exception as e: + warn_msg = f"Unable to remove {obj} in given delete_list. " + str(e) + logging.warning(warn_msg) + else: + raise ValueError("delete_marked_files: list cannot be None") + def _mark_pid_refs_file_for_deletion(self, pid, delete_list, pid_refs_path): """Attempt to rename a pid refs file and add the renamed file to a provided list. diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 4006ad7d..d3df7519 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -735,6 +735,40 @@ def test_store_hashstore_refs_files_refs_not_found_cid_refs_found(store): assert store._count("cid") == 1 +def test_delete_marked_files(store): + """Test that _delete_marked_files removes all items from a given list""" + pid = "jtao.1700.1" + cid = "94f9b6c88f1f458e410c30c351c6384ea42ac1b5ee1f8430d3e365e43b78a38a" + # Tag object + store._store_hashstore_refs_files(pid, cid) + # Tag the cid with another pid + additional_pid = "dou.test.1" + store._store_hashstore_refs_files(additional_pid, cid) + + list_to_check = [] + pid_refs_path = store._get_hashstore_pid_refs_path(pid) + store._mark_pid_refs_file_for_deletion(pid, list_to_check, pid_refs_path) + pid_refs_path_two = store._get_hashstore_pid_refs_path(additional_pid) + store._mark_pid_refs_file_for_deletion(pid, list_to_check, pid_refs_path_two) + + assert len(list_to_check) == 2 + + store._delete_marked_files(list_to_check) + + assert not os.path.exists(list_to_check[0]) + assert not os.path.exists(list_to_check[1]) + + +def test_delete_marked_files_empty_list_or_none(store): + """Test that _delete_marked_files throws exception when supplied 'None' value - and does not + throw any exception when provided with an empty list.""" + list_to_check = [] + store._delete_marked_files(list_to_check) + + with pytest.raises(ValueError): + store._delete_marked_files(None) + + def 
test_mark_pid_refs_file_for_deletion(store): """Test _mark_pid_refs_file_for_deletion renames a given path for deletion (adds '_delete' to the path name) and adds it to the given list.""" From 8ed6638a8c7844727005b0855a1e78374af72246 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 17 Sep 2024 13:17:06 -0700 Subject: [PATCH 362/420] Re-organize functions in 'filehashstore' --- src/hashstore/filehashstore.py | 268 ++++++++++++++++----------------- 1 file changed, 134 insertions(+), 134 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index bea66334..71db4ce1 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1764,6 +1764,93 @@ def _untag_object(self, pid, cid): # TODO: Handle cid refs to ensure pid not found in it return + def _put_metadata(self, metadata, pid, metadata_doc_name): + """Store contents of metadata to `[self.root]/metadata` using the hash of the + given PID and format ID as the permanent address. + + :param mixed metadata: String or path to metadata document. + :param str pid: Authority-based identifier. + :param str metadata_doc_name: Metadata document name + + :return: Address of the metadata document. 
+ :rtype: Path + """ + logging.debug( + "FileHashStore - _put_metadata: Request to put metadata for pid: %s", pid + ) + # Create metadata tmp file and write to it + metadata_stream = Stream(metadata) + with closing(metadata_stream): + metadata_tmp = self._mktmpmetadata(metadata_stream) + + # Get target and related paths (permanent location) + metadata_directory = self._computehash(pid) + metadata_document_name = metadata_doc_name + rel_path = "/".join(self._shard(metadata_directory)) + full_path = self._get_store_path("metadata") / rel_path / metadata_document_name + + # Move metadata to target path + if os.path.exists(metadata_tmp): + try: + parent = full_path.parent + parent.mkdir(parents=True, exist_ok=True) + # Metadata will be replaced if it exists + shutil.move(metadata_tmp, full_path) + logging.debug( + "FileHashStore - _put_metadata: Successfully put metadata for pid: %s", + pid, + ) + return full_path + except Exception as err: + exception_string = ( + f"FileHashStore - _put_metadata: Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + if os.path.exists(metadata_tmp): + # Remove tmp metadata, calling app must re-upload + logging.debug( + "FileHashStore - _put_metadata: Deleting metadata for pid: %s", + pid, + ) + self.metadata.delete(metadata_tmp) + raise + else: + exception_string = ( + f"FileHashStore - _put_metadata: Attempt to move metadata for pid: {pid}" + + f", but metadata temp file not found: {metadata_tmp}" + ) + logging.error(exception_string) + raise FileNotFoundError(exception_string) + + def _mktmpmetadata(self, stream): + """Create a named temporary file with `stream` (metadata). + + :param Stream stream: Metadata stream. + + :return: Path/name of temporary file created and written into. 
+ :rtype: str + """ + # Create temporary file in .../{store_path}/tmp + tmp_root_path = self._get_store_path("metadata") / "tmp" + tmp = self._mktmpfile(tmp_root_path) + + # tmp is a file-like object that is already opened for writing by default + logging.debug( + "FileHashStore - _mktmpmetadata: Writing stream to tmp metadata file: %s", + tmp.name, + ) + with tmp as tmp_file: + for data in stream: + tmp_file.write(self._cast_to_bytes(data)) + + logging.debug( + "FileHashStore - _mktmpmetadata: Successfully written to tmp metadata file: %s", + tmp.name, + ) + return tmp.name + + # FileHashStore Utility & Supporting Methods + @staticmethod def _delete_marked_files(delete_list): """Delete all the file paths in a given delete list. @@ -1947,93 +2034,6 @@ def _is_string_in_refs_file(ref_id, refs_file_path): return True return False - def _put_metadata(self, metadata, pid, metadata_doc_name): - """Store contents of metadata to `[self.root]/metadata` using the hash of the - given PID and format ID as the permanent address. - - :param mixed metadata: String or path to metadata document. - :param str pid: Authority-based identifier. - :param str metadata_doc_name: Metadata document name - - :return: Address of the metadata document. 
- :rtype: Path - """ - logging.debug( - "FileHashStore - _put_metadata: Request to put metadata for pid: %s", pid - ) - # Create metadata tmp file and write to it - metadata_stream = Stream(metadata) - with closing(metadata_stream): - metadata_tmp = self._mktmpmetadata(metadata_stream) - - # Get target and related paths (permanent location) - metadata_directory = self._computehash(pid) - metadata_document_name = metadata_doc_name - rel_path = "/".join(self._shard(metadata_directory)) - full_path = self._get_store_path("metadata") / rel_path / metadata_document_name - - # Move metadata to target path - if os.path.exists(metadata_tmp): - try: - parent = full_path.parent - parent.mkdir(parents=True, exist_ok=True) - # Metadata will be replaced if it exists - shutil.move(metadata_tmp, full_path) - logging.debug( - "FileHashStore - _put_metadata: Successfully put metadata for pid: %s", - pid, - ) - return full_path - except Exception as err: - exception_string = ( - f"FileHashStore - _put_metadata: Unexpected {err=}, {type(err)=}" - ) - logging.error(exception_string) - if os.path.exists(metadata_tmp): - # Remove tmp metadata, calling app must re-upload - logging.debug( - "FileHashStore - _put_metadata: Deleting metadata for pid: %s", - pid, - ) - self.metadata.delete(metadata_tmp) - raise - else: - exception_string = ( - f"FileHashStore - _put_metadata: Attempt to move metadata for pid: {pid}" - + f", but metadata temp file not found: {metadata_tmp}" - ) - logging.error(exception_string) - raise FileNotFoundError(exception_string) - - def _mktmpmetadata(self, stream): - """Create a named temporary file with `stream` (metadata). - - :param Stream stream: Metadata stream. - - :return: Path/name of temporary file created and written into. 
- :rtype: str - """ - # Create temporary file in .../{store_path}/tmp - tmp_root_path = self._get_store_path("metadata") / "tmp" - tmp = self._mktmpfile(tmp_root_path) - - # tmp is a file-like object that is already opened for writing by default - logging.debug( - "FileHashStore - _mktmpmetadata: Writing stream to tmp metadata file: %s", - tmp.name, - ) - with tmp as tmp_file: - for data in stream: - tmp_file.write(self._cast_to_bytes(data)) - - logging.debug( - "FileHashStore - _mktmpmetadata: Successfully written to tmp metadata file: %s", - tmp.name, - ) - return tmp.name - - # FileHashStore Utility & Supporting Methods - def _verify_object_information( self, pid, @@ -2240,37 +2240,6 @@ def _delete_object_only(self, cid): self.object_locked_cids_th.remove(cid) self.object_cid_condition_th.notify() - @staticmethod - def _check_arg_data(data): - """Checks a data argument to ensure that it is either a string, path, or stream - object. - - :param data: Object to validate (string, path, or stream). - :type data: str, os.PathLike, io.BufferedReader - - :return: True if valid. - :rtype: bool - """ - if ( - not isinstance(data, str) - and not isinstance(data, Path) - and not isinstance(data, io.BufferedIOBase) - ): - exception_string = ( - "FileHashStore - _validate_arg_data: Data must be a path, string or buffered" - + f" stream type. Data type supplied: {type(data)}" - ) - logging.error(exception_string) - raise TypeError(exception_string) - if isinstance(data, str): - if data.strip() == "": - exception_string = ( - "FileHashStore - _validate_arg_data: Data string cannot be empty." 
- ) - logging.error(exception_string) - raise TypeError(exception_string) - return True - def _check_arg_algorithms_and_checksum( self, additional_algorithm, checksum, checksum_algorithm ): @@ -2551,21 +2520,6 @@ def _delete(self, entity, file): logging.error(exception_string) raise err - @staticmethod - def _rename_path_for_deletion(path): - """Rename a given path by appending '_delete' and move it to the renamed path. - - :param string path: Path to file to rename - - :return: Path to the renamed file - :rtype: str - """ - if isinstance(path, str): - path = Path(path) - delete_path = path.with_name(path.stem + "_delete" + path.suffix) - shutil.move(path, delete_path) - return delete_path - def _create_path(self, path): """Physically create the folder path (and all intermediate ones) on disk. @@ -2866,6 +2820,23 @@ def _release_reference_locked_pids(self, pid): ) logging.debug(end_sync_debug_msg) + # Other Static Methods + + @staticmethod + def _rename_path_for_deletion(path): + """Rename a given path by appending '_delete' and move it to the renamed path. + + :param string path: Path to file to rename + + :return: Path to the renamed file + :rtype: str + """ + if isinstance(path, str): + path = Path(path) + delete_path = path.with_name(path.stem + "_delete" + path.suffix) + shutil.move(path, delete_path) + return delete_path + @staticmethod def _get_file_paths(directory): """Get the file paths of a given directory if it exists @@ -2886,7 +2857,36 @@ def _get_file_paths(directory): else: return None - # Other Static Methods + @staticmethod + def _check_arg_data(data): + """Checks a data argument to ensure that it is either a string, path, or stream + object. + + :param data: Object to validate (string, path, or stream). + :type data: str, os.PathLike, io.BufferedReader + + :return: True if valid. 
+ :rtype: bool + """ + if ( + not isinstance(data, str) + and not isinstance(data, Path) + and not isinstance(data, io.BufferedIOBase) + ): + exception_string = ( + "FileHashStore - _validate_arg_data: Data must be a path, string or buffered" + + f" stream type. Data type supplied: {type(data)}" + ) + logging.error(exception_string) + raise TypeError(exception_string) + if isinstance(data, str): + if data.strip() == "": + exception_string = ( + "FileHashStore - _validate_arg_data: Data string cannot be empty." + ) + logging.error(exception_string) + raise TypeError(exception_string) + return True @staticmethod def _check_integer(file_size): From fa55ab54dc2463872b6b37c7077120ef37f788ef Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 17 Sep 2024 13:28:21 -0700 Subject: [PATCH 363/420] Add new static method '_read_small_file_content' and refactor 'filehashstore' --- src/hashstore/filehashstore.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 71db4ce1..0e99d3b4 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -931,9 +931,7 @@ def delete_object(self, pid): self._rename_path_for_deletion(pid_ref_abs_path) ) # Remove pid from cid refs file - with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: - # Retrieve the cid - pid_refs_cid = pid_ref_file.read() + pid_refs_cid = self._read_small_file_content(pid_ref_abs_path) cid_ref_abs_str = str(self._get_hashstore_cid_refs_path(pid_refs_cid)) # Remove if the pid refs is found if self._is_string_in_refs_file(pid, cid_ref_abs_str): @@ -1205,8 +1203,7 @@ def _find_object(self, pid): pid_ref_abs_path = self._get_hashstore_pid_refs_path(pid) if os.path.exists(pid_ref_abs_path): # Read the file to get the cid from the pid reference - with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: - pid_refs_cid = pid_ref_file.read() + pid_refs_cid = 
self._read_small_file_content(pid_ref_abs_path) # Confirm that the cid reference file exists cid_ref_abs_path = self._get_hashstore_cid_refs_path(pid_refs_cid) @@ -2168,8 +2165,7 @@ def _verify_hashstore_references( raise CidRefsFileNotFound(exception_string) # Check the content of the reference files # Start with the cid - with open(pid_refs_path, "r", encoding="utf8") as f: - retrieved_cid = f.read() + retrieved_cid = self._read_small_file_content(pid_refs_path) if retrieved_cid != cid: exception_string = ( "FileHashStore - _verify_hashstore_references: Pid refs file exists" @@ -2821,6 +2817,19 @@ def _release_reference_locked_pids(self, pid): logging.debug(end_sync_debug_msg) # Other Static Methods + @staticmethod + def _read_small_file_content(path_to_file): + """Read the contents of a file with the given path. This method is not optimized for + large files - so it should only be used for small files (like reference files). + + :param path path_to_file: Path to the file to read + + :return: Content of the given file + :rtype: str + """ + with open(path_to_file, "r", encoding="utf8") as opened_path: + content = opened_path.read() + return content @staticmethod def _rename_path_for_deletion(path): From 8dfd5aee35884884340482ca3c2909313b7976d7 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 17 Sep 2024 13:58:39 -0700 Subject: [PATCH 364/420] Resolve todo item in '_untag_object' for 'OrphanPidRefsFileFound', add missing logging statement and fix bug in '_validate_and_check_cid_lock' --- src/hashstore/filehashstore.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 0e99d3b4..a24566df 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1747,10 +1747,28 @@ def _untag_object(self, pid, cid): ) # Remove all files confirmed for deletion self._delete_marked_files(untag_obj_delete_list) + info_msg = f"_untag_object: Untagged 
pid: {pid} with cid: {cid}" + logging.info(info_msg) except OrphanPidRefsFileFound as oprff: - # TODO: Handle orphan pid refs - return + # `find_object` throws this exception when the cid refs file doesn't exist, + # so we only need to delete the pid refs file (pid is already locked) + pid_refs_path = self._get_hashstore_pid_refs_path(pid) + cid_read = self._read_small_file_content(pid_refs_path) + self._validate_and_check_cid_lock(pid, cid, cid_read) + + self._mark_pid_refs_file_for_deletion( + pid, untag_obj_delete_list, pid_refs_path + ) + self._delete_marked_files(untag_obj_delete_list) + + warn_msg = ( + f"_untag_object: Cid refs file does not exist for pid: {pid}." + + " Deleted orphan pid refs file. Additional info: " + + str(oprff) + ) + logging.warning(warn_msg) + except RefsFileExistsButCidObjMissing as rfebcom: # TODO: Handle refs existing but data obj missing return @@ -1914,7 +1932,7 @@ def _validate_and_check_cid_lock(self, pid, cid, cid_to_check): self._check_string(cid, "cid") self._check_string(cid_to_check, "cid_to_check") - if cid is not cid_to_check: + if cid != cid_to_check: err_msg = ( f"_validate_and_check_cid_lock: cid provided: {cid_to_check} does not " f"match untag request for cid: {cid} and pid: {pid}" From 18440a0cb861f6920bcbfa6c50285ab0439ff636 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 17 Sep 2024 14:25:55 -0700 Subject: [PATCH 365/420] Add '_untag_object' pytests to resolve todo items in test module --- tests/test_filehashstore.py | 69 +++++++++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 2 deletions(-) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index d3df7519..7f1f8fb6 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1507,8 +1507,73 @@ def test_delete_with_object_metadata_id(pids, store): # TODO: Add untag pytest for pid and cid successfully untagged -# TODO: Add untag pytest for exception thrown when pid is not locked -# TODO: Add untag pytest for 
exception thrown when cid is not locked +def test_untag_object(pids, store): + """Test _untag_object untags successfully.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = Path(test_dir + pid.replace("/", "_")) + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + store._synchronize_referenced_locked_pids(pid) + store._synchronize_object_locked_cids(cid) + store._untag_object(pid, cid) + store._release_reference_locked_pids(pid) + store._release_object_locked_cids(cid) + + assert store._count("pid") == 0 + assert store._count("cid") == 0 + assert store._count("objects") == 3 + + +def test_untag_object_pid_not_locked(pids, store): + """Test _untag_object throws exception when pid is not locked""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = Path(test_dir + pid.replace("/", "_")) + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + with pytest.raises(IdentifierNotLocked): + store._untag_object(pid, cid) + + +def test_untag_object_cid_not_locked(pids, store): + """Test _untag_object throws exception with cid is not locked""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = Path(test_dir + pid.replace("/", "_")) + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + with pytest.raises(IdentifierNotLocked): + store._synchronize_referenced_locked_pids(pid) + store._untag_object(pid, cid) + store._release_reference_locked_pids(pid) + + +def test_untag_object_orphan_pid_refs_file_found(pids, store): + """Test _untag_object removes an orphan pid refs file""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + # Remove cid refs file + cid_refs_abs_path = store._get_hashstore_cid_refs_path(cid) + os.remove(cid_refs_abs_path) + + with pytest.raises(OrphanPidRefsFileFound): + store._find_object(pid) + + 
store._synchronize_referenced_locked_pids(pid) + store._synchronize_object_locked_cids(cid) + store._untag_object(pid, cid) + store._release_reference_locked_pids(pid) + store._release_object_locked_cids(cid) + + assert store._count("pid") == 0 def test_create_path(pids, store): From a998055b1e480079443fc0aa728652a5d811c215 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 17 Sep 2024 15:13:26 -0700 Subject: [PATCH 366/420] Resolve todo item in '_untag_object' for 'RefsFileExistsButCidObjMissing', fix bug in '_untag_object' when catching exceptions and add new pytest --- src/hashstore/filehashstore.py | 33 ++++++++++++++++++++++++++++++--- tests/test_filehashstore.py | 29 ++++++++++++++++++++++++++++- 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index a24566df..4fee2b60 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1695,7 +1695,10 @@ def _store_hashstore_refs_files(self, pid, cid): ) logging.info(info_msg) - except HashStoreRefsAlreadyExists or PidRefsAlreadyExistsError as expected_exceptions: + except ( + HashStoreRefsAlreadyExists, + PidRefsAlreadyExistsError, + ) as expected_exceptions: raise expected_exceptions except Exception as unexpected_exception: @@ -1757,6 +1760,7 @@ def _untag_object(self, pid, cid): cid_read = self._read_small_file_content(pid_refs_path) self._validate_and_check_cid_lock(pid, cid, cid_read) + # Remove pid refs self._mark_pid_refs_file_for_deletion( pid, untag_obj_delete_list, pid_refs_path ) @@ -1770,8 +1774,31 @@ def _untag_object(self, pid, cid): logging.warning(warn_msg) except RefsFileExistsButCidObjMissing as rfebcom: - # TODO: Handle refs existing but data obj missing - return + # `find_object` throws this exception when both pid/cid refs files exist but the + # actual data object does not. 
+ pid_refs_path = self._get_hashstore_pid_refs_path(pid) + cid_read = self._read_small_file_content(pid_refs_path) + self._validate_and_check_cid_lock(pid, cid, cid_read) + + # Remove pid refs + self._mark_pid_refs_file_for_deletion( + pid, untag_obj_delete_list, pid_refs_path + ) + # Remove pid from cid refs + cid_refs_path = self._get_hashstore_cid_refs_path(cid) + self._remove_pid_and_handle_cid_refs_deletion( + pid, untag_obj_delete_list, cid_refs_path + ) + # Remove all files confirmed for deletion + self._delete_marked_files(untag_obj_delete_list) + + warn_msg = ( + f"_untag_object: data object for cid: {cid_read}. does not exist, but pid and cid " + f"references files found for pid: {pid}, Deleted pid and cid refs files. " + f"Additional info: " + str(rfebcom) + ) + logging.warning(warn_msg) + except PidNotFoundInCidRefsFile as pnficrf: # TODO: Handle refs exist but pid is not found in cid refs return diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 7f1f8fb6..df024381 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1506,7 +1506,6 @@ def test_delete_with_object_metadata_id(pids, store): assert store._count(entity) == 0 -# TODO: Add untag pytest for pid and cid successfully untagged def test_untag_object(pids, store): """Test _untag_object untags successfully.""" test_dir = "tests/testdata/" @@ -1576,6 +1575,34 @@ def test_untag_object_orphan_pid_refs_file_found(pids, store): assert store._count("pid") == 0 +def test_untag_object_orphan_refs_exist_but_data_object_not_found(pids, store): + """Test _untag_object removes orphaned pid and cid refs files""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + assert store._count("pid") == 1 + assert store._count("cid") == 1 + + # Remove cid refs file + data_obj_path = store._get_hashstore_data_object_path(cid) + os.remove(data_obj_path) + + with 
pytest.raises(RefsFileExistsButCidObjMissing): + store._find_object(pid) + + store._synchronize_referenced_locked_pids(pid) + store._synchronize_object_locked_cids(cid) + store._untag_object(pid, cid) + store._release_reference_locked_pids(pid) + store._release_object_locked_cids(cid) + + assert store._count("pid") == 0 + assert store._count("cid") == 0 + + def test_create_path(pids, store): """Test makepath creates folder successfully.""" for pid in pids: From 0a15f88528fae49a771227139a0b6d0f2dcbb158 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 17 Sep 2024 15:44:10 -0700 Subject: [PATCH 367/420] Resolve todo item in '_untag_object' for 'PidNotFoundInCidRefsFile' and add new pytest --- src/hashstore/filehashstore.py | 21 +++++++++++++++++++-- tests/test_filehashstore.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 4fee2b60..c63af9ff 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1800,8 +1800,25 @@ def _untag_object(self, pid, cid): logging.warning(warn_msg) except PidNotFoundInCidRefsFile as pnficrf: - # TODO: Handle refs exist but pid is not found in cid refs - return + # `find_object` throws this exception when both the pid and cid refs file exists + # but the pid is not found in the cid refs file + pid_refs_path = self._get_hashstore_pid_refs_path(pid) + cid_read = self._read_small_file_content(pid_refs_path) + self._validate_and_check_cid_lock(pid, cid, cid_read) + + # Remove pid refs + self._mark_pid_refs_file_for_deletion( + pid, untag_obj_delete_list, pid_refs_path + ) + self._delete_marked_files(untag_obj_delete_list) + + warn_msg = ( + f"_untag_object: pid not found in expected cid refs file for pid: {pid}. " + + "Deleted orphan pid refs file. 
Additional info: " + + str(pnficrf) + ) + logging.warning(warn_msg) + except PidRefsDoesNotExist as prdne: # TODO: Handle cid refs to ensure pid not found in it return diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index df024381..fe3e3931 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1603,6 +1603,37 @@ def test_untag_object_orphan_refs_exist_but_data_object_not_found(pids, store): assert store._count("cid") == 0 +def test_untag_object_refs_found_but_pid_not_in_cid_refs(pids, store): + """Test _untag_object removes pid refs file whose pid is not found in the cid refs file.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + pid_two = pid + ".dou" + path = test_dir + pid + object_metadata = store.store_object(pid, path) + _object_metadata_two = store.store_object(pid_two, path) + cid = object_metadata.cid + + assert store._count("pid") == 2 + assert store._count("cid") == 1 + + # Remove pid from cid refs + cid_refs_file = store._get_hashstore_cid_refs_path(cid) + # First remove the pid + store._update_refs_file(cid_refs_file, pid, "remove") + + with pytest.raises(PidNotFoundInCidRefsFile): + store._find_object(pid) + + store._synchronize_referenced_locked_pids(pid) + store._synchronize_object_locked_cids(cid) + store._untag_object(pid, cid) + store._release_reference_locked_pids(pid) + store._release_object_locked_cids(cid) + + assert store._count("pid") == 1 + assert store._count("cid") == 1 + + def test_create_path(pids, store): """Test makepath creates folder successfully.""" for pid in pids: From 8403bf62106a5e8372251ff8e83910131121012d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 17 Sep 2024 15:52:11 -0700 Subject: [PATCH 368/420] Resolve todo item in '_untag_object' for PidRefsDoesNotExist and add new pytests --- src/hashstore/filehashstore.py | 19 +++++++++- tests/test_filehashstore.py | 67 ++++++++++++++++++++++++++++++++-- 2 files changed, 81 insertions(+), 5 deletions(-) diff --git 
a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index c63af9ff..088d25d4 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1820,8 +1820,23 @@ def _untag_object(self, pid, cid): logging.warning(warn_msg) except PidRefsDoesNotExist as prdne: - # TODO: Handle cid refs to ensure pid not found in it - return + # `find_object` throws this exception if the pid refs file is not found + # Check to see if pid is in the 'cid refs file' and attempt to remove it + self._check_object_locked_cids(cid) + + # Remove pid from cid refs + cid_refs_path = self._get_hashstore_cid_refs_path(cid) + self._remove_pid_and_handle_cid_refs_deletion( + pid, untag_obj_delete_list, cid_refs_path + ) + # Remove all files confirmed for deletion + self._delete_marked_files(untag_obj_delete_list) + + warn_msg = ( + f"Pid refs file not found, removed pid from cid refs file for cid: {cid}" + + str(prdne) + ) + logging.warning(warn_msg) def _put_metadata(self, metadata, pid, metadata_doc_name): """Store contents of metadata to `[self.root]/metadata` using the hash of the diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index fe3e3931..ec02eaeb 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1551,7 +1551,7 @@ def test_untag_object_cid_not_locked(pids, store): store._release_reference_locked_pids(pid) -def test_untag_object_orphan_pid_refs_file_found(pids, store): +def test_untag_object_orphan_pid_refs_file_found(store): """Test _untag_object removes an orphan pid refs file""" test_dir = "tests/testdata/" pid = "jtao.1700.1" @@ -1575,7 +1575,7 @@ def test_untag_object_orphan_pid_refs_file_found(pids, store): assert store._count("pid") == 0 -def test_untag_object_orphan_refs_exist_but_data_object_not_found(pids, store): +def test_untag_object_orphan_refs_exist_but_data_object_not_found(store): """Test _untag_object removes orphaned pid and cid refs files""" test_dir = "tests/testdata/" pid = 
"jtao.1700.1" @@ -1603,7 +1603,7 @@ def test_untag_object_orphan_refs_exist_but_data_object_not_found(pids, store): assert store._count("cid") == 0 -def test_untag_object_refs_found_but_pid_not_in_cid_refs(pids, store): +def test_untag_object_refs_found_but_pid_not_in_cid_refs(store): """Test _untag_object removes pid refs file whose pid is not found in the cid refs file.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" @@ -1634,6 +1634,67 @@ def test_untag_object_refs_found_but_pid_not_in_cid_refs(pids, store): assert store._count("cid") == 1 +def test_untag_object_pid_refs_file_does_not_exist(store): + """Test _untag_object removes pid from cid refs file since the pid refs file does not exist, + and does not delete the cid refs file because a reference is still present.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + pid_two = pid + ".dou" + path = test_dir + pid + object_metadata = store.store_object(pid, path) + _object_metadata_two = store.store_object(pid_two, path) + cid = object_metadata.cid + + assert store._count("pid") == 2 + assert store._count("cid") == 1 + + # Remove the pid refs file + pid_refs_file = store._get_hashstore_pid_refs_path(pid) + os.remove(pid_refs_file) + + with pytest.raises(PidRefsDoesNotExist): + store._find_object(pid) + + store._synchronize_referenced_locked_pids(pid) + store._synchronize_object_locked_cids(cid) + store._untag_object(pid, cid) + store._release_reference_locked_pids(pid) + store._release_object_locked_cids(cid) + + assert store._count("pid") == 1 + assert store._count("cid") == 1 + + +def test_untag_object_pid_refs_file_does_not_exist_and_cid_refs_is_empty(store): + """Test '_untag_object' removes pid from cid refs file since the pid refs file does not exist, + and deletes the cid refs file because it contains no more references (after the pid called + with '_untag_object' is removed from the cid refs).""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + object_metadata = 
store.store_object(pid, path) + cid = object_metadata.cid + + assert store._count("pid") == 1 + assert store._count("cid") == 1 + + # Remove the pid refs file + pid_refs_file = store._get_hashstore_pid_refs_path(pid) + os.remove(pid_refs_file) + + with pytest.raises(PidRefsDoesNotExist): + store._find_object(pid) + + store._synchronize_referenced_locked_pids(pid) + store._synchronize_object_locked_cids(cid) + store._untag_object(pid, cid) + store._release_reference_locked_pids(pid) + store._release_object_locked_cids(cid) + + assert store._count("pid") == 0 + assert store._count("cid") == 0 + + def test_create_path(pids, store): """Test makepath creates folder successfully.""" for pid in pids: From 4a1ab3d6238871d74a2bb6136d983c9b22489d04 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 17 Sep 2024 18:18:02 -0700 Subject: [PATCH 369/420] Update 'README.md' to resolve linting warnings and move 'filehashstore' test modules into its respective test folder --- README.md | 8 ++++---- tests/filehashstore/__init__.py | 0 tests/{ => filehashstore}/test_filehashstore.py | 0 tests/{ => filehashstore}/test_filehashstore_interface.py | 0 4 files changed, 4 insertions(+), 4 deletions(-) create mode 100644 tests/filehashstore/__init__.py rename tests/{ => filehashstore}/test_filehashstore.py (100%) rename tests/{ => filehashstore}/test_filehashstore_interface.py (100%) diff --git a/README.md b/README.md index d4feaf10..163fa1c4 100644 --- a/README.md +++ b/README.md @@ -85,8 +85,8 @@ hashstore = hashstore_factory.get_hashstore(module_name, class_name, properties) # Store objects (.../[hashstore_path]/objects/) pid = "j.tao.1700.1" -object = "/path/to/your/object.data" -object_metadata = hashstore.store_object(pid, object) +object_path = "/path/to/your/object.data" +object_metadata = hashstore.store_object(pid, object_path) object_cid = object_metadata.cid # Store metadata (.../[hashstore_path]/metadata/) pid = "j.tao.1700.1" sysmeta = 
"/path/to/your/sysmeta/document.xml" metadata_cid = hashstore.store_metadata(pid, sysmeta) -# If you want to store other types of metadata, add an additional `format_id`. +# If you want to store other types of metadata, include a `format_id`. pid = "j.tao.1700.1" metadata = "/path/to/your/metadata/document.json" format_id = "http://custom.metadata.com/json/type/v1.0" -metadata_cid = hashstore.store_metadata(pid, metadata, format_id) +metadata_cid_two = hashstore.store_metadata(pid, metadata, format_id) # ... ``` diff --git a/tests/filehashstore/__init__.py b/tests/filehashstore/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py similarity index 100% rename from tests/test_filehashstore.py rename to tests/filehashstore/test_filehashstore.py diff --git a/tests/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py similarity index 100% rename from tests/test_filehashstore_interface.py rename to tests/filehashstore/test_filehashstore_interface.py From 65f4eeb202984ffc45dfae1b2451064f41f26b0f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 10:10:15 -0700 Subject: [PATCH 370/420] Update version number and authors in 'pyproject.toml' --- pyproject.toml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 47bf6dbf..92d608f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,9 @@ [tool.poetry] name = "hashstore" -version = "1.0.0" +version = "1.1.0" description = "HashStore, an object storage system using content identifiers." -authors = ["Dou Mok ", "Matt Jones "] +authors = ["Dou Mok ", "Matt Jones ", + "Matthew Brooke", "Jing Tao", "Jeanette Clark", "Ian M. 
Nesbitt"] readme = "README.md" keywords = ["filesystem", "object storage", "hashstore", "storage"] classifiers = [ From a08a6b10e4eb889e59442068fcb211b4f0fa30e7 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 10:46:25 -0700 Subject: [PATCH 371/420] Review and clean up 'filehashstore_interface' test module for 'store_object' & 'tag_object' pytests and resolve todo items for missing 'tag_object' pytests --- .../test_filehashstore_interface.py | 166 +++++++++++------- 1 file changed, 107 insertions(+), 59 deletions(-) diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 3ada26eb..321b923a 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -14,6 +14,8 @@ NonMatchingObjSize, PidRefsDoesNotExist, UnsupportedAlgorithm, + HashStoreRefsAlreadyExists, + PidRefsAlreadyExistsError, ) # pylint: disable=W0212 @@ -29,12 +31,11 @@ def test_store_object_refs_files_and_object(pids, store): """Test store object stores objects and creates reference files.""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): path = Path(test_dir + pid.replace("/", "_")) object_metadata = store.store_object(pid, path) assert object_metadata.cid == pids[pid][store.algorithm] - assert store._count(entity) == 3 + assert store._count("objects") == 3 assert store._count("pid") == 3 assert store._count("cid") == 3 @@ -42,12 +43,11 @@ def test_store_object_refs_files_and_object(pids, store): def test_store_object_only_object(pids, store): """Test store object stores an object only (no reference files will be created)""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): path = Path(test_dir + pid.replace("/", "_")) object_metadata = store.store_object(data=path) assert object_metadata.cid == pids[pid][store.algorithm] - assert store._count(entity) == 3 + assert store._count("objects") == 3 assert 
store._count("pid") == 0 assert store._count("cid") == 0 @@ -55,36 +55,33 @@ def test_store_object_only_object(pids, store): def test_store_object_files_path(pids, store): """Test store object when given a path object.""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): path = Path(test_dir + pid.replace("/", "_")) _object_metadata = store.store_object(pid, path) - assert store._exists(entity, pids[pid][store.algorithm]) - assert store._count(entity) == 3 + assert store._exists("objects", pids[pid][store.algorithm]) + assert store._count("objects") == 3 def test_store_object_files_string(pids, store): """Test store object when given a string object.""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): path_string = test_dir + pid.replace("/", "_") _object_metadata = store.store_object(pid, path_string) - assert store._exists(entity, pids[pid][store.algorithm]) - assert store._count(entity) == 3 + assert store._exists("objects", pids[pid][store.algorithm]) + assert store._count("objects") == 3 def test_store_object_files_input_stream(pids, store): """Test store object when given a stream object.""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") _object_metadata = store.store_object(pid, input_stream) input_stream.close() - assert store._exists(entity, pids[pid][store.algorithm]) - assert store._count(entity) == 3 + assert store._exists("objects", pids[pid][store.algorithm]) + assert store._count("objects") == 3 def test_store_object_cid(pids, store): @@ -170,6 +167,23 @@ def test_store_object_data_incorrect_type_empty_spaces(store): store.store_object(pid, data=path) +def test_store_object_data_incorrect_type_special_characters(store): + """Test store object raises error when data is empty string with special characters""" + pid = "jtao.1700.1" + path = " \n\t" + with pytest.raises(TypeError): + 
store.store_object(pid, data=path) + + +def test_store_object_data_incorrect_type_path_with_special_character(store): + """Test store object raises error when data path contains special characters.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + "\n" + with pytest.raises(ValueError): + store.store_object("", path) + + def test_store_object_additional_algorithm_invalid(store): """Test store object raises error when supplied with unsupported algorithm.""" test_dir = "tests/testdata/" @@ -183,20 +197,18 @@ def test_store_object_additional_algorithm_invalid(store): def test_store_object_additional_algorithm_hyphen_uppercase(pids, store): """Test store object accepts an additional algo that's supported in uppercase.""" test_dir = "tests/testdata/" - entity = "objects" pid = "jtao.1700.1" path = test_dir + pid algorithm_with_hyphen_and_upper = "SHA-384" object_metadata = store.store_object(pid, path, algorithm_with_hyphen_and_upper) sha256_cid = object_metadata.hex_digests.get("sha384") assert sha256_cid == pids[pid]["sha384"] - assert store._exists(entity, pids[pid][store.algorithm]) + assert store._exists("objects", pids[pid][store.algorithm]) def test_store_object_additional_algorithm_hyphen_lowercase(pids, store): """Test store object accepts an additional algo that's supported in lowercase.""" test_dir = "tests/testdata/" - entity = "objects" pid = "jtao.1700.1" path = test_dir + pid algorithm_other = "sha3-256" @@ -206,13 +218,12 @@ def test_store_object_additional_algorithm_hyphen_lowercase(pids, store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) assert additional_sha3_256_hex_digest == sha3_256_checksum - assert store._exists(entity, pids[pid][store.algorithm]) + assert store._exists("objects", pids[pid][store.algorithm]) def test_store_object_additional_algorithm_underscore(pids, store): """Test store object accepts an additional algo that's supported with underscore.""" test_dir = "tests/testdata/" - 
entity = "objects" pid = "jtao.1700.1" path = test_dir + pid algorithm_other = "sha3_256" @@ -222,13 +233,12 @@ def test_store_object_additional_algorithm_underscore(pids, store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) assert additional_sha3_256_hex_digest == sha3_256_checksum - assert store._exists(entity, pids[pid][store.algorithm]) + assert store._exists("objects", pids[pid][store.algorithm]) def test_store_object_checksum_correct(store): """Test store object does not throw exception with good checksum.""" test_dir = "tests/testdata/" - entity = "objects" pid = "jtao.1700.1" path = test_dir + pid checksum_algo = "sha3_256" @@ -238,7 +248,7 @@ def test_store_object_checksum_correct(store): _object_metadata = store.store_object( pid, path, checksum=checksum_correct, checksum_algorithm=checksum_algo ) - assert store._count(entity) == 1 + assert store._count("objects") == 1 def test_store_object_checksum_correct_and_additional_algo(store): @@ -285,18 +295,6 @@ def test_store_object_checksum_correct_and_additional_algo_duplicate(store): assert object_metadata.hex_digests.get("sha3_256") == checksum_correct -def test_store_object_checksum_algorithm_empty(store): - """Test store object raises error when checksum supplied with no checksum_algorithm.""" - test_dir = "tests/testdata/" - pid = "jtao.1700.1" - path = test_dir + pid - checksum_correct = ( - "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" - ) - with pytest.raises(ValueError): - store.store_object(pid, path, checksum=checksum_correct, checksum_algorithm="") - - def test_store_object_checksum_empty(store): """Test store object raises error when checksum_algorithm supplied with an empty checksum.""" @@ -323,21 +321,6 @@ def test_store_object_checksum_empty_spaces(store): ) -def test_store_object_checksum_algorithm_empty_spaces(store): - """Test store object raises error when checksum is supplied and with empty - spaces as the checksum_algorithm.""" - test_dir = 
"tests/testdata/" - pid = "jtao.1700.1" - path = test_dir + pid - checksum_correct = ( - "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" - ) - with pytest.raises(ValueError): - store.store_object( - pid, path, checksum=checksum_correct, checksum_algorithm=" " - ) - - def test_store_object_checksum_incorrect_checksum(store): """Test store object raises error when supplied with incorrect checksum.""" test_dir = "tests/testdata/" @@ -368,19 +351,60 @@ def test_store_object_checksum_unsupported_checksum_algo(store): ) +def test_store_object_checksum_algorithm_empty(store): + """Test store object raises error when checksum supplied with no checksum_algorithm.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + checksum_correct = ( + "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" + ) + with pytest.raises(ValueError): + store.store_object(pid, path, checksum=checksum_correct, checksum_algorithm="") + + +def test_store_object_checksum_algorithm_empty_spaces(store): + """Test store object raises error when checksum is supplied and with empty + spaces as the checksum_algorithm.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + checksum_correct = ( + "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" + ) + with pytest.raises(ValueError): + store.store_object( + pid, path, checksum=checksum_correct, checksum_algorithm=" " + ) + + +def test_store_object_checksum_algorithm_special_character(store): + """Test store object raises error when checksum is supplied and with special characters + as the checksum_algorithm.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + checksum_correct = ( + "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" + ) + with pytest.raises(ValueError): + store.store_object( + pid, path, checksum=checksum_correct, checksum_algorithm="\n" + ) + + def 
test_store_object_duplicate_does_not_store_duplicate(store): """Test that storing duplicate object does not store object twice.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid - entity = "objects" # Store first blob _object_metadata_one = store.store_object(pid, path) # Store second blob pid_that_refs_existing_cid = "dou.test.1" _object_metadata_two = store.store_object(pid_that_refs_existing_cid, path) # Confirm only one object exists and the tmp file created is deleted - assert store._count(entity) == 1 + assert store._count("objects") == 1 def test_store_object_duplicate_object_references_file_count(store): @@ -401,11 +425,12 @@ def test_store_object_duplicate_object_references_file_count(store): assert store._count("pid") == 3 # Confirm that there are 1 cid reference files assert store._count("cid") == 1 + assert store._count("objects") == 1 def test_store_object_duplicate_object_references_file_content(pids, store): """Test that storing duplicate object but different pid updates the cid refs file - with the correct amount of pids.""" + with the correct amount of pids and content.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -417,13 +442,17 @@ def test_store_object_duplicate_object_references_file_content(pids, store): # Store with third pid pid_three = "dou.test.2" store.store_object(pid_three, path) - # Confirm the content of the cid refence files + # Confirm the content of the cid reference files cid_ref_abs_path = store._get_hashstore_cid_refs_path(pids[pid][store.algorithm]) + cid_count = 0 with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): + cid_count += 1 value = line.strip() assert value == pid or value == pid_two or value == pid_three + assert cid_count == 3 + def test_store_object_duplicate_raises_error_with_bad_validation_data(pids, store): """Test store duplicate object throws exception when the data to validate against @@ -431,7 +460,6 @@ def 
test_store_object_duplicate_raises_error_with_bad_validation_data(pids, stor test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid - entity = "objects" # Store first blob _object_metadata_one = store.store_object(pid, path) # Store second blob @@ -439,10 +467,10 @@ def test_store_object_duplicate_raises_error_with_bad_validation_data(pids, stor _object_metadata_two = store.store_object( pid, path, checksum="nonmatchingchecksum", checksum_algorithm="sha256" ) - assert store._count(entity) == 1 + assert store._count("objects") == 1 # Confirm tmp files created during this process was handled assert store._count("tmp") == 0 - assert store._exists(entity, pids[pid][store.algorithm]) + assert store._exists("objects", pids[pid][store.algorithm]) def test_store_object_with_obj_file_size(store, pids): @@ -466,6 +494,7 @@ def test_store_object_with_obj_file_size_incorrect(store, pids): path = test_dir + pid.replace("/", "_") with pytest.raises(NonMatchingObjSize): store.store_object(pid, path, expected_object_size=obj_file_size) + assert store._count("objects") == 0 def test_store_object_with_obj_file_size_non_integer(store, pids): @@ -756,14 +785,33 @@ def test_tag_object_pid_refs_not_found_cid_refs_found(store): assert store._count("cid") == 1 -# TODO: Add tag_ojbect test for HashStoreRefsAlreadyExists -# TODO: Add tag_ojbect test for PidRefsAlreadyExistsError +def test_tag_object_hashstore_refs_already_exist(pids, store): + """Confirm that tag throws HashStoreRefsAlreadyExists when refs already exist""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + + with pytest.raises(HashStoreRefsAlreadyExists): + store.tag_object(pid, object_metadata.cid) + + +def test_tag_object_pid_refs_already_exist(pids, store): + """Confirm that tag throws PidRefsAlreadyExistsError when a pid refs already exists""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path 
= test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + cid_refs_file_path = store._get_hashstore_cid_refs_path(object_metadata.cid) + os.remove(cid_refs_file_path) + + with pytest.raises(PidRefsAlreadyExistsError): + store.tag_object(pid, "adifferentcid") def test_store_metadata(pids, store): """Test store metadata.""" test_dir = "tests/testdata/" - entity = "metadata" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" @@ -777,7 +825,7 @@ def test_store_metadata(pids, store): store._get_store_path("metadata") / rel_path / metadata_document_name ) assert metadata_cid == str(full_path) - assert store._count(entity) == 3 + assert store._count("metadata") == 3 def test_store_metadata_one_pid_multiple_docs_correct_location(store): From 2e67b39a6c24111804495768d52a8b178f0610c9 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 10:47:24 -0700 Subject: [PATCH 372/420] Re-organize 'delete_if_invalid_object' in 'filehashstore' module to match java library interface order --- src/hashstore/filehashstore.py | 48 +++++++++++++++++----------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 088d25d4..588d3043 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -587,6 +587,30 @@ def store_object( return object_metadata + def tag_object(self, pid, cid): + logging.debug( + "FileHashStore - tag_object: Tagging object cid: %s with pid: %s.", + cid, + pid, + ) + self._check_string(pid, "pid") + self._check_string(cid, "cid") + + try: + self._store_hashstore_refs_files(pid, cid) + except HashStoreRefsAlreadyExists as hrae: + err_msg = ( + f"FileHashStore - tag_object: reference files for pid: {pid} and {cid} " + "already exist. 
" + str(hrae) + ) + raise HashStoreRefsAlreadyExists(err_msg) + except PidRefsAlreadyExistsError as praee: + err_msg = ( + f"FileHashStore - tag_object: A pid can only reference one cid. " + + str(praee) + ) + raise PidRefsAlreadyExistsError(err_msg) + def delete_if_invalid_object( self, object_metadata, checksum, checksum_algorithm, expected_file_size ): @@ -633,30 +657,6 @@ def delete_if_invalid_object( object_metadata.cid, ) - def tag_object(self, pid, cid): - logging.debug( - "FileHashStore - tag_object: Tagging object cid: %s with pid: %s.", - cid, - pid, - ) - self._check_string(pid, "pid") - self._check_string(cid, "cid") - - try: - self._store_hashstore_refs_files(pid, cid) - except HashStoreRefsAlreadyExists as hrae: - err_msg = ( - f"FileHashStore - tag_object: reference files for pid: {pid} and {cid} " - "already exist. " + str(hrae) - ) - raise HashStoreRefsAlreadyExists(err_msg) - except PidRefsAlreadyExistsError as praee: - err_msg = ( - f"FileHashStore - tag_object: A pid can only reference one cid. 
" - + str(praee) - ) - raise PidRefsAlreadyExistsError(err_msg) - def store_metadata(self, pid, metadata, format_id=None): logging.debug( "FileHashStore - store_metadata: Request to store metadata for pid: %s", pid From 7ff4722b784c16b106dae535a7cd61e01f5468d1 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 10:52:47 -0700 Subject: [PATCH 373/420] Re-organize and clean up 'delete_if_invalid_object' method pytests in 'filehashstore' interface test module --- .../test_filehashstore_interface.py | 277 +++++++++--------- 1 file changed, 140 insertions(+), 137 deletions(-) diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 321b923a..2fb45f2a 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -809,6 +809,146 @@ def test_tag_object_pid_refs_already_exist(pids, store): store.tag_object(pid, "adifferentcid") +def test_delete_if_invalid_object(pids, store): + """Test delete_if_invalid_object does not throw exception given good arguments.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + store.delete_if_invalid_object( + object_metadata, checksum, checksum_algorithm, expected_file_size + ) + assert store._exists("objects", object_metadata.cid) + + +def test_delete_if_invalid_object_supported_other_algo_not_in_default(pids, store): + """Test delete_if_invalid_object does not throw exception when supported add algo is + supplied.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + supported_algo = "sha224" + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum = pids[pid][supported_algo] + expected_file_size = 
object_metadata.obj_size + store.delete_if_invalid_object( + object_metadata, checksum, supported_algo, expected_file_size + ) + assert store._exists("objects", object_metadata.cid) + + +def test_delete_if_invalid_object_exception_incorrect_object_metadata_type(pids, store): + """Test delete_if_invalid_object throws exception when incorrect obj type is given to + object_metadata arg.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + with pytest.raises(ValueError): + store.delete_if_invalid_object( + "not_object_metadata", checksum, checksum_algorithm, expected_file_size + ) + + +def test_delete_if_invalid_object_exception_incorrect_size(pids, store): + """Test delete_if_invalid_object throws exception when incorrect size is supplied and that data + object is deleted as we are storing without a pid.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + + with pytest.raises(NonMatchingObjSize): + store.delete_if_invalid_object( + object_metadata, checksum, checksum_algorithm, 1000 + ) + + assert not store._exists("objects", object_metadata.cid) + + +def test_delete_if_invalid_object_exception_incorrect_size_object_exists(pids, store): + """Test delete_if_invalid_object throws exception when incorrect size is supplied and that data + object is not deleted since it already exists (a cid refs file is present).""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + store.store_object(pid, data=path) + # Store again without pid and wrong object size + for pid in pids.keys(): + 
path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + + with pytest.raises(NonMatchingObjSize): + store.delete_if_invalid_object( + object_metadata, checksum, checksum_algorithm, 1000 + ) + + assert store._exists("objects", object_metadata.cid) + assert store._count("tmp") == 0 + + +def test_delete_if_invalid_object_exception_incorrect_checksum(pids, store): + """Test delete_if_invalid_object throws exception when incorrect checksum is supplied.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + + with pytest.raises(NonMatchingChecksum): + store.delete_if_invalid_object( + object_metadata, "abc123", checksum_algorithm, expected_file_size + ) + + assert not store._exists("objects", object_metadata.cid) + + +def test_delete_if_invalid_object_exception_incorrect_checksum_algo(pids, store): + """Test delete_if_invalid_object throws exception when unsupported algorithm is supplied.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum = object_metadata.hex_digests.get(store.algorithm) + expected_file_size = object_metadata.obj_size + with pytest.raises(UnsupportedAlgorithm): + store.delete_if_invalid_object( + object_metadata, checksum, "md2", expected_file_size + ) + + assert store._exists("objects", object_metadata.cid) + assert store._count("tmp") == 0 + + +def test_delete_if_invalid_object_exception_supported_other_algo_bad_checksum( + pids, store +): + """Test delete_if_invalid_object throws exception when incorrect checksum is supplied.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", 
"_") + object_metadata = store.store_object(data=path) + checksum = object_metadata.hex_digests.get(store.algorithm) + expected_file_size = object_metadata.obj_size + with pytest.raises(NonMatchingChecksum): + store.delete_if_invalid_object( + object_metadata, checksum, "sha224", expected_file_size + ) + + assert not store._exists("objects", object_metadata.cid) + + def test_store_metadata(pids, store): """Test store metadata.""" test_dir = "tests/testdata/" @@ -1193,143 +1333,6 @@ def test_delete_object_pid_none(store): store.delete_object(pid) -def test_delete_invalid_object(pids, store): - """Test delete_invalid_object does not throw exception given good arguments.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - checksum = object_metadata.hex_digests.get(store.algorithm) - checksum_algorithm = store.algorithm - expected_file_size = object_metadata.obj_size - store.delete_if_invalid_object( - object_metadata, checksum, checksum_algorithm, expected_file_size - ) - assert store._exists("objects", object_metadata.cid) - - -def test_delete_invalid_object_supported_other_algo_not_in_default(pids, store): - """Test delete_invalid_object does not throw exception when supported add algo is supplied.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - supported_algo = "sha224" - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - checksum = pids[pid][supported_algo] - expected_file_size = object_metadata.obj_size - store.delete_if_invalid_object( - object_metadata, checksum, supported_algo, expected_file_size - ) - assert store._exists("objects", object_metadata.cid) - - -def test_delete_invalid_object_exception_incorrect_object_metadata_type(pids, store): - """Test delete_invalid_object throws exception when incorrect class type is given to - object_metadata arg.""" - test_dir = "tests/testdata/" - for pid in 
pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - checksum = object_metadata.hex_digests.get(store.algorithm) - checksum_algorithm = store.algorithm - expected_file_size = object_metadata.obj_size - with pytest.raises(ValueError): - store.delete_if_invalid_object( - "not_object_metadata", checksum, checksum_algorithm, expected_file_size - ) - - -def test_delete_invalid_object_exception_incorrect_size(pids, store): - """Test delete_invalid_object throws exception when incorrect size is supplied and that data - object is deleted as we are storing without a pid.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - checksum = object_metadata.hex_digests.get(store.algorithm) - checksum_algorithm = store.algorithm - - with pytest.raises(NonMatchingObjSize): - store.delete_if_invalid_object( - object_metadata, checksum, checksum_algorithm, 1000 - ) - - assert not store._exists("objects", object_metadata.cid) - - -def test_delete_invalid_object_exception_incorrect_size_object_exists(pids, store): - """Test delete_invalid_object throws exception when incorrect size is supplied and that data - object is not deleted since it already exists (a cid refs file is present).""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - store.store_object(pid, data=path) - # Store again without pid and wrong object size - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - checksum = object_metadata.hex_digests.get(store.algorithm) - checksum_algorithm = store.algorithm - - with pytest.raises(NonMatchingObjSize): - store.delete_if_invalid_object( - object_metadata, checksum, checksum_algorithm, 1000 - ) - - assert store._exists("objects", object_metadata.cid) - assert store._count("tmp") == 0 - - -def 
test_delete_invalid_object_exception_incorrect_checksum(pids, store): - """Test delete_invalid_object throws exception when incorrect checksum is supplied.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - checksum_algorithm = store.algorithm - expected_file_size = object_metadata.obj_size - - with pytest.raises(NonMatchingChecksum): - store.delete_if_invalid_object( - object_metadata, "abc123", checksum_algorithm, expected_file_size - ) - - assert not store._exists("objects", object_metadata.cid) - - -def test_delete_invalid_object_exception_incorrect_checksum_algo(pids, store): - """Test delete_invalid_object throws exception when unsupported algorithm is supplied.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - checksum = object_metadata.hex_digests.get(store.algorithm) - expected_file_size = object_metadata.obj_size - with pytest.raises(UnsupportedAlgorithm): - store.delete_if_invalid_object( - object_metadata, checksum, "md2", expected_file_size - ) - - assert store._exists("objects", object_metadata.cid) - assert store._count("tmp") == 0 - - -def test_delete_invalid_object_exception_supported_other_algo_bad_checksum(pids, store): - """Test delete_invalid_object throws exception when incorrect checksum is supplied.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - checksum = object_metadata.hex_digests.get(store.algorithm) - expected_file_size = object_metadata.obj_size - with pytest.raises(NonMatchingChecksum): - store.delete_if_invalid_object( - object_metadata, checksum, "sha224", expected_file_size - ) - - assert not store._exists("objects", object_metadata.cid) - - def test_delete_metadata(pids, store): """Test delete_metadata successfully deletes 
metadata.""" test_dir = "tests/testdata/" From 73226da4449c456a349e86a2a96a59cc81236ebd Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 12:55:08 -0700 Subject: [PATCH 374/420] Review and clean-up 'store_metadata' pytests --- .../test_filehashstore_interface.py | 92 ++++++++++--------- 1 file changed, 47 insertions(+), 45 deletions(-) diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 2fb45f2a..b87dc0f4 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -950,13 +950,13 @@ def test_delete_if_invalid_object_exception_supported_other_algo_bad_checksum( def test_store_metadata(pids, store): - """Test store metadata.""" + """Test store_metadata.""" test_dir = "tests/testdata/" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - metadata_cid = store.store_metadata(pid, syspath, format_id) + stored_metadata_path = store.store_metadata(pid, syspath, format_id) # Manually calculate expected path metadata_directory = store._computehash(pid) metadata_document_name = store._computehash(pid + format_id) @@ -964,12 +964,12 @@ def test_store_metadata(pids, store): full_path = ( store._get_store_path("metadata") / rel_path / metadata_document_name ) - assert metadata_cid == str(full_path) + assert stored_metadata_path == str(full_path) assert store._count("metadata") == 3 def test_store_metadata_one_pid_multiple_docs_correct_location(store): - """Test store metadata for a pid with multiple metadata documents.""" + """Test store_metadata for a pid with multiple metadata documents.""" test_dir = "tests/testdata/" entity = "metadata" pid = "jtao.1700.1" @@ -980,29 +980,31 @@ def test_store_metadata_one_pid_multiple_docs_correct_location(store): format_id = 
"https://ns.dataone.org/service/types/v2.0#SystemMetadata" format_id3 = "http://ns.dataone.org/service/types/v3.0" format_id4 = "http://ns.dataone.org/service/types/v4.0" - metadata_cid = store.store_metadata(pid, syspath, format_id) - metadata_cid3 = store.store_metadata(pid, syspath, format_id3) - metadata_cid4 = store.store_metadata(pid, syspath, format_id4) + stored_metadata_path = store.store_metadata(pid, syspath, format_id) + stored_metadata_path3 = store.store_metadata(pid, syspath, format_id3) + stored_metadata_path4 = store.store_metadata(pid, syspath, format_id4) + metadata_document_name = store._computehash(pid + format_id) metadata_document_name3 = store._computehash(pid + format_id3) metadata_document_name4 = store._computehash(pid + format_id4) full_path = store._get_store_path("metadata") / rel_path / metadata_document_name full_path3 = store._get_store_path("metadata") / rel_path / metadata_document_name3 full_path4 = store._get_store_path("metadata") / rel_path / metadata_document_name4 - assert metadata_cid == str(full_path) - assert metadata_cid3 == str(full_path3) - assert metadata_cid4 == str(full_path4) + + assert stored_metadata_path == str(full_path) + assert stored_metadata_path3 == str(full_path3) + assert stored_metadata_path4 == str(full_path4) assert store._count(entity) == 3 def test_store_metadata_default_format_id(pids, store): - """Test store metadata returns expected id when storing with default format_id.""" + """Test store_metadata returns expected id when storing with default format_id.""" test_dir = "tests/testdata/" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - metadata_cid = store.store_metadata(pid, syspath) + stored_metadata_path = store.store_metadata(pid, syspath) # Manually calculate expected path metadata_directory = store._computehash(pid) metadata_document_name = store._computehash(pid + 
format_id) @@ -1010,24 +1012,24 @@ def test_store_metadata_default_format_id(pids, store): full_path = ( store._get_store_path("metadata") / rel_path / metadata_document_name ) - assert metadata_cid == str(full_path) + assert stored_metadata_path == str(full_path) def test_store_metadata_files_string(pids, store): - """Test store metadata with a string object to the metadata.""" + """Test store_metadata with a string object to the metadata.""" test_dir = "tests/testdata/" entity = "metadata" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) - metadata_cid = store.store_metadata(pid, syspath_string, format_id) - assert store._exists(entity, metadata_cid) + stored_metadata_path = store.store_metadata(pid, syspath_string, format_id) + assert store._exists(entity, stored_metadata_path) assert store._count(entity) == 3 def test_store_metadata_files_input_stream(pids, store): - """Test store metadata with an input stream to metadata.""" + """Test store_metadata with a stream to the metadata.""" test_dir = "tests/testdata/" entity = "metadata" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" @@ -1035,13 +1037,13 @@ def test_store_metadata_files_input_stream(pids, store): filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) syspath_stream = io.open(syspath_string, "rb") - _metadata_cid = store.store_metadata(pid, syspath_stream, format_id) + _stored_metadata_path = store.store_metadata(pid, syspath_stream, format_id) syspath_stream.close() assert store._count(entity) == 3 def test_store_metadata_pid_empty(store): - """Test store metadata raises error with an empty string as the pid.""" + """Test store_metadata raises error with an empty string as the pid.""" test_dir = "tests/testdata/" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" pid = "" @@ -1052,7 
+1054,7 @@ def test_store_metadata_pid_empty(store): def test_store_metadata_pid_empty_spaces(store): - """Test store metadata raises error with empty spaces as the pid.""" + """Test store_metadata raises error with empty spaces as the pid.""" test_dir = "tests/testdata/" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" pid = " " @@ -1063,7 +1065,7 @@ def test_store_metadata_pid_empty_spaces(store): def test_store_metadata_pid_format_id_spaces(store): - """Test store metadata raises error with empty spaces as the format_id.""" + """Test store_metadata raises error with empty spaces as the format_id.""" test_dir = "tests/testdata/" format_id = " " pid = "jtao.1700.1" @@ -1074,7 +1076,7 @@ def test_store_metadata_pid_format_id_spaces(store): def test_store_metadata_metadata_empty(store): - """Test store metadata raises error with empty spaces as the metadata path.""" + """Test store_metadata raises error with empty spaces as the metadata path.""" pid = "jtao.1700.1" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" syspath_string = " " @@ -1083,7 +1085,7 @@ def test_store_metadata_metadata_empty(store): def test_store_metadata_metadata_none(store): - """Test store metadata raises error with empty None metadata path.""" + """Test store_metadata raises error with empty None metadata path.""" pid = "jtao.1700.1" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" syspath_string = None @@ -1092,7 +1094,7 @@ def test_store_metadata_metadata_none(store): def test_store_metadata_metadata_path(pids, store): - """Test store metadata returns expected path to metadata document.""" + """Test store_metadata returns expected path to metadata document.""" test_dir = "tests/testdata/" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): @@ -1100,13 +1102,13 @@ def test_store_metadata_metadata_path(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / 
filename _object_metadata = store.store_object(pid, path) - metadata_cid = store.store_metadata(pid, syspath, format_id) - metadata_path = store._get_hashstore_metadata_path(metadata_cid) - assert metadata_cid == metadata_path + stored_metadata_path = store.store_metadata(pid, syspath, format_id) + metadata_path = store._get_hashstore_metadata_path(stored_metadata_path) + assert stored_metadata_path == metadata_path def test_store_metadata_thread_lock(store): - """Test store metadata thread lock.""" + """Test store_metadata thread lock.""" test_dir = "tests/testdata/" entity = "metadata" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" @@ -1172,7 +1174,7 @@ def test_retrieve_metadata(store): filename = pid + ".xml" syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) + _stored_metadata_path = store.store_metadata(pid, syspath, format_id) metadata_stream = store.retrieve_metadata(pid, format_id) metadata_content = metadata_stream.read().decode("utf-8") metadata_stream.close() @@ -1188,7 +1190,7 @@ def test_retrieve_metadata_default_format_id(store): filename = pid + ".xml" syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath) + _stored_metadata_path = store.store_metadata(pid, syspath) metadata_stream = store.retrieve_metadata(pid) metadata_content = metadata_stream.read().decode("utf-8") metadata_stream.close() @@ -1222,7 +1224,7 @@ def test_retrieve_metadata_format_id_empty(store): def test_retrieve_metadata_format_id_empty_spaces(store): - """Test retrieve_metadata raises error when supplied with empty spaces asthe format_id.""" + """Test retrieve_metadata raises error when supplied with empty spaces as the format_id.""" format_id = " " pid = "jtao.1700.1" with pytest.raises(ValueError): @@ -1238,7 +1240,7 @@ def test_delete_object_object_deleted(pids, store): 
filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) + _stored_metadata_path = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) assert store._count("objects") == 0 @@ -1253,7 +1255,7 @@ def test_delete_object_metadata_deleted(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) + _stored_metadata_path = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) assert store._count("metadata") == 0 @@ -1267,7 +1269,7 @@ def test_delete_object_all_refs_files_deleted(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) + _stored_metadata_path = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) assert store._count("pid") == 0 assert store._count("cid") == 0 @@ -1282,7 +1284,7 @@ def test_delete_object_pid_refs_file_deleted(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) + _stored_metadata_path = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) pid_refs_file_path = store._get_hashstore_pid_refs_path(pid) assert not os.path.exists(pid_refs_file_path) @@ -1297,7 +1299,7 @@ def test_delete_object_cid_refs_file_deleted(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) + _stored_metadata_path = store.store_metadata(pid, syspath, format_id) cid = 
object_metadata.cid store.delete_object(pid) cid_refs_file_path = store._get_hashstore_cid_refs_path(cid) @@ -1343,7 +1345,7 @@ def test_delete_metadata(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) + _stored_metadata_path = store.store_metadata(pid, syspath, format_id) store.delete_metadata(pid, format_id) assert store._count(entity) == 0 @@ -1359,9 +1361,9 @@ def test_delete_metadata_one_pid_multiple_metadata_documents(store): format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" format_id3 = "http://ns.dataone.org/service/types/v3.0" format_id4 = "http://ns.dataone.org/service/types/v4.0" - _metadata_cid = store.store_metadata(pid, syspath, format_id) - _metadata_cid3 = store.store_metadata(pid, syspath, format_id3) - _metadata_cid4 = store.store_metadata(pid, syspath, format_id4) + _stored_metadata_path = store.store_metadata(pid, syspath, format_id) + _stored_metadata_path3 = store.store_metadata(pid, syspath, format_id3) + _stored_metadata_path4 = store.store_metadata(pid, syspath, format_id4) store.delete_metadata(pid) assert store._count(entity) == 0 @@ -1377,9 +1379,9 @@ def test_delete_metadata_specific_pid_multiple_metadata_documents(store): format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" format_id3 = "http://ns.dataone.org/service/types/v3.0" format_id4 = "http://ns.dataone.org/service/types/v4.0" - _metadata_cid = store.store_metadata(pid, syspath, format_id) - _metadata_cid3 = store.store_metadata(pid, syspath, format_id3) - _metadata_cid4 = store.store_metadata(pid, syspath, format_id4) + _stored_metadata_path = store.store_metadata(pid, syspath, format_id) + _stored_metadata_path3 = store.store_metadata(pid, syspath, format_id3) + _stored_metadata_path4 = store.store_metadata(pid, syspath, format_id4) store.delete_metadata(pid, format_id4) assert 
store._count(entity) == 2 @@ -1401,7 +1403,7 @@ def test_delete_metadata_default_format_id(store, pids): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath) + _stored_metadata_path = store.store_metadata(pid, syspath) store.delete_metadata(pid) assert store._count(entity) == 0 @@ -1439,7 +1441,7 @@ def test_get_hex_digest(store): filename = pid + ".xml" syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) + _stored_metadata_path = store.store_metadata(pid, syspath, format_id) sha3_256_hex_digest = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) From 742eb883a0aa5ad306851cd54596ebaf61903dd2 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 12:59:12 -0700 Subject: [PATCH 375/420] Review and clean-up 'retrieve_object' pytests --- tests/filehashstore/test_filehashstore_interface.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index b87dc0f4..1e5fd213 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -1137,13 +1137,9 @@ def test_store_metadata_thread_lock(store): def test_retrieve_object(pids, store): """Test retrieve_object returns a stream to the correct object data.""" test_dir = "tests/testdata/" - format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename object_metadata = store.store_object(pid, path) - store.store_metadata(pid, syspath, format_id) obj_stream = store.retrieve_object(pid) sha256_hex = store._computehash(obj_stream) obj_stream.close() From 
6a1b072ace6fc6dc63f933e6bbd293dd4f3db07e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 13:01:27 -0700 Subject: [PATCH 376/420] Review and clean-up 'retrieve_metadata' pytests --- tests/filehashstore/test_filehashstore_interface.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 1e5fd213..528bf972 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -1166,10 +1166,8 @@ def test_retrieve_metadata(store): test_dir = "tests/testdata/" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" pid = "jtao.1700.1" - path = test_dir + pid filename = pid + ".xml" syspath = Path(test_dir) / filename - _object_metadata = store.store_object(pid, path) _stored_metadata_path = store.store_metadata(pid, syspath, format_id) metadata_stream = store.retrieve_metadata(pid, format_id) metadata_content = metadata_stream.read().decode("utf-8") @@ -1182,10 +1180,8 @@ def test_retrieve_metadata_default_format_id(store): """Test retrieve_metadata retrieves expected metadata without a format_id.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" - path = test_dir + pid filename = pid + ".xml" syspath = Path(test_dir) / filename - _object_metadata = store.store_object(pid, path) _stored_metadata_path = store.store_metadata(pid, syspath) metadata_stream = store.retrieve_metadata(pid) metadata_content = metadata_stream.read().decode("utf-8") @@ -1195,16 +1191,15 @@ def test_retrieve_metadata_default_format_id(store): def test_retrieve_metadata_bytes_pid_invalid(store): - """Test retrieve_metadata raises error when supplied with bad pid.""" + """Test retrieve_metadata raises exception when supplied with pid with no system metadata.""" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" - pid = "jtao.1700.1" - pid_does_not_exist = pid 
+ "test" + pid_does_not_exist = "jtao.1700.1.metadata.does.not.exist" with pytest.raises(ValueError): store.retrieve_metadata(pid_does_not_exist, format_id) def test_retrieve_metadata_bytes_pid_empty(store): - """Test retrieve_metadata raises error when supplied with empty pid.""" + """Test retrieve_metadata raises exception when supplied with empty pid.""" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" pid = " " with pytest.raises(ValueError): @@ -1220,7 +1215,7 @@ def test_retrieve_metadata_format_id_empty(store): def test_retrieve_metadata_format_id_empty_spaces(store): - """Test retrieve_metadata raises error when supplied with empty spaces as the format_id.""" + """Test retrieve_metadata raises exception when supplied with empty spaces as the format_id.""" format_id = " " pid = "jtao.1700.1" with pytest.raises(ValueError): From 1d5b21574ae59507e08ea616022e0301f2caf69a Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 13:04:24 -0700 Subject: [PATCH 377/420] Review and clean-up 'delete_object' pytests --- tests/filehashstore/test_filehashstore_interface.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 528bf972..31caaf87 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -1237,8 +1237,7 @@ def test_delete_object_object_deleted(pids, store): def test_delete_object_metadata_deleted(pids, store): - """Test delete_object successfully deletes relevant metadata - files and refs files.""" + """Test delete_object successfully deletes associated metadata files.""" test_dir = "tests/testdata/" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): @@ -1251,7 +1250,7 @@ def test_delete_object_metadata_deleted(pids, store): assert store._count("metadata") == 0 -def 
test_delete_object_all_refs_files_deleted(pids, store): +def test_delete_object_refs_files_deleted(pids, store): """Test delete_object successfully deletes refs files.""" test_dir = "tests/testdata/" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" @@ -1305,11 +1304,11 @@ def test_delete_object_cid_refs_file_with_pid_refs_remaining(pids, store): object_metadata = store.store_object(pid, path) cid = object_metadata.cid cid_refs_abs_path = store._get_hashstore_cid_refs_path(cid) - # pylint: disable=W0212 store._update_refs_file(cid_refs_abs_path, "dou.test.1", "add") store.delete_object(pid) cid_refs_file_path = store._get_hashstore_cid_refs_path(cid) assert os.path.exists(cid_refs_file_path) + assert store._count("cid") == 3 def test_delete_object_pid_empty(store): From cf1f11c815f0342f519cbfd6235d7b8f01a29f57 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 13:08:24 -0700 Subject: [PATCH 378/420] Review and clean-up 'delete_metadata' pytests --- .../test_filehashstore_interface.py | 24 ++++++++----------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 31caaf87..236ed6f9 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -1328,21 +1328,18 @@ def test_delete_object_pid_none(store): def test_delete_metadata(pids, store): """Test delete_metadata successfully deletes metadata.""" test_dir = "tests/testdata/" - entity = "metadata" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _object_metadata = store.store_object(pid, path) _stored_metadata_path = store.store_metadata(pid, syspath, format_id) store.delete_metadata(pid, format_id) - assert store._count(entity) == 0 + 
assert store._count("metadata") == 0 def test_delete_metadata_one_pid_multiple_metadata_documents(store): """Test delete_metadata for a pid with multiple metadata documents deletes - all metadata files as expected.""" + all associated metadata files as expected.""" test_dir = "tests/testdata/" entity = "metadata" pid = "jtao.1700.1" @@ -1369,11 +1366,13 @@ def test_delete_metadata_specific_pid_multiple_metadata_documents(store): format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" format_id3 = "http://ns.dataone.org/service/types/v3.0" format_id4 = "http://ns.dataone.org/service/types/v4.0" - _stored_metadata_path = store.store_metadata(pid, syspath, format_id) - _stored_metadata_path3 = store.store_metadata(pid, syspath, format_id3) + stored_metadata_path = store.store_metadata(pid, syspath, format_id) + stored_metadata_path3 = store.store_metadata(pid, syspath, format_id3) _stored_metadata_path4 = store.store_metadata(pid, syspath, format_id4) store.delete_metadata(pid, format_id4) assert store._count(entity) == 2 + assert os.path.exists(stored_metadata_path) + assert os.path.exists(stored_metadata_path3) def test_delete_metadata_does_not_exist(pids, store): @@ -1387,19 +1386,16 @@ def test_delete_metadata_does_not_exist(pids, store): def test_delete_metadata_default_format_id(store, pids): """Test delete_metadata deletes successfully with default format_id.""" test_dir = "tests/testdata/" - entity = "metadata" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _object_metadata = store.store_object(pid, path) _stored_metadata_path = store.store_metadata(pid, syspath) store.delete_metadata(pid) - assert store._count(entity) == 0 + assert store._count("metadata") == 0 def test_delete_metadata_pid_empty(store): - """Test delete_object raises error when empty pid supplied.""" + """Test delete_metadata raises error when empty pid supplied.""" format_id = 
"https://ns.dataone.org/service/types/v2.0#SystemMetadata" pid = " " with pytest.raises(ValueError): @@ -1407,7 +1403,7 @@ def test_delete_metadata_pid_empty(store): def test_delete_metadata_pid_none(store): - """Test delete_object raises error when pid is 'None'.""" + """Test delete_metadata raises error when pid is 'None'.""" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" pid = None with pytest.raises(ValueError): @@ -1415,7 +1411,7 @@ def test_delete_metadata_pid_none(store): def test_delete_metadata_format_id_empty(store): - """Test delete_object raises error when empty format_id supplied.""" + """Test delete_metadata raises error when empty format_id supplied.""" format_id = " " pid = "jtao.1700.1" with pytest.raises(ValueError): From 56a43155be9c1346a81adcdfefa5ba91a9450d4a Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 13:11:36 -0700 Subject: [PATCH 379/420] Review and clean-up 'get_hex_digest' pytests --- tests/filehashstore/test_filehashstore_interface.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 236ed6f9..791d9d82 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -1421,13 +1421,9 @@ def test_delete_metadata_format_id_empty(store): def test_get_hex_digest(store): """Test get_hex_digest for expected value.""" test_dir = "tests/testdata/" - format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" pid = "jtao.1700.1" path = test_dir + pid - filename = pid + ".xml" - syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) - _stored_metadata_path = store.store_metadata(pid, syspath, format_id) sha3_256_hex_digest = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) From 9ea95726b4a674e95b8396ed3ec1a5a997a78324 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 
2024 13:26:32 -0700 Subject: [PATCH 380/420] Fix bug in 'find_object' where sysmeta path was not returned due to incorrect method call (isdir vs. exists), and clean-up & review find_object pytests --- src/hashstore/filehashstore.py | 2 +- tests/filehashstore/test_filehashstore.py | 234 ++++++++++++---------- 2 files changed, 129 insertions(+), 107 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 588d3043..503e0236 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1237,7 +1237,7 @@ def _find_object(self, pid): "pid_refs_path": pid_ref_abs_path, "sysmeta_path": ( sysmeta_full_path - if os.path.isdir(sysmeta_full_path) + if os.path.exists(sysmeta_full_path) else "Does not exist." ), } diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index ec02eaeb..3bcf5a70 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -42,8 +42,9 @@ def test_init_directories_created(store): def test_init_existing_store_incorrect_algorithm_format(store): - """Confirm that exception is thrown when store_algorithm is not a DataONE - controlled value.""" + """Confirm that exception is thrown when store_algorithm is not a DataONE controlled value ( + the string must exactly match the expected format). 
DataONE uses the library of congress + vocabulary to standardize algorithm types.""" properties = { "store_path": store.root + "/incorrect_algo_format", "store_depth": 3, @@ -180,7 +181,6 @@ def test_validate_properties(store): "store_algorithm": "SHA-256", "store_metadata_namespace": "https://ns.dataone.org/service/types/v2.0#SystemMetadata", } - # pylint: disable=W0212 assert store._validate_properties(properties) @@ -193,7 +193,6 @@ def test_validate_properties_missing_key(store): "store_algorithm": "SHA-256", } with pytest.raises(KeyError): - # pylint: disable=W0212 store._validate_properties(properties) @@ -207,7 +206,6 @@ def test_validate_properties_key_value_is_none(store): "store_metadata_namespace": None, } with pytest.raises(ValueError): - # pylint: disable=W0212 store._validate_properties(properties) @@ -215,7 +213,6 @@ def test_validate_properties_incorrect_type(store): """Confirm exception raised when a bad properties value is given.""" properties = "etc/filehashstore/hashstore.yaml" with pytest.raises(ValueError): - # pylint: disable=W0212 store._validate_properties(properties) @@ -228,7 +225,6 @@ def test_set_default_algorithms_missing_yaml(store, pids): store._store_and_validate_data(pid, path) os.remove(store.hashstore_configuration_yaml) with pytest.raises(FileNotFoundError): - # pylint: disable=W0212 store._set_default_algorithms() @@ -238,11 +234,10 @@ def test_set_default_algorithms_missing_yaml(store, pids): def test_store_and_validate_data_files_path(pids, store): """Test _store_and_validate_data accepts path object for the path arg.""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): path = Path(test_dir) / pid.replace("/", "_") object_metadata = store._store_and_validate_data(pid, path) - assert store._exists(entity, object_metadata.cid) + assert store._exists("objects", object_metadata.cid) def test_store_and_validate_data_files_string(pids, store): @@ -252,20 +247,19 @@ def 
test_store_and_validate_data_files_string(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store._store_and_validate_data(pid, path) - assert store._exists(entity, object_metadata.cid) + assert store._exists("objects", object_metadata.cid) def test_store_and_validate_data_files_stream(pids, store): """Test _store_and_validate_data accepts stream for the path arg.""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") object_metadata = store._store_and_validate_data(pid, input_stream) input_stream.close() - assert store._exists(entity, object_metadata.cid) - assert store._count(entity) == 3 + assert store._exists("objects", object_metadata.cid) + assert store._count("objects") == 3 def test_store_and_validate_data_cid(pids, store): @@ -373,6 +367,127 @@ def test_store_data_only_hex_digests(pids, store): assert object_metadata.hex_digests.get("sha512") == pids[pid]["sha512"] +def test_find_object_no_sysmeta(pids, store): + """Test _find_object returns the correct content and expected value for non-existent sysmeta.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + obj_info_dict = store._find_object(pid) + retrieved_cid = obj_info_dict["cid"] + + assert retrieved_cid == object_metadata.hex_digests.get("sha256") + + data_object_path = store._get_hashstore_data_object_path(retrieved_cid) + assert data_object_path == obj_info_dict["cid_object_path"] + + cid_refs_path = store._get_hashstore_cid_refs_path(retrieved_cid) + assert cid_refs_path == obj_info_dict["cid_refs_path"] + + pid_refs_path = store._get_hashstore_pid_refs_path(pid) + assert pid_refs_path == obj_info_dict["pid_refs_path"] + + assert obj_info_dict["sysmeta_path"] == "Does not exist." 
+ + +def test_find_object_sysmeta(pids, store): + """Test _find_object returns the correct content along with the sysmeta path""" + test_dir = "tests/testdata/" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + object_metadata = store.store_object(pid, path) + stored_metadata_path = store.store_metadata(pid, syspath, format_id) + + obj_info_dict = store._find_object(pid) + retrieved_cid = obj_info_dict["cid"] + + assert retrieved_cid == object_metadata.hex_digests.get("sha256") + + data_object_path = store._get_hashstore_data_object_path(retrieved_cid) + assert data_object_path == obj_info_dict["cid_object_path"] + + cid_refs_path = store._get_hashstore_cid_refs_path(retrieved_cid) + assert cid_refs_path == obj_info_dict["cid_refs_path"] + + pid_refs_path = store._get_hashstore_pid_refs_path(pid) + assert pid_refs_path == obj_info_dict["pid_refs_path"] + + assert str(obj_info_dict["sysmeta_path"]) == stored_metadata_path + + +def test_find_object_refs_exist_but_obj_not_found(pids, store): + """Test _find_object throws exception when refs file exist but the object does not.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + store.store_object(pid, path) + + cid = store._find_object(pid).get("cid") + obj_path = store._get_hashstore_data_object_path(cid) + os.remove(obj_path) + + with pytest.raises(RefsFileExistsButCidObjMissing): + store._find_object(pid) + + +def test_find_object_cid_refs_not_found(pids, store): + """Test _find_object throws exception when pid refs file is found (and contains a cid) + but the cid refs file does not exist.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + _object_metadata = store.store_object(pid, path) + + # Place the wrong cid into the pid refs file that 
has already been created + pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid) + with open(pid_ref_abs_path, "w", encoding="utf8") as pid_ref_file: + pid_ref_file.seek(0) + pid_ref_file.write("intentionally.wrong.pid") + pid_ref_file.truncate() + + with pytest.raises(OrphanPidRefsFileFound): + store._find_object(pid) + + +def test_find_object_cid_refs_does_not_contain_pid(pids, store): + """Test _find_object throws exception when pid refs file is found (and contains a cid) + but the cid refs file does not contain the pid.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + + # Remove the pid from the cid refs file + cid_ref_abs_path = store._get_hashstore_cid_refs_path( + object_metadata.hex_digests.get("sha256") + ) + store._update_refs_file(cid_ref_abs_path, pid, "remove") + + with pytest.raises(PidNotFoundInCidRefsFile): + store._find_object(pid) + + +def test_find_object_pid_refs_not_found(store): + """Test _find_object throws exception when a pid refs file does not exist.""" + with pytest.raises(PidRefsDoesNotExist): + store._find_object("dou.test.1") + + +def test_find_object_pid_none(store): + """Test _find_object throws exception when pid is None.""" + with pytest.raises(ValueError): + store._find_object(None) + + +def test_find_object_pid_empty(store): + """Test _find_object throws exception when pid is empty.""" + with pytest.raises(ValueError): + store._find_object("") + + def test_move_and_get_checksums_id(pids, store): """Test _move_and_get_checksums returns correct id.""" test_dir = "tests/testdata/" @@ -1250,99 +1365,6 @@ def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pi store._verify_hashstore_references(pid, cid) -def test_find_object(pids, store): - """Test _find_object returns the correct content.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - 
object_metadata = store.store_object(pid, path) - obj_info_dict = store._find_object(pid) - retrieved_cid = obj_info_dict["cid"] - - assert retrieved_cid == object_metadata.hex_digests.get("sha256") - - data_object_path = store._get_hashstore_data_object_path(retrieved_cid) - assert data_object_path == obj_info_dict["cid_object_path"] - - cid_refs_path = store._get_hashstore_cid_refs_path(retrieved_cid) - assert cid_refs_path == obj_info_dict["cid_refs_path"] - - pid_refs_path = store._get_hashstore_pid_refs_path(pid) - assert pid_refs_path == obj_info_dict["pid_refs_path"] - - assert obj_info_dict["sysmeta_path"] == "Does not exist." - - -def test_find_object_refs_exist_but_obj_not_found(pids, store): - """Test _find_object throws exception when refs file exist but the object does not.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - store.store_object(pid, path) - - cid = store._find_object(pid).get("cid") - obj_path = store._get_hashstore_data_object_path(cid) - os.remove(obj_path) - - with pytest.raises(RefsFileExistsButCidObjMissing): - store._find_object(pid) - - -def test_find_object_cid_refs_not_found(pids, store): - """Test _find_object throws exception when pid refs file is found (and contains a cid) - but the cid refs file does not exist.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - _object_metadata = store.store_object(pid, path) - - # Place the wrong cid into the pid refs file that has already been created - pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid) - with open(pid_ref_abs_path, "w", encoding="utf8") as pid_ref_file: - pid_ref_file.seek(0) - pid_ref_file.write("intentionally.wrong.pid") - pid_ref_file.truncate() - - with pytest.raises(OrphanPidRefsFileFound): - store._find_object(pid) - - -def test_find_object_cid_refs_does_not_contain_pid(pids, store): - """Test _find_object throws exception when pid refs file is found 
(and contains a cid) - but the cid refs file does not contain the pid.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - - # Remove the pid from the cid refs file - cid_ref_abs_path = store._get_hashstore_cid_refs_path( - object_metadata.hex_digests.get("sha256") - ) - store._update_refs_file(cid_ref_abs_path, pid, "remove") - - with pytest.raises(PidNotFoundInCidRefsFile): - store._find_object(pid) - - -def test_find_object_pid_refs_not_found(store): - """Test _find_object throws exception when a pid refs file does not exist.""" - with pytest.raises(PidRefsDoesNotExist): - store._find_object("dou.test.1") - - -def test_find_object_pid_none(store): - """Test _find_object throws exception when pid is None.""" - with pytest.raises(ValueError): - store._find_object(None) - - -def test_find_object_pid_empty(store): - """Test _find_object throws exception when pid is empty.""" - with pytest.raises(ValueError): - store._find_object("") - - def test_clean_algorithm(store): """Check that algorithm values get formatted as expected.""" algorithm_underscore = "sha_256" From 2b0346a6422dbeef82a36b9e45705b813a40328f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 13:31:45 -0700 Subject: [PATCH 381/420] Re-organize 'find_object' pytests and filehashstore code placement --- src/hashstore/filehashstore.py | 104 +++++----- tests/filehashstore/test_filehashstore.py | 241 +++++++++++----------- 2 files changed, 172 insertions(+), 173 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 503e0236..b34a8194 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1129,58 +1129,6 @@ def get_hex_digest(self, pid, algorithm): # FileHashStore Core Methods - def _store_and_validate_data( - self, - pid, - file, - additional_algorithm=None, - checksum=None, - checksum_algorithm=None, - 
file_size_to_validate=None, - ): - """Store contents of `file` on disk, validate the object's parameters if provided, - and tag/reference the object. - - :param str pid: Authority-based identifier. - :param mixed file: Readable object or path to file. - :param str additional_algorithm: Optional algorithm value to include when returning - hex digests. - :param str checksum: Optional checksum to validate object against hex digest before moving - to permanent location. - :param str checksum_algorithm: Algorithm value of the given checksum. - :param int file_size_to_validate: Expected size of the object. - - :return: ObjectMetadata - object that contains the object id, object file size, - and hex digest dictionary. - """ - stream = Stream(file) - - logging.debug( - "FileHashStore - put_object: Request to put object for pid: %s", pid - ) - with closing(stream): - ( - object_cid, - obj_file_size, - hex_digest_dict, - ) = self._move_and_get_checksums( - pid, - stream, - additional_algorithm, - checksum, - checksum_algorithm, - file_size_to_validate, - ) - - object_metadata = ObjectMetadata( - pid, object_cid, obj_file_size, hex_digest_dict - ) - logging.debug( - "FileHashStore - put_object: Successfully put object for pid: %s", - pid, - ) - return object_metadata - def _find_object(self, pid): """Check if an object referenced by a pid exists and retrieve its content identifier. The `find_object` method validates the existence of an object based on the provided @@ -1266,6 +1214,58 @@ def _find_object(self, pid): logging.error(err_msg) raise PidRefsDoesNotExist(err_msg) + def _store_and_validate_data( + self, + pid, + file, + additional_algorithm=None, + checksum=None, + checksum_algorithm=None, + file_size_to_validate=None, + ): + """Store contents of `file` on disk, validate the object's parameters if provided, + and tag/reference the object. + + :param str pid: Authority-based identifier. + :param mixed file: Readable object or path to file. 
+ :param str additional_algorithm: Optional algorithm value to include when returning + hex digests. + :param str checksum: Optional checksum to validate object against hex digest before moving + to permanent location. + :param str checksum_algorithm: Algorithm value of the given checksum. + :param int file_size_to_validate: Expected size of the object. + + :return: ObjectMetadata - object that contains the object id, object file size, + and hex digest dictionary. + """ + stream = Stream(file) + + logging.debug( + "FileHashStore - put_object: Request to put object for pid: %s", pid + ) + with closing(stream): + ( + object_cid, + obj_file_size, + hex_digest_dict, + ) = self._move_and_get_checksums( + pid, + stream, + additional_algorithm, + checksum, + checksum_algorithm, + file_size_to_validate, + ) + + object_metadata = ObjectMetadata( + pid, object_cid, obj_file_size, hex_digest_dict + ) + logging.debug( + "FileHashStore - put_object: Successfully put object for pid: %s", + pid, + ) + return object_metadata + def _store_data_only(self, data): """Store an object to HashStore and return a metadata object containing the content identifier, object file size and hex digests dictionary of the default algorithms. 
This diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index 3bcf5a70..ed77839b 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -231,6 +231,126 @@ def test_set_default_algorithms_missing_yaml(store, pids): # Tests for FileHashStore Core Methods +def test_find_object_no_sysmeta(pids, store): + """Test _find_object returns the correct content and expected value for non-existent sysmeta.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + obj_info_dict = store._find_object(pid) + retrieved_cid = obj_info_dict["cid"] + + assert retrieved_cid == object_metadata.hex_digests.get("sha256") + + data_object_path = store._get_hashstore_data_object_path(retrieved_cid) + assert data_object_path == obj_info_dict["cid_object_path"] + + cid_refs_path = store._get_hashstore_cid_refs_path(retrieved_cid) + assert cid_refs_path == obj_info_dict["cid_refs_path"] + + pid_refs_path = store._get_hashstore_pid_refs_path(pid) + assert pid_refs_path == obj_info_dict["pid_refs_path"] + + assert obj_info_dict["sysmeta_path"] == "Does not exist." 
+ + +def test_find_object_sysmeta(pids, store): + """Test _find_object returns the correct content along with the sysmeta path""" + test_dir = "tests/testdata/" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + object_metadata = store.store_object(pid, path) + stored_metadata_path = store.store_metadata(pid, syspath, format_id) + + obj_info_dict = store._find_object(pid) + retrieved_cid = obj_info_dict["cid"] + + assert retrieved_cid == object_metadata.hex_digests.get("sha256") + + data_object_path = store._get_hashstore_data_object_path(retrieved_cid) + assert data_object_path == obj_info_dict["cid_object_path"] + + cid_refs_path = store._get_hashstore_cid_refs_path(retrieved_cid) + assert cid_refs_path == obj_info_dict["cid_refs_path"] + + pid_refs_path = store._get_hashstore_pid_refs_path(pid) + assert pid_refs_path == obj_info_dict["pid_refs_path"] + + assert str(obj_info_dict["sysmeta_path"]) == stored_metadata_path + + +def test_find_object_refs_exist_but_obj_not_found(pids, store): + """Test _find_object throws exception when refs file exist but the object does not.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + store.store_object(pid, path) + + cid = store._find_object(pid).get("cid") + obj_path = store._get_hashstore_data_object_path(cid) + os.remove(obj_path) + + with pytest.raises(RefsFileExistsButCidObjMissing): + store._find_object(pid) + + +def test_find_object_cid_refs_not_found(pids, store): + """Test _find_object throws exception when pid refs file is found (and contains a cid) + but the cid refs file does not exist.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + _object_metadata = store.store_object(pid, path) + + # Place the wrong cid into the pid refs file that 
has already been created + pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid) + with open(pid_ref_abs_path, "w", encoding="utf8") as pid_ref_file: + pid_ref_file.seek(0) + pid_ref_file.write("intentionally.wrong.pid") + pid_ref_file.truncate() + + with pytest.raises(OrphanPidRefsFileFound): + store._find_object(pid) + + +def test_find_object_cid_refs_does_not_contain_pid(pids, store): + """Test _find_object throws exception when pid refs file is found (and contains a cid) + but the cid refs file does not contain the pid.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + + # Remove the pid from the cid refs file + cid_ref_abs_path = store._get_hashstore_cid_refs_path( + object_metadata.hex_digests.get("sha256") + ) + store._update_refs_file(cid_ref_abs_path, pid, "remove") + + with pytest.raises(PidNotFoundInCidRefsFile): + store._find_object(pid) + + +def test_find_object_pid_refs_not_found(store): + """Test _find_object throws exception when a pid refs file does not exist.""" + with pytest.raises(PidRefsDoesNotExist): + store._find_object("dou.test.1") + + +def test_find_object_pid_none(store): + """Test _find_object throws exception when pid is None.""" + with pytest.raises(ValueError): + store._find_object(None) + + +def test_find_object_pid_empty(store): + """Test _find_object throws exception when pid is empty.""" + with pytest.raises(ValueError): + store._find_object("") + def test_store_and_validate_data_files_path(pids, store): """Test _store_and_validate_data accepts path object for the path arg.""" test_dir = "tests/testdata/" @@ -367,127 +487,6 @@ def test_store_data_only_hex_digests(pids, store): assert object_metadata.hex_digests.get("sha512") == pids[pid]["sha512"] -def test_find_object_no_sysmeta(pids, store): - """Test _find_object returns the correct content and expected value for non-existent sysmeta.""" - test_dir = "tests/testdata/" - 
for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - obj_info_dict = store._find_object(pid) - retrieved_cid = obj_info_dict["cid"] - - assert retrieved_cid == object_metadata.hex_digests.get("sha256") - - data_object_path = store._get_hashstore_data_object_path(retrieved_cid) - assert data_object_path == obj_info_dict["cid_object_path"] - - cid_refs_path = store._get_hashstore_cid_refs_path(retrieved_cid) - assert cid_refs_path == obj_info_dict["cid_refs_path"] - - pid_refs_path = store._get_hashstore_pid_refs_path(pid) - assert pid_refs_path == obj_info_dict["pid_refs_path"] - - assert obj_info_dict["sysmeta_path"] == "Does not exist." - - -def test_find_object_sysmeta(pids, store): - """Test _find_object returns the correct content along with the sysmeta path""" - test_dir = "tests/testdata/" - format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename - object_metadata = store.store_object(pid, path) - stored_metadata_path = store.store_metadata(pid, syspath, format_id) - - obj_info_dict = store._find_object(pid) - retrieved_cid = obj_info_dict["cid"] - - assert retrieved_cid == object_metadata.hex_digests.get("sha256") - - data_object_path = store._get_hashstore_data_object_path(retrieved_cid) - assert data_object_path == obj_info_dict["cid_object_path"] - - cid_refs_path = store._get_hashstore_cid_refs_path(retrieved_cid) - assert cid_refs_path == obj_info_dict["cid_refs_path"] - - pid_refs_path = store._get_hashstore_pid_refs_path(pid) - assert pid_refs_path == obj_info_dict["pid_refs_path"] - - assert str(obj_info_dict["sysmeta_path"]) == stored_metadata_path - - -def test_find_object_refs_exist_but_obj_not_found(pids, store): - """Test _find_object throws exception when refs file exist but the object does not.""" - test_dir = 
"tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - store.store_object(pid, path) - - cid = store._find_object(pid).get("cid") - obj_path = store._get_hashstore_data_object_path(cid) - os.remove(obj_path) - - with pytest.raises(RefsFileExistsButCidObjMissing): - store._find_object(pid) - - -def test_find_object_cid_refs_not_found(pids, store): - """Test _find_object throws exception when pid refs file is found (and contains a cid) - but the cid refs file does not exist.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - _object_metadata = store.store_object(pid, path) - - # Place the wrong cid into the pid refs file that has already been created - pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid) - with open(pid_ref_abs_path, "w", encoding="utf8") as pid_ref_file: - pid_ref_file.seek(0) - pid_ref_file.write("intentionally.wrong.pid") - pid_ref_file.truncate() - - with pytest.raises(OrphanPidRefsFileFound): - store._find_object(pid) - - -def test_find_object_cid_refs_does_not_contain_pid(pids, store): - """Test _find_object throws exception when pid refs file is found (and contains a cid) - but the cid refs file does not contain the pid.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - - # Remove the pid from the cid refs file - cid_ref_abs_path = store._get_hashstore_cid_refs_path( - object_metadata.hex_digests.get("sha256") - ) - store._update_refs_file(cid_ref_abs_path, pid, "remove") - - with pytest.raises(PidNotFoundInCidRefsFile): - store._find_object(pid) - - -def test_find_object_pid_refs_not_found(store): - """Test _find_object throws exception when a pid refs file does not exist.""" - with pytest.raises(PidRefsDoesNotExist): - store._find_object("dou.test.1") - - -def test_find_object_pid_none(store): - """Test _find_object throws exception when pid is 
None.""" - with pytest.raises(ValueError): - store._find_object(None) - - -def test_find_object_pid_empty(store): - """Test _find_object throws exception when pid is empty.""" - with pytest.raises(ValueError): - store._find_object("") - - def test_move_and_get_checksums_id(pids, store): """Test _move_and_get_checksums returns correct id.""" test_dir = "tests/testdata/" From cdf377b356fc894fe00a49772c0a7b7c44eb29a7 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 13:49:08 -0700 Subject: [PATCH 382/420] Cleanup 'filehashstore' pytests module part 1 via reorganization and deleting redundant comments and code --- src/hashstore/filehashstore.py | 7 +- tests/filehashstore/test_filehashstore.py | 403 +++++++++++----------- 2 files changed, 196 insertions(+), 214 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index b34a8194..4cfe9ee0 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1327,12 +1327,11 @@ def _move_and_get_checksums( checksum_algorithm=None, file_size_to_validate=None, ): - """Copy the contents of `stream` onto disk with an optional file - extension appended. The copy process uses a temporary file to store the - initial contents and returns a dictionary of algorithms and their + """Copy the contents of the `Stream` object onto disk. The copy process uses a temporary + file to store the initial contents and returns a dictionary of algorithms and their hex digest values. If the file already exists, the method will immediately raise an exception. If an algorithm and checksum are provided, it will proceed to - validate the object (and delete the tmpFile if the hex digest stored does + validate the object (and delete the temporary file created if the hex digest stored does not match what is provided). :param Optional[str] pid: Authority-based identifier. 
diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index ed77839b..a457d01e 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -351,6 +351,7 @@ def test_find_object_pid_empty(store): with pytest.raises(ValueError): store._find_object("") + def test_store_and_validate_data_files_path(pids, store): """Test _store_and_validate_data accepts path object for the path arg.""" test_dir = "tests/testdata/" @@ -493,7 +494,6 @@ def test_move_and_get_checksums_id(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - # pylint: disable=W0212 ( move_id, _, @@ -509,7 +509,6 @@ def test_move_and_get_checksums_file_size(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - # pylint: disable=W0212 ( _, tmp_file_size, @@ -525,7 +524,6 @@ def test_move_and_get_checksums_hex_digests(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - # pylint: disable=W0212 ( _, _, @@ -542,26 +540,22 @@ def test_move_and_get_checksums_hex_digests(pids, store): def test_move_and_get_checksums_does_not_store_duplicate(pids, store): """Test _move_and_get_checksums does not store duplicate objects.""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - # pylint: disable=W0212 store._move_and_get_checksums(pid, input_stream) input_stream.close() for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - # pylint: disable=W0212 store._move_and_get_checksums(pid, input_stream) input_stream.close() - assert store._count(entity) == 3 + assert store._count("objects") == 3 def test_move_and_get_checksums_raises_error_with_nonmatching_checksum(pids, store): """Test _move_and_get_checksums 
raises error when incorrect checksum supplied.""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") @@ -574,18 +568,17 @@ def test_move_and_get_checksums_raises_error_with_nonmatching_checksum(pids, sto checksum_algorithm="sha256", ) input_stream.close() - assert store._count(entity) == 0 + assert store._count("objects") == 0 def test_move_and_get_checksums_incorrect_file_size(pids, store): - """Test move and get checksum raises error with an incorrect file size.""" + """Test _move_and_get_checksums raises error with an incorrect file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): with pytest.raises(NonMatchingObjSize): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") incorrect_file_size = 1000 - # pylint: disable=W0212 (_, _, _, _,) = store._move_and_get_checksums( pid, input_stream, file_size_to_validate=incorrect_file_size ) @@ -602,7 +595,6 @@ def test_write_to_tmp_file_and_get_hex_digests_additional_algo(store): checksum_correct = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) - # pylint: disable=W0212 hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests( input_stream, additional_algorithm=checksum_algo ) @@ -621,7 +613,6 @@ def test_write_to_tmp_file_and_get_hex_digests_checksum_algo(store): checksum_correct = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) - # pylint: disable=W0212 hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests( input_stream, checksum_algorithm=checksum_algo ) @@ -644,7 +635,6 @@ def test_write_to_tmp_file_and_get_hex_digests_checksum_and_additional_algo(stor checksum_correct = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) - # pylint: disable=W0212 hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests( input_stream, additional_algorithm=additional_algo, @@ -666,7 +656,6 @@ def 
test_write_to_tmp_file_and_get_hex_digests_checksum_and_additional_algo_dupl additional_algo = "sha224" checksum_algo = "sha224" checksum_correct = "9b3a96f434f3c894359193a63437ef86fbd5a1a1a6cc37f1d5013ac1" - # pylint: disable=W0212 hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests( input_stream, additional_algorithm=additional_algo, @@ -682,7 +671,6 @@ def test_write_to_tmp_file_and_get_hex_digests_file_size(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - # pylint: disable=W0212 _, _, tmp_file_size = store._write_to_tmp_file_and_get_hex_digests(input_stream) input_stream.close() assert tmp_file_size == pids[pid]["file_size_bytes"] @@ -694,7 +682,6 @@ def test_write_to_tmp_file_and_get_hex_digests_hex_digests(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - # pylint: disable=W0212 hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests(input_stream) input_stream.close() assert hex_digests.get("md5") == pids[pid]["md5"] @@ -710,7 +697,6 @@ def test_write_to_tmp_file_and_get_hex_digests_tmpfile_object(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - # pylint: disable=W0212 _, tmp_file_name, _ = store._write_to_tmp_file_and_get_hex_digests(input_stream) input_stream.close() assert os.path.isfile(tmp_file_name) is True @@ -724,12 +710,10 @@ def test_write_to_tmp_file_and_get_hex_digests_with_unsupported_algorithm(pids, input_stream = io.open(path, "rb") algo = "md2" with pytest.raises(UnsupportedAlgorithm): - # pylint: disable=W0212 _, _, _ = store._write_to_tmp_file_and_get_hex_digests( input_stream, additional_algorithm=algo ) with pytest.raises(UnsupportedAlgorithm): - # pylint: disable=W0212 _, _, _ = store._write_to_tmp_file_and_get_hex_digests( input_stream, checksum_algorithm=algo ) @@ -740,7 +724,6 @@ def test_mktmpfile(store): """Test that 
_mktmpfile creates and returns a tmp file.""" path = store.root + "/doutest/tmp/" store._create_path(path) - # pylint: disable=W0212 tmp = store._mktmpfile(path) assert os.path.exists(tmp.name) @@ -849,6 +832,195 @@ def test_store_hashstore_refs_files_refs_not_found_cid_refs_found(store): assert store._count("cid") == 1 +def test_untag_object(pids, store): + """Test _untag_object untags successfully.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = Path(test_dir + pid.replace("/", "_")) + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + store._synchronize_referenced_locked_pids(pid) + store._synchronize_object_locked_cids(cid) + store._untag_object(pid, cid) + store._release_reference_locked_pids(pid) + store._release_object_locked_cids(cid) + + assert store._count("pid") == 0 + assert store._count("cid") == 0 + assert store._count("objects") == 3 + + +def test_untag_object_pid_not_locked(pids, store): + """Test _untag_object throws exception when pid is not locked""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = Path(test_dir + pid.replace("/", "_")) + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + with pytest.raises(IdentifierNotLocked): + store._untag_object(pid, cid) + + +def test_untag_object_cid_not_locked(pids, store): + """Test _untag_object throws exception with cid is not locked""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = Path(test_dir + pid.replace("/", "_")) + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + with pytest.raises(IdentifierNotLocked): + store._synchronize_referenced_locked_pids(pid) + store._untag_object(pid, cid) + store._release_reference_locked_pids(pid) + + +def test_untag_object_orphan_pid_refs_file_found(store): + """Test _untag_object removes an orphan pid refs file""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + object_metadata = 
store.store_object(pid, path) + cid = object_metadata.cid + + # Remove cid refs file + cid_refs_abs_path = store._get_hashstore_cid_refs_path(cid) + os.remove(cid_refs_abs_path) + + with pytest.raises(OrphanPidRefsFileFound): + store._find_object(pid) + + store._synchronize_referenced_locked_pids(pid) + store._synchronize_object_locked_cids(cid) + store._untag_object(pid, cid) + store._release_reference_locked_pids(pid) + store._release_object_locked_cids(cid) + + assert store._count("pid") == 0 + + +def test_untag_object_orphan_refs_exist_but_data_object_not_found(store): + """Test _untag_object removes orphaned pid and cid refs files""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + assert store._count("pid") == 1 + assert store._count("cid") == 1 + + # Remove cid refs file + data_obj_path = store._get_hashstore_data_object_path(cid) + os.remove(data_obj_path) + + with pytest.raises(RefsFileExistsButCidObjMissing): + store._find_object(pid) + + store._synchronize_referenced_locked_pids(pid) + store._synchronize_object_locked_cids(cid) + store._untag_object(pid, cid) + store._release_reference_locked_pids(pid) + store._release_object_locked_cids(cid) + + assert store._count("pid") == 0 + assert store._count("cid") == 0 + + +def test_untag_object_refs_found_but_pid_not_in_cid_refs(store): + """Test _untag_object removes pid refs file whose pid is not found in the cid refs file.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + pid_two = pid + ".dou" + path = test_dir + pid + object_metadata = store.store_object(pid, path) + _object_metadata_two = store.store_object(pid_two, path) + cid = object_metadata.cid + + assert store._count("pid") == 2 + assert store._count("cid") == 1 + + # Remove pid from cid refs + cid_refs_file = store._get_hashstore_cid_refs_path(cid) + # First remove the pid + store._update_refs_file(cid_refs_file, pid, "remove") + + 
with pytest.raises(PidNotFoundInCidRefsFile): + store._find_object(pid) + + store._synchronize_referenced_locked_pids(pid) + store._synchronize_object_locked_cids(cid) + store._untag_object(pid, cid) + store._release_reference_locked_pids(pid) + store._release_object_locked_cids(cid) + + assert store._count("pid") == 1 + assert store._count("cid") == 1 + + +def test_untag_object_pid_refs_file_does_not_exist(store): + """Test _untag_object removes pid from cid refs file since the pid refs file does not exist, + and does not delete the cid refs file because a reference is still present.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + pid_two = pid + ".dou" + path = test_dir + pid + object_metadata = store.store_object(pid, path) + _object_metadata_two = store.store_object(pid_two, path) + cid = object_metadata.cid + + assert store._count("pid") == 2 + assert store._count("cid") == 1 + + # Remove pid from cid refs + pid_refs_file = store._get_hashstore_pid_refs_path(pid) + os.remove(pid_refs_file) + + with pytest.raises(PidRefsDoesNotExist): + store._find_object(pid) + + store._synchronize_referenced_locked_pids(pid) + store._synchronize_object_locked_cids(cid) + store._untag_object(pid, cid) + store._release_reference_locked_pids(pid) + store._release_object_locked_cids(cid) + + assert store._count("pid") == 1 + assert store._count("cid") == 1 + + +def test_untag_object_pid_refs_file_does_not_exist_and_cid_refs_is_empty(store): + """Test '_untag_object' removes pid from cid refs file since the pid refs file does not exist, + and deletes the cid refs file because it contains no more references (after the pid called + with '_untag_object' is removed from the cid refs).""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + assert store._count("pid") == 1 + assert store._count("cid") == 1 + + # Remove pid from cid refs + pid_refs_file = 
store._get_hashstore_pid_refs_path(pid) + os.remove(pid_refs_file) + + with pytest.raises(PidRefsDoesNotExist): + store._find_object(pid) + + store._synchronize_referenced_locked_pids(pid) + store._synchronize_object_locked_cids(cid) + store._untag_object(pid, cid) + store._release_reference_locked_pids(pid) + store._release_object_locked_cids(cid) + + assert store._count("pid") == 0 + assert store._count("cid") == 0 + + def test_delete_marked_files(store): """Test that _delete_marked_files removes all items from a given list""" pid = "jtao.1700.1" @@ -1527,195 +1699,6 @@ def test_delete_with_object_metadata_id(pids, store): assert store._count(entity) == 0 -def test_untag_object(pids, store): - """Test _untag_object untags successfully.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = Path(test_dir + pid.replace("/", "_")) - object_metadata = store.store_object(pid, path) - cid = object_metadata.cid - - store._synchronize_referenced_locked_pids(pid) - store._synchronize_object_locked_cids(cid) - store._untag_object(pid, cid) - store._release_reference_locked_pids(pid) - store._release_object_locked_cids(cid) - - assert store._count("pid") == 0 - assert store._count("cid") == 0 - assert store._count("objects") == 3 - - -def test_untag_object_pid_not_locked(pids, store): - """Test _untag_object throws exception when pid is not locked""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = Path(test_dir + pid.replace("/", "_")) - object_metadata = store.store_object(pid, path) - cid = object_metadata.cid - - with pytest.raises(IdentifierNotLocked): - store._untag_object(pid, cid) - - -def test_untag_object_cid_not_locked(pids, store): - """Test _untag_object throws exception with cid is not locked""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = Path(test_dir + pid.replace("/", "_")) - object_metadata = store.store_object(pid, path) - cid = object_metadata.cid - - with pytest.raises(IdentifierNotLocked): - 
store._synchronize_referenced_locked_pids(pid) - store._untag_object(pid, cid) - store._release_reference_locked_pids(pid) - - -def test_untag_object_orphan_pid_refs_file_found(store): - """Test _untag_object removes an orphan pid refs file""" - test_dir = "tests/testdata/" - pid = "jtao.1700.1" - path = test_dir + pid - object_metadata = store.store_object(pid, path) - cid = object_metadata.cid - - # Remove cid refs file - cid_refs_abs_path = store._get_hashstore_cid_refs_path(cid) - os.remove(cid_refs_abs_path) - - with pytest.raises(OrphanPidRefsFileFound): - store._find_object(pid) - - store._synchronize_referenced_locked_pids(pid) - store._synchronize_object_locked_cids(cid) - store._untag_object(pid, cid) - store._release_reference_locked_pids(pid) - store._release_object_locked_cids(cid) - - assert store._count("pid") == 0 - - -def test_untag_object_orphan_refs_exist_but_data_object_not_found(store): - """Test _untag_object removes orphaned pid and cid refs files""" - test_dir = "tests/testdata/" - pid = "jtao.1700.1" - path = test_dir + pid - object_metadata = store.store_object(pid, path) - cid = object_metadata.cid - - assert store._count("pid") == 1 - assert store._count("cid") == 1 - - # Remove cid refs file - data_obj_path = store._get_hashstore_data_object_path(cid) - os.remove(data_obj_path) - - with pytest.raises(RefsFileExistsButCidObjMissing): - store._find_object(pid) - - store._synchronize_referenced_locked_pids(pid) - store._synchronize_object_locked_cids(cid) - store._untag_object(pid, cid) - store._release_reference_locked_pids(pid) - store._release_object_locked_cids(cid) - - assert store._count("pid") == 0 - assert store._count("cid") == 0 - - -def test_untag_object_refs_found_but_pid_not_in_cid_refs(store): - """Test _untag_object removes pid refs file whose pid is not found in the cid refs file.""" - test_dir = "tests/testdata/" - pid = "jtao.1700.1" - pid_two = pid + ".dou" - path = test_dir + pid - object_metadata = 
store.store_object(pid, path) - _object_metadata_two = store.store_object(pid_two, path) - cid = object_metadata.cid - - assert store._count("pid") == 2 - assert store._count("cid") == 1 - - # Remove pid from cid refs - cid_refs_file = store._get_hashstore_cid_refs_path(cid) - # First remove the pid - store._update_refs_file(cid_refs_file, pid, "remove") - - with pytest.raises(PidNotFoundInCidRefsFile): - store._find_object(pid) - - store._synchronize_referenced_locked_pids(pid) - store._synchronize_object_locked_cids(cid) - store._untag_object(pid, cid) - store._release_reference_locked_pids(pid) - store._release_object_locked_cids(cid) - - assert store._count("pid") == 1 - assert store._count("cid") == 1 - - -def test_untag_object_pid_refs_file_does_not_exist(store): - """Test _untag_object removes pid from cid refs file since the pid refs file does not exist, - and does not delete the cid refs file because a reference is still present.""" - test_dir = "tests/testdata/" - pid = "jtao.1700.1" - pid_two = pid + ".dou" - path = test_dir + pid - object_metadata = store.store_object(pid, path) - _object_metadata_two = store.store_object(pid_two, path) - cid = object_metadata.cid - - assert store._count("pid") == 2 - assert store._count("cid") == 1 - - # Remove pid from cid refs - pid_refs_file = store._get_hashstore_pid_refs_path(pid) - os.remove(pid_refs_file) - - with pytest.raises(PidRefsDoesNotExist): - store._find_object(pid) - - store._synchronize_referenced_locked_pids(pid) - store._synchronize_object_locked_cids(cid) - store._untag_object(pid, cid) - store._release_reference_locked_pids(pid) - store._release_object_locked_cids(cid) - - assert store._count("pid") == 1 - assert store._count("cid") == 1 - - -def test_untag_object_pid_refs_file_does_not_exist_and_cid_refs_is_empty(store): - """Test '_untag_object' removes pid from cid refs file since the pid refs file does not exist, - and deletes the cid refs file because it contains no more references (after the 
pid called - with '_untag_object' is removed from the cid refs).""" - test_dir = "tests/testdata/" - pid = "jtao.1700.1" - path = test_dir + pid - object_metadata = store.store_object(pid, path) - cid = object_metadata.cid - - assert store._count("pid") == 1 - assert store._count("cid") == 1 - - # Remove pid from cid refs - pid_refs_file = store._get_hashstore_pid_refs_path(pid) - os.remove(pid_refs_file) - - with pytest.raises(PidRefsDoesNotExist): - store._find_object(pid) - - store._synchronize_referenced_locked_pids(pid) - store._synchronize_object_locked_cids(cid) - store._untag_object(pid, cid) - store._release_reference_locked_pids(pid) - store._release_object_locked_cids(cid) - - assert store._count("pid") == 0 - assert store._count("cid") == 0 - - def test_create_path(pids, store): """Test makepath creates folder successfully.""" for pid in pids: From b9d216cb735c1a7b75441db52e41f3ef4a2938ca Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 13:54:07 -0700 Subject: [PATCH 383/420] Cleanup remaining 'filehashstore' pytests module for core methods --- tests/filehashstore/test_filehashstore.py | 128 +++++++++++----------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index a457d01e..304f3d06 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -1021,6 +1021,64 @@ def test_untag_object_pid_refs_file_does_not_exist_and_cid_refs_is_empty(store): assert store._count("cid") == 0 +def test_put_metadata_with_path(pids, store): + """Test _put_metadata with path object for the path arg.""" + entity = "metadata" + test_dir = "tests/testdata/" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" + for pid in pids.keys(): + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + metadata_stored_path = store._put_metadata(syspath, pid, format_id) + assert 
store._exists(entity, metadata_stored_path) + assert store._count(entity) == 3 + + +def test_put_metadata_with_string(pids, store): + """Test_put metadata with string for the path arg.""" + entity = "metadata" + test_dir = "tests/testdata/" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" + for pid in pids.keys(): + filename = pid.replace("/", "_") + ".xml" + syspath = str(Path(test_dir) / filename) + metadata_stored_path = store._put_metadata(syspath, pid, format_id) + assert store._exists(entity, metadata_stored_path) + assert store._count(entity) == 3 + + +def test_put_metadata_stored_path(pids, store): + """Test put metadata returns correct path to the metadata stored.""" + test_dir = "tests/testdata/" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" + for pid in pids.keys(): + metadata_document_name = store._computehash(pid + format_id) + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + metadata_stored_path = store._put_metadata(syspath, pid, metadata_document_name) + + # Manually calculate expected path + metadata_directory = store._computehash(pid) + rel_path = "/".join(store._shard(metadata_directory)) + full_path = ( + store._get_store_path("metadata") / rel_path / metadata_document_name + ) + assert metadata_stored_path == full_path + + +def test_mktmpmetadata(pids, store): + """Test mktmpmetadata creates tmpFile.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + sys_stream = io.open(syspath, "rb") + # pylint: disable=W0212 + tmp_name = store._mktmpmetadata(sys_stream) + sys_stream.close() + assert os.path.exists(tmp_name) + + def test_delete_marked_files(store): """Test that _delete_marked_files removes all items from a given list""" pid = "jtao.1700.1" @@ -1251,64 +1309,6 @@ def test_update_refs_file_empty_file(pids, store): assert os.path.getsize(tmp_cid_refs_file) == 0 -def 
test_put_metadata_with_path(pids, store): - """Test _put_metadata with path object for the path arg.""" - entity = "metadata" - test_dir = "tests/testdata/" - format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" - for pid in pids.keys(): - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename - metadata_cid = store._put_metadata(syspath, pid, format_id) - assert store._exists(entity, metadata_cid) - assert store._count(entity) == 3 - - -def test_put_metadata_with_string(pids, store): - """Test_put metadata with string for the path arg.""" - entity = "metadata" - test_dir = "tests/testdata/" - format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" - for pid in pids.keys(): - filename = pid.replace("/", "_") + ".xml" - syspath = str(Path(test_dir) / filename) - metadata_cid = store._put_metadata(syspath, pid, format_id) - assert store._exists(entity, metadata_cid) - assert store._count(entity) == 3 - - -def test_put_metadata_cid(pids, store): - """Test put metadata returns correct id.""" - test_dir = "tests/testdata/" - format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" - for pid in pids.keys(): - metadata_document_name = store._computehash(pid + format_id) - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename - metadata_cid = store._put_metadata(syspath, pid, metadata_document_name) - - # Manually calculate expected path - metadata_directory = store._computehash(pid) - rel_path = "/".join(store._shard(metadata_directory)) - full_path = ( - store._get_store_path("metadata") / rel_path / metadata_document_name - ) - assert metadata_cid == full_path - - -def test_mktmpmetadata(pids, store): - """Test mktmpmetadata creates tmpFile.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename - sys_stream = io.open(syspath, "rb") - # pylint: disable=W0212 - tmp_name = 
store._mktmpmetadata(sys_stream) - sys_stream.close() - assert os.path.exists(tmp_name) - - # Tests for FileHashStore Utility & Supporting Methods @@ -1621,8 +1621,8 @@ def test_exists_metadata_files_path(pids, store): for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - metadata_cid = store.store_metadata(pid, syspath, format_id) - assert store._exists(entity, metadata_cid) + metadata_stored_path = store.store_metadata(pid, syspath, format_id) + assert store._exists(entity, metadata_stored_path) def test_exists_object_with_nonexistent_file(store): @@ -1680,7 +1680,7 @@ def test_delete_object_only_cid_refs_file_exists(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) + _metadata_stored_path = store.store_metadata(pid, syspath, format_id) store._delete_object_only(object_metadata.cid) assert store._count(entity) == 3 assert store._count("pid") == 3 @@ -1745,8 +1745,8 @@ def test_get_real_path_with_metadata_id(store, pids): for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - metadata_cid = store.store_metadata(pid, syspath, format_id) - metadata_abs_path = store._get_hashstore_metadata_path(metadata_cid) + metadata_stored_path = store.store_metadata(pid, syspath, format_id) + metadata_abs_path = store._get_hashstore_metadata_path(metadata_stored_path) assert os.path.exists(metadata_abs_path) @@ -1800,7 +1800,7 @@ def test_get_hashstore_metadata_path_metadata(pids, store): for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _metadata_cid = store.store_metadata(pid, syspath, format_id) + _metadata_stored_path = store.store_metadata(pid, syspath, format_id) metadata_directory = store._computehash(pid) metadata_document_name = store._computehash(pid + format_id) From 
15825050ef06fe3262432f182e7aaf151e17ad8e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 14:08:31 -0700 Subject: [PATCH 384/420] Cleanuo 'filehashstore' utility and supporting methods pytests part 1 and add new pytest for '_is_string_in_refs_file' --- tests/filehashstore/test_filehashstore.py | 30 +++++++++++++++++------ 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index 304f3d06..2d34ccf0 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -1079,6 +1079,9 @@ def test_mktmpmetadata(pids, store): assert os.path.exists(tmp_name) +# Tests for FileHashStore Utility & Supporting Methods + + def test_delete_marked_files(store): """Test that _delete_marked_files removes all items from a given list""" pid = "jtao.1700.1" @@ -1183,14 +1186,14 @@ def test_validate_and_check_cid_lock_identifier_not_locked(store): def test_write_refs_file_ref_type_cid(store): - """Test that write_refs_file writes a reference file.""" + """Test that write_refs_file writes a reference file when given a 'cid' update_type.""" tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_refs_file(tmp_root_path, "test_pid", "cid") assert os.path.exists(tmp_cid_refs_file) -def test_write_refs_file_ref_type_cid_content(pids, store): - """Test that write_refs_file writes the expected content.""" +def test_write_refs_file_ref_type_content_cid(pids, store): + """Test that write_refs_file writes the expected content when given a 'cid' update_type.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") @@ -1201,7 +1204,7 @@ def test_write_refs_file_ref_type_cid_content(pids, store): def test_write_refs_file_ref_type_pid(pids, store): - """Test that write_pid_refs_file writes a reference file.""" + """Test that 
write_pid_refs_file writes a reference file when given a 'pid' update_type.""" for pid in pids.keys(): cid = pids[pid]["sha256"] tmp_root_path = store._get_store_path("refs") / "tmp" @@ -1210,7 +1213,7 @@ def test_write_refs_file_ref_type_pid(pids, store): def test_write_refs_file_ref_type_content_pid(pids, store): - """Test that write_pid_refs_file writes the expected content.""" + """Test that write_refs_file writes the expected content when given a 'pid' update_type""" for pid in pids.keys(): cid = pids[pid]["sha256"] tmp_root_path = store._get_store_path("refs") / "tmp" @@ -1256,8 +1259,8 @@ def test_update_refs_file_content_multiple(pids, store): assert line_count == 6 -def test_update_refs_file_content_pid_exists(pids, store): - """Test that _update_refs_file does add a pid to a refs file that already +def test_update_refs_file_deduplicates_pid_already_found(pids, store): + """Test that _update_refs_file does not add a pid to a refs file that already contains the pid.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" @@ -1309,7 +1312,18 @@ def test_update_refs_file_empty_file(pids, store): assert os.path.getsize(tmp_cid_refs_file) == 0 -# Tests for FileHashStore Utility & Supporting Methods +def test_is_string_in_refs_file(pids, store): + """Test that _update_refs_file leaves a file empty when removing the last pid.""" + for pid in pids.keys(): + tmp_root_path = store._get_store_path("refs") / "tmp" + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") + + cid_reference_list = [pid] + for i in range(0, 5): + store._update_refs_file(tmp_cid_refs_file, f"dou.test.{i}", "add") + cid_reference_list.append(f"dou.test.{i}") + + assert store._is_string_in_refs_file("dou.test.2", tmp_cid_refs_file) is True def test_verify_object_information(pids, store): From d58d5e85b7bc1efcf1411247d6bbc9dd72c0df5f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 23 Sep 2024 16:07:57 -0700 Subject: [PATCH 385/420] Revise exception 
messaging in '_verify_object_information', review pytests and add new pytest --- src/hashstore/filehashstore.py | 5 ++-- tests/conftest.py | 3 +++ tests/filehashstore/test_filehashstore.py | 31 ++++++++++++++++++----- 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 4cfe9ee0..c66aa686 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2169,8 +2169,9 @@ def _verify_object_information( exception_string = ( "FileHashStore - _verify_object_information: checksum_algorithm" + f" ({checksum_algorithm}) cannot be found in the default hex digests" - + " dict, but is supported. New checksum calculated but does not match" - + " what has been provided." + + f" dict, but is supported. New checksum calculated: " + f"{hex_digest_calculated}, does not match what has been provided: " + + checksum ) logging.debug(exception_string) raise NonMatchingChecksum(exception_string) diff --git a/tests/conftest.py b/tests/conftest.py index 27b1c8fa..e10a83e0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -55,6 +55,7 @@ def init_pids(): "sha256": "4d198171eef969d553d4c9537b1811a7b078f9a3804fc978a761bc014c05972c", "sha384": "d5953bd802fa74edea72eb941ead7a27639e62792fedc065d6c81de6c613b5b8739ab1f90e7f24a7500d154a727ed7c2", "sha512": "e9bcd6b91b102ef5803d1bd60c7a5d2dbec1a2baf5f62f7da60de07607ad6797d6a9b740d97a257fd2774f2c26503d455d8f2a03a128773477dfa96ab96a2e54", + "blake2s": "5895fa29c17f8768d613984bb86791e5fcade7643c15e84663c03be89205d81e", }, "jtao.1700.1": { "file_size_bytes": 8724, @@ -65,6 +66,7 @@ def init_pids(): "sha256": "94f9b6c88f1f458e410c30c351c6384ea42ac1b5ee1f8430d3e365e43b78a38a", "sha384": "a204678330fcdc04980c9327d4e5daf01ab7541e8a351d49a7e9c5005439dce749ada39c4c35f573dd7d307cca11bea8", "sha512": "bf9e7f4d4e66bd082817d87659d1d57c2220c376cd032ed97cadd481cf40d78dd479cbed14d34d98bae8cebc603b40c633d088751f07155a94468aa59e2ad109", + "blake2s": 
"8978c46ee4cc5d1d79698752fd663c60c817d58d6aea901843bf4fc2cb173bef", }, "urn:uuid:1b35d0a5-b17a-423b-a2ed-de2b18dc367a": { "file_size_bytes": 18699, @@ -75,6 +77,7 @@ def init_pids(): "sha256": "4473516a592209cbcd3a7ba4edeebbdb374ee8e4a49d19896fafb8f278dc25fa", "sha384": "b1023a9be5aa23a102be9bce66e71f1f1c7a6b6b03e3fc603e9cd36b4265671e94f9cc5ce3786879740536994489bc26", "sha512": "c7fac7e8aacde8546ddb44c640ad127df82830bba6794aea9952f737c13a81d69095865ab3018ed2a807bf9222f80657faf31cfde6c853d7b91e617e148fec76", + "blake2s": "c8c9aea2f7ddcfaf8db93ce95f18e467b6293660d1a0b08137636a3c92896765", }, } return test_pids diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index 2d34ccf0..719e8325 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -1336,7 +1336,6 @@ def test_verify_object_information(pids, store): checksum = object_metadata.hex_digests.get(store.algorithm) checksum_algorithm = store.algorithm expected_file_size = object_metadata.obj_size - # pylint: disable=W0212 store._verify_object_information( None, checksum, @@ -1359,7 +1358,6 @@ def test_verify_object_information_incorrect_size(pids, store): checksum = hex_digests.get(store.algorithm) checksum_algorithm = store.algorithm with pytest.raises(NonMatchingObjSize): - # pylint: disable=W0212 store._verify_object_information( None, checksum, @@ -1385,7 +1383,6 @@ def test_verify_object_information_incorrect_size_with_pid(pids, store): expected_file_size = object_metadata.obj_size objects_tmp_folder = store.objects + "/tmp" - # pylint: disable=W0212 tmp_file = store._mktmpfile(objects_tmp_folder) assert os.path.isfile(tmp_file.name) with pytest.raises(NonMatchingObjSize): @@ -1415,7 +1412,6 @@ def test_verify_object_information_missing_key_in_hex_digests_unsupported_algo( checksum_algorithm = "md10" expected_file_size = object_metadata.obj_size with pytest.raises(UnsupportedAlgorithm): - # pylint: disable=W0212 
store._verify_object_information( None, checksum, @@ -1432,7 +1428,7 @@ def test_verify_object_information_missing_key_in_hex_digests_supported_algo( pids, store ): """Test _verify_object_information throws exception when algorithm is not found - in hex digests but is supported, however the checksum calculated does not match.""" + in hex digests but is supported, and the checksum calculated does not match.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -1441,7 +1437,6 @@ def test_verify_object_information_missing_key_in_hex_digests_supported_algo( checksum_algorithm = "blake2s" expected_file_size = object_metadata.obj_size with pytest.raises(NonMatchingChecksum): - # pylint: disable=W0212 store._verify_object_information( None, checksum, @@ -1454,6 +1449,30 @@ def test_verify_object_information_missing_key_in_hex_digests_supported_algo( ) +def test_verify_object_information_missing_key_in_hex_digests_matching_checksum( + pids, store +): + """Test _verify_object_information does not throw exception when algorithm is not found + in hex digests but is supported, and the checksum calculated matches.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum_algorithm = "blake2s" + checksum = pids[pid][checksum_algorithm] + expected_file_size = object_metadata.obj_size + store._verify_object_information( + None, + checksum, + checksum_algorithm, + "objects", + object_metadata.hex_digests, + None, + expected_file_size, + expected_file_size, + ) + + def test_verify_hashstore_references_pid_refs_file_missing(pids, store): """Test _verify_hashstore_references throws exception when pid refs file is missing.""" for pid in pids.keys(): From d49cdfac8b5204e601d47bf672e6a858d09b5fa0 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 23 Sep 2024 17:51:20 -0700 Subject: [PATCH 386/420] Refactor private '_delete' method and 
clean-up remaining pytests --- src/hashstore/filehashstore.py | 61 +++--- tests/filehashstore/test_filehashstore.py | 241 +++++++++++----------- 2 files changed, 150 insertions(+), 152 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index c66aa686..3fabd431 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2572,7 +2572,11 @@ def _delete(self, entity, file): elif entity == "objects": realpath = self._get_hashstore_data_object_path(file) elif entity == "metadata": - realpath = self._get_hashstore_metadata_path(file) + try: + realpath = self._get_hashstore_metadata_path(file) + except FileNotFoundError: + # Swallow file not found exceptions for metadata + realpath = None elif os.path.exists(file): # Check if the given path is an absolute path realpath = file @@ -2580,13 +2584,10 @@ def _delete(self, entity, file): raise IOError( f"FileHashStore - delete(): Could not locate file: {file}" ) - except FileNotFoundError: - realpath = None - - try: if realpath is not None: os.remove(realpath) - except OSError as err: + + except Exception as err: exception_string = ( f"FileHashStore - delete(): Unexpected {err=}, {type(err)=}" ) @@ -2604,6 +2605,30 @@ def _create_path(self, path): except FileExistsError: assert os.path.isdir(path), f"expected {path} to be a directory" + def _get_store_path(self, entity): + """Return a path object to the root directory of the requested hashstore directory type + + :param str entity: Desired entity type: "objects", "metadata", "refs", "cid" and "pid". + Note, "cid" and "pid" are refs specific directories. 
+ + :return: Path to requested store entity type + :rtype: Path + """ + if entity == "objects": + return Path(self.objects) + elif entity == "metadata": + return Path(self.metadata) + elif entity == "refs": + return Path(self.refs) + elif entity == "cid": + return Path(self.cids) + elif entity == "pid": + return Path(self.pids) + else: + raise ValueError( + f"entity: {entity} does not exist. Do you mean 'objects', 'metadata' or 'refs'?" + ) + def _build_hashstore_data_object_path(self, hash_id): """Build the absolute file path for a given content identifier @@ -2699,30 +2724,6 @@ def _get_hashstore_cid_refs_path(self, cid): cid_ref_file_abs_path = os.path.join(root_dir, *directories_and_path) return cid_ref_file_abs_path - def _get_store_path(self, entity): - """Return a path object to the root directory of the requested hashstore directory type - - :param str entity: Desired entity type: "objects", "metadata", "refs", "cid" and "pid". - Note, "cid" and "pid" are refs specific directories. - - :return: Path to requested store entity type - :rtype: Path - """ - if entity == "objects": - return Path(self.objects) - elif entity == "metadata": - return Path(self.metadata) - elif entity == "refs": - return Path(self.refs) - elif entity == "cid": - return Path(self.cids) - elif entity == "pid": - return Path(self.pids) - else: - raise ValueError( - f"entity: {entity} does not exist. Do you mean 'objects', 'metadata' or 'refs'?" 
- ) - # Synchronization Methods def _release_object_locked_pids(self, pid): diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index 719e8325..4c150b7f 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -1560,15 +1560,41 @@ def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pi tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") shutil.move(tmp_pid_refs_file, pid_ref_abs_path) - cid_reference_list = [pid] for i in range(0, 5): store._update_refs_file(cid_ref_abs_path, f"dou.test.{i}", "add") - cid_reference_list.append(f"dou.test.{i}") with pytest.raises(CidRefsContentError): store._verify_hashstore_references(pid, cid) +def test_delete_object_only(pids, store): + """Test _delete_object successfully deletes only object.""" + test_dir = "tests/testdata/" + entity = "objects" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid=None, data=path) + store._delete_object_only(object_metadata.cid) + assert store._count(entity) == 0 + + +def test_delete_object_only_cid_refs_file_exists(pids, store): + """Test _delete_object does not delete object if a cid refs file still exists.""" + test_dir = "tests/testdata/" + entity = "objects" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + object_metadata = store.store_object(pid, path) + _metadata_stored_path = store.store_metadata(pid, syspath, format_id) + store._delete_object_only(object_metadata.cid) + assert store._count(entity) == 3 + assert store._count("pid") == 3 + assert store._count("cid") == 3 + + def test_clean_algorithm(store): """Check that algorithm values get formatted as expected.""" algorithm_underscore = "sha_256" @@ -1600,28 +1626,27 @@ 
def test_computehash(pids, store): assert pids[pid]["sha256"] == obj_sha256_hash -def test_get_store_path_object(store): - """Check get_store_path for object path.""" - # pylint: disable=W0212 - path_objects = store._get_store_path("objects") - path_objects_string = str(path_objects) - assert path_objects_string.endswith("/metacat/hashstore/objects") - - -def test_get_store_path_metadata(store): - """Check get_store_path for metadata path.""" - # pylint: disable=W0212 - path_metadata = store._get_store_path("metadata") - path_metadata_string = str(path_metadata) - assert path_metadata_string.endswith("/metacat/hashstore/metadata") +def test_shard(store): + """Test shard creates list.""" + hash_id = "0d555ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e" + predefined_list = [ + "0d", + "55", + "5e", + "d77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e", + ] + sharded_list = store._shard(hash_id) + assert predefined_list == sharded_list -def test_get_store_path_refs(store): - """Check get_store_path for refs path.""" - # pylint: disable=W0212 - path_metadata = store._get_store_path("refs") - path_metadata_string = str(path_metadata) - assert path_metadata_string.endswith("/metacat/hashstore/refs") +def test_count(pids, store): + """Check that count returns expected number of objects.""" + test_dir = "tests/testdata/" + entity = "objects" + for pid in pids.keys(): + path_string = test_dir + pid.replace("/", "_") + store._store_and_validate_data(pid, path_string) + assert store._count(entity) == 3 def test_exists_object_with_object_metadata_id(pids, store): @@ -1666,19 +1691,6 @@ def test_exists_object_with_nonexistent_file(store): assert does_not_exist is False -def test_shard(store): - """Test shard creates list.""" - hash_id = "0d555ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e" - predefined_list = [ - "0d", - "55", - "5e", - "d77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e", - ] - sharded_list = store._shard(hash_id) - 
assert predefined_list == sharded_list - - def test_open_objects(pids, store): """Test open returns a stream.""" test_dir = "tests/testdata/" @@ -1692,44 +1704,52 @@ def test_open_objects(pids, store): io_buffer.close() -def test_delete_object_only(pids, store): - """Test _delete_object successfully deletes only object.""" +def test_private_delete_objects(pids, store): + """Confirm _delete deletes for entity type 'objects'""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid=None, data=path) - store._delete_object_only(object_metadata.cid) - assert store._count(entity) == 0 + path = Path(test_dir + pid.replace("/", "_")) + object_metadata = store.store_object(pid, path) + store._delete("objects", object_metadata.cid) + assert store._count("objects") == 0 -def test_delete_object_only_cid_refs_file_exists(pids, store): - """Test _delete_object does not delete object if a cid refs file still exists.""" + +def test_private_delete_metadata(pids, store): + """Confirm _delete deletes for entity type 'metadata'""" test_dir = "tests/testdata/" - entity = "objects" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - object_metadata = store.store_object(pid, path) - _metadata_stored_path = store.store_metadata(pid, syspath, format_id) - store._delete_object_only(object_metadata.cid) - assert store._count(entity) == 3 - assert store._count("pid") == 3 - assert store._count("cid") == 3 + store.store_metadata(pid, syspath, format_id) + # Manually calculate expected path + metadata_directory = store._computehash(pid) + metadata_document_name = store._computehash(pid + format_id) + rel_path = ( + Path("/".join(store._shard(metadata_directory))) / metadata_document_name + ) + + store._delete("metadata", rel_path) 
+ + assert store._count("metadata") == 0 -def test_delete_with_object_metadata_id(pids, store): - """Check objects are deleted after calling delete with object id.""" + +def test_private_delete_absolute_path(pids, store): + """Confirm _delete deletes for absolute paths'""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store._store_and_validate_data(pid, path) - object_metadata_id = object_metadata.cid - store._delete(entity, object_metadata_id) - assert store._count(entity) == 0 + path = Path(test_dir + pid.replace("/", "_")) + object_metadata = store.store_object(pid, path) + + cid_refs_path = store._get_hashstore_cid_refs_path(object_metadata.cid) + store._delete("other", cid_refs_path) + assert store._count("cid") == 0 + + pid_refs_path = store._get_hashstore_pid_refs_path(pid) + store._delete("other", pid_refs_path) + assert store._count("pid") == 0 def test_create_path(pids, store): @@ -1742,15 +1762,39 @@ def test_create_path(pids, store): assert os.path.isdir(pid_directory) -def test_get_real_path_file_does_not_exist(store): - """Test get_real_path returns None when object does not exist.""" +def test_get_store_path_object(store): + """Check get_store_path for object path.""" + # pylint: disable=W0212 + path_objects = store._get_store_path("objects") + path_objects_string = str(path_objects) + assert path_objects_string.endswith("/metacat/hashstore/objects") + + +def test_get_store_path_metadata(store): + """Check get_store_path for metadata path.""" + # pylint: disable=W0212 + path_metadata = store._get_store_path("metadata") + path_metadata_string = str(path_metadata) + assert path_metadata_string.endswith("/metacat/hashstore/metadata") + + +def test_get_store_path_refs(store): + """Check get_store_path for refs path.""" + # pylint: disable=W0212 + path_metadata = store._get_store_path("refs") + path_metadata_string = str(path_metadata) + assert 
path_metadata_string.endswith("/metacat/hashstore/refs") + + +def test_get_hashstore_data_object_path_file_does_not_exist(store): + """Test _get_hashstore_data_object_path returns None when object does not exist.""" test_path = "tests/testdata/helloworld.txt" with pytest.raises(FileNotFoundError): store._get_hashstore_data_object_path(test_path) -def test_get_real_path_with_object_id(store, pids): - """Test get_real_path returns absolute path given an object id.""" +def test_get_hashstore_data_object_path_with_object_id(store, pids): + """Test _get_hashstore_data_object_path returns absolute path given an object id.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -1759,20 +1803,8 @@ def test_get_real_path_with_object_id(store, pids): assert os.path.exists(obj_abs_path) -def test_get_real_path_with_object_id_sharded(pids, store): - """Test exists method with a sharded path (relative path).""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store._store_and_validate_data(pid, path) - object_metadata_shard = store._shard(object_metadata.cid) - object_metadata_shard_path = "/".join(object_metadata_shard) - obj_abs_path = store._get_hashstore_data_object_path(object_metadata_shard_path) - assert os.path.exists(obj_abs_path) - - -def test_get_real_path_with_metadata_id(store, pids): - """Test get_real_path returns absolute path given a metadata id.""" +def test_get_hashstore_metadata_path_absolute_path(store, pids): + """Test _get_hashstore_metadata_path returns absolute path given a metadata id.""" test_dir = "tests/testdata/" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): @@ -1783,50 +1815,7 @@ def test_get_real_path_with_metadata_id(store, pids): assert os.path.exists(metadata_abs_path) -def test_build_hashstore_data_object_path(store, pids): - """Test _build_hashstore_data_object_path builds the 
hashstore data object file path.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - _ = store._store_and_validate_data(pid, path) - # pylint: disable=W0212 - abs_path = store._build_hashstore_data_object_path(pids[pid][store.algorithm]) - assert os.path.exists(abs_path) - - -def test_count(pids, store): - """Check that count returns expected number of objects.""" - test_dir = "tests/testdata/" - entity = "objects" - for pid in pids.keys(): - path_string = test_dir + pid.replace("/", "_") - store._store_and_validate_data(pid, path_string) - assert store._count(entity) == 3 - - -def test_cast_to_bytes(store): - """Test _to_bytes returns bytes.""" - string = "teststring" - # pylint: disable=W0212 - string_bytes = store._cast_to_bytes(string) - assert isinstance(string_bytes, bytes) - - -def test_get_hashstore_data_object_path(pids, store): - """Confirm resolve path returns correct object path""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = Path(test_dir + pid.replace("/", "_")) - object_metadata = store.store_object(pid, path) - cid = object_metadata.cid - - obj_resolved_path = store._get_hashstore_data_object_path(cid) - calculated_obj_path = store.objects + "/" + "/".join(store._shard(cid)) - - assert calculated_obj_path == obj_resolved_path - - -def test_get_hashstore_metadata_path_metadata(pids, store): +def test_get_hashstore_metadata_path_relative_path(pids, store): """Confirm resolve path returns correct metadata path.""" test_dir = "tests/testdata/" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" @@ -1904,6 +1893,14 @@ def test_check_string(store): store._check_string(tab_line, "tab_line") +def test_cast_to_bytes(store): + """Test _to_bytes returns bytes.""" + string = "teststring" + # pylint: disable=W0212 + string_bytes = store._cast_to_bytes(string) + assert isinstance(string_bytes, bytes) + + def test_stream_reads_file(pids): """Test that a stream can read a 
file and yield its contents.""" test_dir = "tests/testdata/" From 3ab23098beff6e87e3e4847f644fb8ab9a157d28 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 24 Sep 2024 09:27:01 -0700 Subject: [PATCH 387/420] Revert 'pyproject.toml' version and add version to 'hashstore/__init__.py' to prepare use of poetry bump version --- pyproject.toml | 2 +- src/hashstore/__init__.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 92d608f9..7aa46011 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "hashstore" -version = "1.1.0" +version = "1.0.0" description = "HashStore, an object storage system using content identifiers." authors = ["Dou Mok ", "Matt Jones ", "Matthew Brooke", "Jing Tao", "Jeanette Clark", "Ian M. Nesbitt"] diff --git a/src/hashstore/__init__.py b/src/hashstore/__init__.py index a841efa3..dcf03937 100644 --- a/src/hashstore/__init__.py +++ b/src/hashstore/__init__.py @@ -19,3 +19,4 @@ from hashstore.hashstore import HashStore, HashStoreFactory __all__ = ("HashStore", "HashStoreFactory") +__version__ = "1.0.0" From 22624db56d237c69043e6d60b621a20c5aec46b8 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 24 Sep 2024 09:28:03 -0700 Subject: [PATCH 388/420] Add poetry bump version code to 'pyproject.toml' --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 7aa46011..41e45a0a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,8 @@ python = ">=3.9" pathlib = ">=1.0.1" pyyaml = ">=6.0" +[tool.poetry_bumpversion.file."src/hashstore/__init__.py"] + [tool.poetry.group.dev.dependencies] pytest = ">=7.2.0" black = ">=22.10.0" From 50b8f8d044fc463b568e6288e6bfe7d6b867f9d5 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 24 Sep 2024 09:31:15 -0700 Subject: [PATCH 389/420] Update version to '1.1.0' via 'poetry version minor' --- pyproject.toml | 2 +- src/hashstore/__init__.py | 2 +- 2 files changed, 2 
insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 41e45a0a..13d2e42b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "hashstore" -version = "1.0.0" +version = "1.1.0" description = "HashStore, an object storage system using content identifiers." authors = ["Dou Mok ", "Matt Jones ", "Matthew Brooke", "Jing Tao", "Jeanette Clark", "Ian M. Nesbitt"] diff --git a/src/hashstore/__init__.py b/src/hashstore/__init__.py index dcf03937..be656f5e 100644 --- a/src/hashstore/__init__.py +++ b/src/hashstore/__init__.py @@ -19,4 +19,4 @@ from hashstore.hashstore import HashStore, HashStoreFactory __all__ = ("HashStore", "HashStoreFactory") -__version__ = "1.0.0" +__version__ = "1.1.0" From e78071195661616b455bb91953c840b4cf4e53f2 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 24 Sep 2024 10:13:51 -0700 Subject: [PATCH 390/420] Update 'README.md' --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 163fa1c4..eeff4e9d 100644 --- a/README.md +++ b/README.md @@ -311,9 +311,9 @@ To install `hashstore` locally, create a virtual environment for python 3.9+, install poetry, and then install or build the package with `poetry install` or `poetry build`, respectively. -To run tests, navigate to the root directory and run `pytest -s`. The test suite contains tests that +To run tests, navigate to the root directory and run `pytest`. The test suite contains tests that take a longer time to run (relating to the storage of large files) - to execute all tests, run -`pytest --run-slow`. To see detailed +`pytest --run-slow`. 
## HashStore Client From 44b1f79675fa75f876b8cdc310235a337528cd55 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 24 Sep 2024 12:15:33 -0700 Subject: [PATCH 391/420] Add type hints to all method signatures, fix minor issues related to typing inconsistencies and revise affected pytests --- src/hashstore/filehashstore.py | 280 ++++++++++-------- tests/filehashstore/test_filehashstore.py | 9 +- .../test_filehashstore_interface.py | 2 +- 3 files changed, 166 insertions(+), 125 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 3fabd431..357e8774 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -11,6 +11,7 @@ import inspect import fcntl import yaml +from typing import List, Dict, Union, Optional, IO, Tuple, Set, Any from dataclasses import dataclass from pathlib import Path from contextlib import closing @@ -190,7 +191,9 @@ def __init__(self, properties=None): # Configuration and Related Methods @staticmethod - def _load_properties(hashstore_yaml_path, hashstore_required_prop_keys): + def _load_properties( + hashstore_yaml_path: str, hashstore_required_prop_keys: List[str] + ) -> Dict[str, Union[str, int]]: """Get and return the contents of the current HashStore configuration. :return: HashStore properties with the following keys (and values): @@ -222,7 +225,7 @@ def _load_properties(hashstore_yaml_path, hashstore_required_prop_keys): ) return hashstore_yaml_dict - def _write_properties(self, properties): + def _write_properties(self, properties: Dict[str, Union[str, int]]) -> None: """Writes 'hashstore.yaml' to FileHashStore's root directory with the respective properties object supplied. 
@@ -290,8 +293,11 @@ def _write_properties(self, properties): @staticmethod def _build_hashstore_yaml_string( - store_depth, store_width, store_algorithm, store_metadata_namespace - ): + store_depth: int, + store_width: int, + store_algorithm: str, + store_metadata_namespace: str, + ) -> str: """Build a YAML string representing the configuration for a HashStore. :param int store_depth: Depth when sharding an object's hex digest. @@ -338,7 +344,9 @@ def _build_hashstore_yaml_string( """ return hashstore_configuration_yaml - def _verify_hashstore_properties(self, properties, prop_store_path): + def _verify_hashstore_properties( + self, properties: Dict[str, Union[str, int]], prop_store_path: str + ) -> None: """Determines whether FileHashStore can instantiate by validating a set of arguments and throwing exceptions. HashStore will not instantiate if an existing configuration file's properties (`hashstore.yaml`) are different from what is supplied - or if an @@ -392,7 +400,9 @@ def _verify_hashstore_properties(self, properties, prop_store_path): logging.critical(exception_string) raise RuntimeError(exception_string) - def _validate_properties(self, properties): + def _validate_properties( + self, properties: Dict[str, Union[str, int]] + ) -> Dict[str, Union[str, int]]: """Validate a properties dictionary by checking if it contains all the required keys and non-None values. 
@@ -496,13 +506,13 @@ def lookup_algo(algo_to_translate): def store_object( self, - pid=None, - data=None, - additional_algorithm=None, - checksum=None, - checksum_algorithm=None, - expected_object_size=None, - ): + pid: Optional[str] = None, + data: Optional[Union[str, bytes]] = None, + additional_algorithm: Optional[str] = None, + checksum: Optional[str] = None, + checksum_algorithm: Optional[str] = None, + expected_object_size: Optional[int] = None, + ) -> "ObjectMetadata": if pid is None and self._check_arg_data(data): # If no pid is supplied, store the object only without tagging logging.debug("FileHashStore - store_object: Request to store data only.") @@ -587,7 +597,7 @@ def store_object( return object_metadata - def tag_object(self, pid, cid): + def tag_object(self, pid: str, cid: str) -> None: logging.debug( "FileHashStore - tag_object: Tagging object cid: %s with pid: %s.", cid, @@ -612,8 +622,12 @@ def tag_object(self, pid, cid): raise PidRefsAlreadyExistsError(err_msg) def delete_if_invalid_object( - self, object_metadata, checksum, checksum_algorithm, expected_file_size - ): + self, + object_metadata: "ObjectMetadata", + checksum: str, + checksum_algorithm: str, + expected_file_size: int, + ) -> None: self._check_string(checksum, "checksum") self._check_string(checksum_algorithm, "checksum_algorithm") self._check_integer(expected_file_size) @@ -657,7 +671,9 @@ def delete_if_invalid_object( object_metadata.cid, ) - def store_metadata(self, pid, metadata, format_id=None): + def store_metadata( + self, pid: str, metadata: Union[str, bytes], format_id: Optional[str] = None + ) -> str: logging.debug( "FileHashStore - store_metadata: Request to store metadata for pid: %s", pid ) @@ -717,7 +733,7 @@ def store_metadata(self, pid, metadata, format_id=None): self.metadata_locked_docs_th.remove(pid_doc) self.metadata_condition_th.notify() - def retrieve_object(self, pid): + def retrieve_object(self, pid: str) -> IO[bytes]: logging.debug( "FileHashStore - 
retrieve_object: Request to retrieve object for pid: %s", pid, @@ -746,7 +762,7 @@ def retrieve_object(self, pid): return obj_stream - def retrieve_metadata(self, pid, format_id=None): + def retrieve_metadata(self, pid: str, format_id: Optional[str] = None) -> IO[bytes]: logging.debug( "FileHashStore - retrieve_metadata: Request to retrieve metadata for pid: %s", pid, @@ -777,7 +793,7 @@ def retrieve_metadata(self, pid, format_id=None): logging.error(exception_string) raise ValueError(exception_string) - def delete_object(self, pid): + def delete_object(self, pid: str) -> None: logging.debug( "FileHashStore - delete_object: Request to delete object for id: %s", pid ) @@ -854,7 +870,7 @@ def delete_object(self, pid): self._rename_path_for_deletion(pid_ref_abs_path) ) # Remove pid from cid reference file - self._update_refs_file(cid_ref_abs_path, pid, "remove") + self._update_refs_file(Path(cid_ref_abs_path), pid, "remove") # Delete cid reference file and object only if the cid refs file is empty if os.path.getsize(cid_ref_abs_path) == 0: debug_msg = ( @@ -926,7 +942,7 @@ def delete_object(self, pid): return except RefsFileExistsButCidObjMissing: # Add pid refs file to be permanently deleted - pid_ref_abs_path = str(self._get_hashstore_pid_refs_path(pid)) + pid_ref_abs_path = self._get_hashstore_pid_refs_path(pid) objects_to_delete.append( self._rename_path_for_deletion(pid_ref_abs_path) ) @@ -934,8 +950,8 @@ def delete_object(self, pid): pid_refs_cid = self._read_small_file_content(pid_ref_abs_path) cid_ref_abs_str = str(self._get_hashstore_cid_refs_path(pid_refs_cid)) # Remove if the pid refs is found - if self._is_string_in_refs_file(pid, cid_ref_abs_str): - self._update_refs_file(cid_ref_abs_str, pid, "remove") + if self._is_string_in_refs_file(pid, Path(cid_ref_abs_str)): + self._update_refs_file(Path(cid_ref_abs_str), pid, "remove") # Remove metadata files if they exist self.delete_metadata(pid) # Remove all files confirmed for deletion @@ -970,7 +986,7 @@ def 
delete_object(self, pid): self.object_locked_pids_th.remove(pid) self.object_pid_condition_th.notify() - def delete_metadata(self, pid, format_id=None): + def delete_metadata(self, pid: str, format_id: Optional[str] = None) -> None: logging.debug( "FileHashStore - delete_metadata: Request to delete metadata for pid: %s", pid, @@ -1100,7 +1116,7 @@ def delete_metadata(self, pid, format_id=None): self.metadata_locked_docs_th.remove(pid_doc) self.metadata_condition_th.notify() - def get_hex_digest(self, pid, algorithm): + def get_hex_digest(self, pid: str, algorithm: str) -> str: logging.debug( "FileHashStore - get_hex_digest: Request to get hex digest for object with pid: %s", pid, @@ -1129,7 +1145,7 @@ def get_hex_digest(self, pid, algorithm): # FileHashStore Core Methods - def _find_object(self, pid): + def _find_object(self, pid: str) -> Dict[str, str]: """Check if an object referenced by a pid exists and retrieve its content identifier. The `find_object` method validates the existence of an object based on the provided pid and returns the associated content identifier. 
@@ -1157,7 +1173,7 @@ def _find_object(self, pid): cid_ref_abs_path = self._get_hashstore_cid_refs_path(pid_refs_cid) if os.path.exists(cid_ref_abs_path): # Check that the pid is actually found in the cid reference file - if self._is_string_in_refs_file(pid, str(cid_ref_abs_path)): + if self._is_string_in_refs_file(pid, cid_ref_abs_path): # Object must also exist in order to return the cid retrieved if not self._exists("objects", pid_refs_cid): err_msg = ( @@ -1216,13 +1232,13 @@ def _find_object(self, pid): def _store_and_validate_data( self, - pid, - file, - additional_algorithm=None, - checksum=None, - checksum_algorithm=None, - file_size_to_validate=None, - ): + pid: str, + file: Union[str, bytes], + additional_algorithm: Optional[str] = None, + checksum: Optional[str] = None, + checksum_algorithm: Optional[str] = None, + file_size_to_validate: Optional[int] = None, + ) -> "ObjectMetadata": """Store contents of `file` on disk, validate the object's parameters if provided, and tag/reference the object. @@ -1266,7 +1282,7 @@ def _store_and_validate_data( ) return object_metadata - def _store_data_only(self, data): + def _store_data_only(self, data: Union[str, bytes]) -> "ObjectMetadata": """Store an object to HashStore and return a metadata object containing the content identifier, object file size and hex digests dictionary of the default algorithms. This method does not validate the object and writes directly to `/objects` after the hex @@ -1320,13 +1336,13 @@ def _store_data_only(self, data): def _move_and_get_checksums( self, - pid, - stream, - additional_algorithm=None, - checksum=None, - checksum_algorithm=None, - file_size_to_validate=None, - ): + pid: Optional[str], + stream: "Stream", + additional_algorithm: Optional[str] = None, + checksum: Optional[str] = None, + checksum_algorithm: Optional[str] = None, + file_size_to_validate: Optional[int] = None, + ) -> Tuple[str, int, Dict[str, str]]: """Copy the contents of the `Stream` object onto disk. 
The copy process uses a temporary file to store the initial contents and returns a dictionary of algorithms and their hex digest values. If the file already exists, the method will immediately @@ -1477,8 +1493,11 @@ def _move_and_get_checksums( return object_cid, tmp_file_size, hex_digests def _write_to_tmp_file_and_get_hex_digests( - self, stream, additional_algorithm=None, checksum_algorithm=None - ): + self, + stream: "Stream", + additional_algorithm: Optional[str] = None, + checksum_algorithm: Optional[str] = None, + ) -> Tuple[Dict[str, str], str, int]: """Create a named temporary file from a `Stream` object and return its filename and a dictionary of its algorithms and hex digests. If an additional and/or checksum algorithm is provided, it will add the respective hex digest to the dictionary if @@ -1491,6 +1510,7 @@ def _write_to_tmp_file_and_get_hex_digests( :return: tuple - hex_digest_dict, tmp.name - hex_digest_dict (dict): Algorithms and their hex digests. - tmp.name (str): Name of the temporary file created and written into. + - tmp_file_size (int): Size of the data object """ # Review additional hash object to digest and create new list algorithm_list_to_calculate = self._refine_algorithm_list( @@ -1567,7 +1587,7 @@ def _write_to_tmp_file_and_get_hex_digests( ) logging.error(exception_string) - def _mktmpfile(self, path): + def _mktmpfile(self, path: Path) -> IO[bytes]: """Create a temporary file at the given path ready to be written. :param Path path: Path to the file location. @@ -1596,7 +1616,7 @@ def delete_tmp_file(): os.umask(oldmask) return tmp - def _store_hashstore_refs_files(self, pid, cid): + def _store_hashstore_refs_files(self, pid: str, cid: str) -> None: """Create the pid refs file and create/update cid refs files in HashStore to establish the relationship between a 'pid' and a 'cid'. 
@@ -1711,7 +1731,7 @@ def _store_hashstore_refs_files(self, pid, cid): self._release_object_locked_cids(cid) self._release_reference_locked_pids(pid) - def _untag_object(self, pid, cid): + def _untag_object(self, pid: str, cid: str) -> None: """Untags a data object in HashStore by deleting the 'pid reference file' and removing the 'pid' from the 'cid reference file'. This method will never delete a data object. `_untag_object` will attempt to proceed with as much of the untagging process as @@ -1837,7 +1857,9 @@ def _untag_object(self, pid, cid): ) logging.warning(warn_msg) - def _put_metadata(self, metadata, pid, metadata_doc_name): + def _put_metadata( + self, metadata: Union[str, bytes], pid: str, metadata_doc_name: str + ) -> Path: """Store contents of metadata to `[self.root]/metadata` using the hash of the given PID and format ID as the permanent address. @@ -1895,7 +1917,7 @@ def _put_metadata(self, metadata, pid, metadata_doc_name): logging.error(exception_string) raise FileNotFoundError(exception_string) - def _mktmpmetadata(self, stream): + def _mktmpmetadata(self, stream: "Stream") -> str: """Create a named temporary file with `stream` (metadata). :param Stream stream: Metadata stream. @@ -1925,7 +1947,7 @@ def _mktmpmetadata(self, stream): # FileHashStore Utility & Supporting Methods @staticmethod - def _delete_marked_files(delete_list): + def _delete_marked_files(delete_list: list[str]) -> None: """Delete all the file paths in a given delete list. :param list delete_list: Persistent or authority-based identifier. @@ -1940,7 +1962,9 @@ def _delete_marked_files(delete_list): else: raise ValueError("delete_marked_files: list cannot be None") - def _mark_pid_refs_file_for_deletion(self, pid, delete_list, pid_refs_path): + def _mark_pid_refs_file_for_deletion( + self, pid: str, delete_list: List[str], pid_refs_path: Path + ) -> None: """Attempt to rename a pid refs file and add the renamed file to a provided list. 
:param str pid: Persistent or authority-based identifier. @@ -1957,7 +1981,9 @@ def _mark_pid_refs_file_for_deletion(self, pid, delete_list, pid_refs_path): ) logging.error(err_msg) - def _remove_pid_and_handle_cid_refs_deletion(self, pid, delete_list, cid_refs_path): + def _remove_pid_and_handle_cid_refs_deletion( + self, pid: str, delete_list: List[str], cid_refs_path: Path + ) -> None: """Attempt to remove a pid from a 'cid refs file' and add the 'cid refs file' to the delete list if it is empty. @@ -1979,7 +2005,9 @@ def _remove_pid_and_handle_cid_refs_deletion(self, pid, delete_list, cid_refs_pa ) logging.error(err_msg) - def _validate_and_check_cid_lock(self, pid, cid, cid_to_check): + def _validate_and_check_cid_lock( + self, pid: str, cid: str, cid_to_check: str + ) -> None: """Confirm that the two content identifiers provided are equal and is locked to ensure thread safety. @@ -1998,7 +2026,7 @@ def _validate_and_check_cid_lock(self, pid, cid, cid_to_check): raise ValueError(err_msg) self._check_object_locked_cids(cid) - def _write_refs_file(self, path, ref_id, ref_type): + def _write_refs_file(self, path: Path, ref_id: str, ref_type: str) -> str: """Write a reference file in the supplied path into a temporary file. All `pid` or `cid` reference files begin with a single identifier, with the difference being that a cid reference file can potentially contain multiple @@ -2034,7 +2062,9 @@ def _write_refs_file(self, path, ref_id, ref_type): logging.error(exception_string) raise err - def _update_refs_file(self, refs_file_path, ref_id, update_type): + def _update_refs_file( + self, refs_file_path: Path, ref_id: str, update_type: str + ) -> None: """Add or remove an existing ref from a refs file. :param path refs_file_path: Absolute path to the refs file. 
@@ -2090,7 +2120,7 @@ def _update_refs_file(self, refs_file_path, ref_id, update_type): raise err @staticmethod - def _is_string_in_refs_file(ref_id, refs_file_path): + def _is_string_in_refs_file(ref_id: str, refs_file_path: Path) -> bool: """Check a reference file for a ref_id (`cid` or `pid`). :param str ref_id: Authority-based, persistent identifier or content identifier @@ -2109,15 +2139,15 @@ def _is_string_in_refs_file(ref_id, refs_file_path): def _verify_object_information( self, - pid, - checksum, - checksum_algorithm, - entity, - hex_digests, - tmp_file_name, - tmp_file_size, - file_size_to_validate, - ): + pid: Optional[str], + checksum: str, + checksum_algorithm: str, + entity: str, + hex_digests: Dict[str, str], + tmp_file_name: Optional[str], + tmp_file_size: int, + file_size_to_validate: int, + ) -> None: """Evaluates an object's integrity - if there is a mismatch, deletes the object in question and raises an exception. @@ -2198,12 +2228,12 @@ def _verify_object_information( def _verify_hashstore_references( self, - pid, - cid, - pid_refs_path=None, - cid_refs_path=None, - additional_log_string=None, - ): + pid: str, + cid: str, + pid_refs_path: Optional[Path] = None, + cid_refs_path: Optional[Path] = None, + additional_log_string: Optional[str] = None, + ) -> None: """Verifies that the supplied pid and pid reference file and content have been written successfully. @@ -2227,7 +2257,7 @@ def _verify_hashstore_references( if not os.path.exists(pid_refs_path): exception_string = ( "FileHashStore - _verify_hashstore_references: Pid refs file missing: " - + pid_refs_path + + str(pid_refs_path) + f" . Additional Context: {additional_log_string}" ) logging.error(exception_string) @@ -2235,7 +2265,7 @@ def _verify_hashstore_references( if not os.path.exists(cid_refs_path): exception_string = ( "FileHashStore - _verify_hashstore_references: Cid refs file missing: " - + cid_refs_path + + str(cid_refs_path) + f" . 
Additional Context: {additional_log_string}" ) logging.error(exception_string) @@ -2262,7 +2292,7 @@ def _verify_hashstore_references( logging.error(exception_string) raise CidRefsContentError(exception_string) - def _delete_object_only(self, cid): + def _delete_object_only(self, cid: str) -> None: """Attempt to delete an object based on the given content identifier (cid). If the object has any pids references and/or a cid refs file exists, the object will not be deleted. @@ -2314,8 +2344,11 @@ def _delete_object_only(self, cid): self.object_cid_condition_th.notify() def _check_arg_algorithms_and_checksum( - self, additional_algorithm, checksum, checksum_algorithm - ): + self, + additional_algorithm: Optional[str], + checksum: Optional[str], + checksum_algorithm: Optional[str], + ) -> Tuple[Optional[str], Optional[str]]: """Determines whether the caller has supplied the necessary arguments to validate an object with a checksum value. @@ -2343,7 +2376,7 @@ def _check_arg_algorithms_and_checksum( checksum_algorithm_checked = self._clean_algorithm(checksum_algorithm) return additional_algorithm_checked, checksum_algorithm_checked - def _check_arg_format_id(self, format_id, method): + def _check_arg_format_id(self, format_id: str, method: str) -> str: """Determines the metadata namespace (format_id) to use for storing, retrieving, and deleting metadata. @@ -2364,7 +2397,9 @@ def _check_arg_format_id(self, format_id, method): checked_format_id = format_id return checked_format_id - def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): + def _refine_algorithm_list( + self, additional_algorithm: Optional[str], checksum_algorithm: Optional[str] + ) -> Set[str]: """Create the final list of hash algorithms to calculate. :param str additional_algorithm: Additional algorithm. 
@@ -2397,7 +2432,7 @@ def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): algorithm_list_to_calculate = set(algorithm_list_to_calculate) return algorithm_list_to_calculate - def _clean_algorithm(self, algorithm_string): + def _clean_algorithm(self, algorithm_string: str) -> str: """Format a string and ensure that it is supported and compatible with the Python `hashlib` library. @@ -2427,7 +2462,9 @@ def _clean_algorithm(self, algorithm_string): raise UnsupportedAlgorithm(exception_string) return cleaned_string - def _computehash(self, stream, algorithm=None): + def _computehash( + self, stream: Union["Stream", str, IO[bytes]], algorithm: Optional[str] = None + ) -> str: """Compute the hash of a file-like object (or string) using the store algorithm by default or with an optional supported algorithm. @@ -2448,7 +2485,7 @@ def _computehash(self, stream, algorithm=None): hex_digest = hashobj.hexdigest() return hex_digest - def _shard(self, checksum): + def _shard(self, checksum: str) -> List[str]: """Splits the given checksum into a list of tokens of length `self.width`, followed by the remainder. @@ -2468,7 +2505,7 @@ def _shard(self, checksum): :rtype: list """ - def compact(items): + def compact(items: List[Any]) -> List[Any]: """Return only truthy elements of `items`.""" # truthy_items = [] # for item in items: @@ -2486,7 +2523,7 @@ def compact(items): return hierarchical_list - def _count(self, entity): + def _count(self, entity: str) -> int: """Return the count of the number of files in the `root` directory. :param str entity: Desired entity type (ex. "objects", "metadata"). @@ -2515,7 +2552,7 @@ def _count(self, entity): count += 1 return count - def _exists(self, entity, file): + def _exists(self, entity: str, file: str) -> bool: """Check whether a given file id or path exists on disk. :param str entity: Desired entity type (e.g., "objects", "metadata"). 
@@ -2535,7 +2572,9 @@ def _exists(self, entity, file): except FileNotFoundError: return False - def _open(self, entity, file, mode="rb"): + def _open( + self, entity: str, file: str, mode: str = "rb" + ) -> Union[IO[bytes], IO[str]]: """Return open buffer object from given id or path. Caller is responsible for closing the stream. @@ -2559,7 +2598,7 @@ def _open(self, entity, file, mode="rb"): buffer = io.open(realpath, mode) return buffer - def _delete(self, entity, file): + def _delete(self, entity: str, file: Union[str, Path]) -> None: """Delete file using id or path. Remove any empty directories after deleting. No exception is raised if file doesn't exist. @@ -2594,7 +2633,7 @@ def _delete(self, entity, file): logging.error(exception_string) raise err - def _create_path(self, path): + def _create_path(self, path: Path) -> None: """Physically create the folder path (and all intermediate ones) on disk. :param Path path: The path to create. @@ -2605,7 +2644,7 @@ def _create_path(self, path): except FileExistsError: assert os.path.isdir(path), f"expected {path} to be a directory" - def _get_store_path(self, entity): + def _get_store_path(self, entity: str) -> Path: """Return a path object to the root directory of the requested hashstore directory type :param str entity: Desired entity type: "objects", "metadata", "refs", "cid" and "pid". @@ -2629,7 +2668,7 @@ def _get_store_path(self, entity): f"entity: {entity} does not exist. Do you mean 'objects', 'metadata' or 'refs'?" ) - def _build_hashstore_data_object_path(self, hash_id): + def _build_hashstore_data_object_path(self, hash_id: str) -> str: """Build the absolute file path for a given content identifier :param str hash_id: A hash ID to build a file path for. 
@@ -2642,7 +2681,7 @@ def _build_hashstore_data_object_path(self, hash_id): absolute_path = os.path.join(root_dir, *paths) return absolute_path - def _get_hashstore_data_object_path(self, cid_or_relative_path): + def _get_hashstore_data_object_path(self, cid_or_relative_path: str) -> Path: """Get the expected path to a hashstore data object that exists using a content identifier. :param str cid_or_relative_path: Content identifier or relative path in '/objects' to check @@ -2654,16 +2693,16 @@ def _get_hashstore_data_object_path(self, cid_or_relative_path): cid_or_relative_path ) if os.path.isfile(expected_abs_data_obj_path): - return expected_abs_data_obj_path + return Path(expected_abs_data_obj_path) else: if os.path.isfile(cid_or_relative_path): # Check whether the supplied arg is an abs path that exists or not for convenience - return cid_or_relative_path + return Path(cid_or_relative_path) else: # Check the relative path relpath = os.path.join(self.objects, cid_or_relative_path) if os.path.isfile(relpath): - return relpath + return Path(relpath) else: raise FileNotFoundError( "FileHashStore - _get_hashstore_data_object_path: could not locate a" @@ -2671,7 +2710,7 @@ def _get_hashstore_data_object_path(self, cid_or_relative_path): + cid_or_relative_path ) - def _get_hashstore_metadata_path(self, metadata_relative_path): + def _get_hashstore_metadata_path(self, metadata_relative_path: str) -> Path: """Return the expected metadata path to a hashstore metadata object that exists. 
:param str metadata_relative_path: Metadata path to check or relative path in @@ -2683,11 +2722,11 @@ def _get_hashstore_metadata_path(self, metadata_relative_path): # Form the absolute path to the metadata file expected_abs_metadata_path = os.path.join(self.metadata, metadata_relative_path) if os.path.isfile(expected_abs_metadata_path): - return expected_abs_metadata_path + return Path(expected_abs_metadata_path) else: if os.path.isfile(metadata_relative_path): # Check whether the supplied arg is an abs path that exists or not for convenience - return metadata_relative_path + return Path(metadata_relative_path) else: raise FileNotFoundError( "FileHashStore - _get_hashstore_metadata_path: could not locate a" @@ -2695,7 +2734,7 @@ def _get_hashstore_metadata_path(self, metadata_relative_path): + metadata_relative_path ) - def _get_hashstore_pid_refs_path(self, pid): + def _get_hashstore_pid_refs_path(self, pid: str) -> Path: """Return the expected path to a pid reference file. The path may or may not exist. :param str pid: Persistent or authority-based identifier @@ -2708,9 +2747,9 @@ def _get_hashstore_pid_refs_path(self, pid): root_dir = self._get_store_path("pid") directories_and_path = self._shard(hash_id) pid_ref_file_abs_path = os.path.join(root_dir, *directories_and_path) - return pid_ref_file_abs_path + return Path(pid_ref_file_abs_path) - def _get_hashstore_cid_refs_path(self, cid): + def _get_hashstore_cid_refs_path(self, cid: str) -> Path: """Return the expected path to a cid reference file. The path may or may not exist. 
:param str cid: Content identifier @@ -2722,11 +2761,11 @@ def _get_hashstore_cid_refs_path(self, cid): # The content identifier is to be split into directories as is supplied directories_and_path = self._shard(cid) cid_ref_file_abs_path = os.path.join(root_dir, *directories_and_path) - return cid_ref_file_abs_path + return Path(cid_ref_file_abs_path) # Synchronization Methods - def _release_object_locked_pids(self, pid): + def _release_object_locked_pids(self, pid: str) -> None: """Remove the given persistent identifier from 'object_locked_pids' and notify other waiting threads or processes. @@ -2742,7 +2781,7 @@ def _release_object_locked_pids(self, pid): self.object_locked_pids_th.remove(pid) self.object_pid_condition_th.notify() - def _synchronize_object_locked_cids(self, cid): + def _synchronize_object_locked_cids(self, cid: str) -> None: """Multiple threads may access a data object via its 'cid' or the respective 'cid reference file' (which contains a list of 'pid's that reference a 'cid') and this needs to be coordinated. @@ -2776,7 +2815,7 @@ def _synchronize_object_locked_cids(self, cid): + f" cid: {cid}" ) - def _check_object_locked_cids(self, cid): + def _check_object_locked_cids(self, cid: str) -> None: """Check that a given content identifier is currently locked (found in the 'object_locked_cids' array). If it is not, an exception will be thrown. @@ -2793,7 +2832,7 @@ def _check_object_locked_cids(self, cid): logging.error(err_msg) raise IdentifierNotLocked(err_msg) - def _release_object_locked_cids(self, cid): + def _release_object_locked_cids(self, cid: str) -> None: """Remove the given content identifier from 'object_locked_cids' and notify other waiting threads or processes. 
@@ -2818,7 +2857,7 @@ def _release_object_locked_cids(self, cid): ) logging.debug(end_sync_debug_msg) - def _synchronize_referenced_locked_pids(self, pid): + def _synchronize_referenced_locked_pids(self, pid: str) -> None: """Multiple threads may interact with a pid (to tag, untag, delete) and these actions must be coordinated to prevent unexpected behaviour/race conditions that cause chaos. @@ -2851,7 +2890,7 @@ def _synchronize_referenced_locked_pids(self, pid): + f" for pid: {pid}" ) - def _check_reference_locked_pids(self, pid): + def _check_reference_locked_pids(self, pid: str) -> None: """Check that a given persistent identifier is currently locked (found in the 'reference_locked_pids' array). If it is not, an exception will be thrown. @@ -2868,7 +2907,7 @@ def _check_reference_locked_pids(self, pid): logging.error(err_msg) raise IdentifierNotLocked(err_msg) - def _release_reference_locked_pids(self, pid): + def _release_reference_locked_pids(self, pid: str) -> None: """Remove the given persistent identifier from 'reference_locked_pids' and notify other waiting threads or processes. @@ -2896,7 +2935,7 @@ def _release_reference_locked_pids(self, pid): # Other Static Methods @staticmethod - def _read_small_file_content(path_to_file): + def _read_small_file_content(path_to_file: Path): """Read the contents of a file with the given path. This method is not optimized for large files - so it should only be used for small files (like reference files). @@ -2910,10 +2949,10 @@ def _read_small_file_content(path_to_file): return content @staticmethod - def _rename_path_for_deletion(path): + def _rename_path_for_deletion(path: Union[Path, str]) -> str: """Rename a given path by appending '_delete' and move it to the renamed path. 
- :param string path: Path to file to rename + :param Path path: Path to file to rename :return: Path to the renamed file :rtype: str @@ -2922,10 +2961,11 @@ def _rename_path_for_deletion(path): path = Path(path) delete_path = path.with_name(path.stem + "_delete" + path.suffix) shutil.move(path, delete_path) - return delete_path + # TODO: Adjust all code for constructing paths to use path and revise accordingly + return str(delete_path) @staticmethod - def _get_file_paths(directory): + def _get_file_paths(directory: Union[str, Path]) -> Optional[List[Path]]: """Get the file paths of a given directory if it exists :param mixed directory: String or path to directory. @@ -2945,7 +2985,7 @@ def _get_file_paths(directory): return None @staticmethod - def _check_arg_data(data): + def _check_arg_data(data: Union[str, os.PathLike, io.BufferedReader]) -> bool: """Checks a data argument to ensure that it is either a string, path, or stream object. @@ -2976,7 +3016,7 @@ def _check_arg_data(data): return True @staticmethod - def _check_integer(file_size): + def _check_integer(file_size: int) -> None: """Check whether a given argument is an integer and greater than 0; throw an exception if not. @@ -2998,7 +3038,7 @@ def _check_integer(file_size): raise ValueError(exception_string) @staticmethod - def _check_string(string, arg): + def _check_string(string: str, arg: str) -> None: """Check whether a string is None or empty - or if it contains an illegal character; throws an exception if so. @@ -3015,7 +3055,7 @@ def _check_string(string, arg): raise ValueError(exception_string) @staticmethod - def _cast_to_bytes(text): + def _cast_to_bytes(text: any) -> bytes: """Convert text to a sequence of bytes using utf-8 encoding. :param Any text: String to convert. @@ -3027,7 +3067,7 @@ def _cast_to_bytes(text): return text -class Stream(object): +class Stream: """Common interface for file-like objects. The input `obj` can be a file-like object or a path to a file. 
If `obj` is @@ -3041,7 +3081,7 @@ class Stream(object): set its position back to ``0``. """ - def __init__(self, obj): + def __init__(self, obj: Union[IO[bytes], str]): if hasattr(obj, "read"): pos = obj.tell() elif os.path.isfile(obj): @@ -3099,7 +3139,7 @@ class ObjectMetadata: :param str pid: An authority-based or persistent identifier :param str cid: A unique identifier for the object (Hash ID, hex digest). :param int obj_size: The size of the object in bytes. - :param list hex_digests: A list of hex digests to validate objects + :param dict hex_digests: A list of hex digests to validate objects (md5, sha1, sha256, sha384, sha512) (optional). """ diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index 4c150b7f..ead33e2c 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -1160,9 +1160,10 @@ def test_remove_pid_and_handle_cid_refs_deletion_cid_refs_empty(store): cid_refs_path = store._get_hashstore_cid_refs_path(cid) store._remove_pid_and_handle_cid_refs_deletion(pid, list_to_check, cid_refs_path) + delete_path = cid_refs_path.with_name(cid_refs_path.name + "_delete") assert not os.path.exists(cid_refs_path) - assert os.path.exists(cid_refs_path + "_delete") + assert os.path.exists(delete_path) assert len(list_to_check) == 1 @@ -1836,7 +1837,7 @@ def test_get_hashstore_metadata_path_relative_path(pids, store): store.metadata + "/" + rel_path + "/" + metadata_document_name ) - assert calculated_metadata_path == metadata_resolved_path + assert Path(calculated_metadata_path) == metadata_resolved_path def test_get_hashstore_pid_refs_path(pids, store): @@ -1852,7 +1853,7 @@ def test_get_hashstore_pid_refs_path(pids, store): store.pids + "/" + "/".join(store._shard(pid_refs_metadata_hashid)) ) - assert resolved_pid_ref_abs_path == calculated_pid_ref_path + assert resolved_pid_ref_abs_path == Path(calculated_pid_ref_path) def test_get_hashstore_cid_refs_path(pids, 
store): @@ -1866,7 +1867,7 @@ def test_get_hashstore_cid_refs_path(pids, store): resolved_cid_ref_abs_path = store._get_hashstore_cid_refs_path(cid) calculated_cid_ref_path = store.cids + "/" + "/".join(store._shard(cid)) - assert resolved_cid_ref_abs_path == calculated_cid_ref_path + assert resolved_cid_ref_abs_path == Path(calculated_cid_ref_path) def test_check_string(store): diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 791d9d82..0f93a64a 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -1104,7 +1104,7 @@ def test_store_metadata_metadata_path(pids, store): _object_metadata = store.store_object(pid, path) stored_metadata_path = store.store_metadata(pid, syspath, format_id) metadata_path = store._get_hashstore_metadata_path(stored_metadata_path) - assert stored_metadata_path == metadata_path + assert Path(stored_metadata_path) == metadata_path def test_store_metadata_thread_lock(store): From b9a563585b19ca279ac9bcdf6a14d7b0bce6cb7c Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 24 Sep 2024 15:11:54 -0700 Subject: [PATCH 392/420] Clean-up creation of paths in 'filehashstore' init process and revise affected code and pytests --- src/hashstore/filehashstore.py | 40 +++++++++---------- tests/filehashstore/test_filehashstore.py | 30 +++++++------- .../test_filehashstore_interface.py | 24 ++++++----- tests/test_hashstore_client.py | 14 +++---- 4 files changed, 53 insertions(+), 55 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 357e8774..1f6b5e4f 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -143,12 +143,12 @@ def __init__(self, properties=None): ] # Check to see if a configuration is present in the given store path - self.hashstore_configuration_yaml = prop_store_path + "/hashstore.yaml" + self.hashstore_configuration_yaml = 
Path(prop_store_path) / "hashstore.yaml" self._verify_hashstore_properties(properties, prop_store_path) # If no exceptions thrown, FileHashStore ready for initialization logging.debug("FileHashStore - Initializing, properties verified.") - self.root = prop_store_path + self.root = Path(prop_store_path) self.depth = prop_store_depth self.width = prop_store_width self.sysmeta_ns = prop_store_metadata_namespace @@ -163,19 +163,19 @@ def __init__(self, properties=None): # Default algorithm list for FileHashStore based on config file written self._set_default_algorithms() # Complete initialization/instantiation by setting and creating store directories - self.objects = self.root + "/objects" - self.metadata = self.root + "/metadata" - self.refs = self.root + "/refs" - self.cids = self.refs + "/cids" - self.pids = self.refs + "/pids" + self.objects = self.root / "objects" + self.metadata = self.root / "metadata" + self.refs = self.root / "refs" + self.cids = self.refs / "cids" + self.pids = self.refs / "pids" if not os.path.exists(self.objects): - self._create_path(self.objects + "/tmp") + self._create_path(self.objects / "tmp") if not os.path.exists(self.metadata): - self._create_path(self.metadata + "/tmp") + self._create_path(self.metadata / "tmp") if not os.path.exists(self.refs): - self._create_path(self.refs + "/tmp") - self._create_path(self.refs + "/pids") - self._create_path(self.refs + "/cids") + self._create_path(self.refs / "tmp") + self._create_path(self.refs / "pids") + self._create_path(self.refs / "cids") logging.debug( "FileHashStore - Initialization success. Store root: %s", self.root ) @@ -192,7 +192,7 @@ def __init__(self, properties=None): @staticmethod def _load_properties( - hashstore_yaml_path: str, hashstore_required_prop_keys: List[str] + hashstore_yaml_path: Path, hashstore_required_prop_keys: List[str] ) -> Dict[str, Union[str, int]]: """Get and return the contents of the current HashStore configuration. 
@@ -1089,9 +1089,7 @@ def delete_metadata(self, pid: str, format_id: Optional[str] = None) -> None: logging.debug(sync_begin_debug_msg) self.metadata_locked_docs_th.append(pid_doc) try: - full_path_without_directory = ( - self.metadata + "/" + rel_path + "/" + pid_doc - ) + full_path_without_directory = Path(self.metadata / rel_path / pid_doc) self._delete("metadata", full_path_without_directory) info_string = ( "FileHashStore - delete_metadata: Successfully deleted metadata for pid:" @@ -1907,7 +1905,7 @@ def _put_metadata( "FileHashStore - _put_metadata: Deleting metadata for pid: %s", pid, ) - self.metadata.delete(metadata_tmp) + self._delete("metadata", metadata_tmp) raise else: exception_string = ( @@ -2541,7 +2539,7 @@ def _count(self, entity: str) -> int: elif entity == "cid": directory_to_count = self.cids elif entity == "tmp": - directory_to_count = self.objects + "tmp" + directory_to_count = self.objects / "tmp" else: raise ValueError( f"entity: {entity} does not exist. Do you mean 'objects' or 'metadata'?" @@ -2713,8 +2711,8 @@ def _get_hashstore_data_object_path(self, cid_or_relative_path: str) -> Path: def _get_hashstore_metadata_path(self, metadata_relative_path: str) -> Path: """Return the expected metadata path to a hashstore metadata object that exists. 
- :param str metadata_relative_path: Metadata path to check or relative path in - '/metadata' to check + :param str metadata_relative_path: Metadata path to check or relative path in '/metadata' + to check :return: Path to the data object referenced by the pid :rtype: Path @@ -2731,7 +2729,7 @@ def _get_hashstore_metadata_path(self, metadata_relative_path: str) -> Path: raise FileNotFoundError( "FileHashStore - _get_hashstore_metadata_path: could not locate a" + "metadata object in '/metadata' for the supplied metadata_relative_path: " - + metadata_relative_path + + str(metadata_relative_path) ) def _get_hashstore_pid_refs_path(self, pid: str) -> Path: diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index ead33e2c..4c27f41d 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -32,13 +32,13 @@ def test_init_directories_created(store): """Confirm that object and metadata directories have been created.""" assert os.path.exists(store.root) assert os.path.exists(store.objects) - assert os.path.exists(store.objects + "/tmp") + assert os.path.exists(store.objects / "tmp") assert os.path.exists(store.metadata) - assert os.path.exists(store.metadata + "/tmp") + assert os.path.exists(store.metadata / "tmp") assert os.path.exists(store.refs) - assert os.path.exists(store.refs + "/tmp") - assert os.path.exists(store.refs + "/pids") - assert os.path.exists(store.refs + "/cids") + assert os.path.exists(store.refs / "tmp") + assert os.path.exists(store.refs / "pids") + assert os.path.exists(store.refs / "cids") def test_init_existing_store_incorrect_algorithm_format(store): @@ -46,7 +46,7 @@ def test_init_existing_store_incorrect_algorithm_format(store): the string must exactly match the expected format). 
DataONE uses the library of congress vocabulary to standardize algorithm types.""" properties = { - "store_path": store.root + "/incorrect_algo_format", + "store_path": store.root / "incorrect_algo_format", "store_depth": 3, "store_width": 2, "store_algorithm": "sha256", @@ -722,7 +722,7 @@ def test_write_to_tmp_file_and_get_hex_digests_with_unsupported_algorithm(pids, def test_mktmpfile(store): """Test that _mktmpfile creates and returns a tmp file.""" - path = store.root + "/doutest/tmp/" + path = store.root / "doutest" / "tmp" store._create_path(path) tmp = store._mktmpfile(path) assert os.path.exists(tmp.name) @@ -1383,7 +1383,7 @@ def test_verify_object_information_incorrect_size_with_pid(pids, store): checksum_algorithm = store.algorithm expected_file_size = object_metadata.obj_size - objects_tmp_folder = store.objects + "/tmp" + objects_tmp_folder = store.objects / "tmp" tmp_file = store._mktmpfile(objects_tmp_folder) assert os.path.isfile(tmp_file.name) with pytest.raises(NonMatchingObjSize): @@ -1758,7 +1758,7 @@ def test_create_path(pids, store): for pid in pids: root_directory = store.root pid_hex_digest_directory = pids[pid]["metadata_cid"][:2] - pid_directory = root_directory + pid_hex_digest_directory + pid_directory = root_directory / pid_hex_digest_directory store._create_path(pid_directory) assert os.path.isdir(pid_directory) @@ -1828,14 +1828,12 @@ def test_get_hashstore_metadata_path_relative_path(pids, store): metadata_directory = store._computehash(pid) metadata_document_name = store._computehash(pid + format_id) rel_path = "/".join(store._shard(metadata_directory)) - full_path_without_dir = rel_path + "/" + metadata_document_name + full_path_without_dir = Path(rel_path) / metadata_document_name metadata_resolved_path = store._get_hashstore_metadata_path( full_path_without_dir ) - calculated_metadata_path = ( - store.metadata + "/" + rel_path + "/" + metadata_document_name - ) + calculated_metadata_path = store.metadata / rel_path / 
metadata_document_name assert Path(calculated_metadata_path) == metadata_resolved_path @@ -1849,8 +1847,8 @@ def test_get_hashstore_pid_refs_path(pids, store): resolved_pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid) pid_refs_metadata_hashid = store._computehash(pid) - calculated_pid_ref_path = ( - store.pids + "/" + "/".join(store._shard(pid_refs_metadata_hashid)) + calculated_pid_ref_path = store.pids / Path( + "/".join(store._shard(pid_refs_metadata_hashid)) ) assert resolved_pid_ref_abs_path == Path(calculated_pid_ref_path) @@ -1865,7 +1863,7 @@ def test_get_hashstore_cid_refs_path(pids, store): cid = object_metadata.cid resolved_cid_ref_abs_path = store._get_hashstore_cid_refs_path(cid) - calculated_cid_ref_path = store.cids + "/" + "/".join(store._shard(cid)) + calculated_cid_ref_path = store.cids / Path("/".join(store._shard(cid))) assert resolved_cid_ref_abs_path == Path(calculated_cid_ref_path) diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 0f93a64a..772dffc8 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -650,9 +650,9 @@ def get_number_of_files(folder_path): file_count += len(files) return file_count - assert get_number_of_files(store.refs + "/pids") == 6 - assert get_number_of_files(store.refs + "/cids") == 1 - assert folder_has_files(store.refs + "/tmp") is False + assert get_number_of_files(store.refs / "pids") == 6 + assert get_number_of_files(store.refs / "cids") == 1 + assert folder_has_files(store.refs / "tmp") is False @slow_test @@ -1491,22 +1491,22 @@ def test_store_and_delete_objects_100_pids_1_cid(store): deletes all related files""" test_dir = "tests/testdata/" path = test_dir + "jtao.1700.1" + refs_pids_path = store.root / "refs" / "pids" + refs_cids_path = store.root / "refs" / "cids" # Store upper_limit = 101 for i in range(1, upper_limit): pid_modified = 
f"dou.test.{str(i)}" store.store_object(pid_modified, path) - assert ( - sum([len(files) for _, _, files in os.walk(store.root + "/refs/pids")]) == 100 - ) - assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/cids")]) == 1 + assert sum([len(files) for _, _, files in os.walk(refs_pids_path)]) == 100 + assert sum([len(files) for _, _, files in os.walk(refs_cids_path)]) == 1 assert store._count("objects") == 1 # Delete for i in range(1, upper_limit): pid_modified = f"dou.test.{str(i)}" store.delete_object(pid_modified) - assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/pids")]) == 0 - assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/cids")]) == 0 + assert sum([len(files) for _, _, files in os.walk(refs_pids_path)]) == 0 + assert sum([len(files) for _, _, files in os.walk(refs_cids_path)]) == 0 assert store._count("objects") == 0 @@ -1557,6 +1557,8 @@ def delete_object_wrapper(pid_var): thread5.join() thread6.join() - assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/pid")]) == 0 - assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/cid")]) == 0 + refs_pids_path = store.root / "refs" / "pids" + refs_cids_path = store.root / "refs" / "cids" + assert sum([len(files) for _, _, files in os.walk(refs_pids_path)]) == 0 + assert sum([len(files) for _, _, files in os.walk(refs_cids_path)]) == 0 assert store._count("objects") == 0 diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 7e13f37a..f1e8f159 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -53,7 +53,7 @@ def test_get_checksum(capsys, store, pids): store.store_object(pid, path) client_module_path = f"{client_directory}/client.py" - test_store = store.root + test_store = str(store.root) get_checksum_opt = "-getchecksum" client_pid_arg = f"-pid={pid}" algo_arg = f"-algo={store.algorithm}" @@ -86,7 +86,7 @@ def test_store_object(store, pids): test_dir = 
"tests/testdata/" for pid in pids.keys(): client_module_path = f"{client_directory}/client.py" - test_store = store.root + test_store = str(store.root) store_object_opt = "-storeobject" client_pid_arg = f"-pid={pid}" path = f'-path={test_dir + pid.replace("/", "_")}' @@ -117,7 +117,7 @@ def test_store_metadata(capsys, store, pids): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename client_module_path = f"{client_directory}/client.py" - test_store = store.root + test_store = str(store.root) store_metadata_opt = "-storemetadata" client_pid_arg = f"-pid={pid}" path = f"-path={syspath}" @@ -159,7 +159,7 @@ def test_retrieve_objects(capsys, pids, store): store.store_object(pid, path) client_module_path = f"{client_directory}/client.py" - test_store = store.root + test_store = str(store.root) delete_object_opt = "-retrieveobject" client_pid_arg = f"-pid={pid}" chs_args = [ @@ -199,7 +199,7 @@ def test_retrieve_metadata(capsys, pids, store): _metadata_cid = store.store_metadata(pid, syspath, namespace) client_module_path = f"{client_directory}/client.py" - test_store = store.root + test_store = str(store.root) retrieve_metadata_opt = "-retrievemetadata" client_pid_arg = f"-pid={pid}" format_id = f"-formatid={namespace}" @@ -239,7 +239,7 @@ def test_delete_objects(pids, store): store.store_object(pid, path) client_module_path = f"{client_directory}/client.py" - test_store = store.root + test_store = str(store.root) delete_object_opt = "-deleteobject" client_pid_arg = f"-pid={pid}" chs_args = [ @@ -269,7 +269,7 @@ def test_delete_metadata(pids, store): _metadata_cid = store.store_metadata(pid, syspath, namespace) client_module_path = f"{client_directory}/client.py" - test_store = store.root + test_store = str(store.root) delete_metadata_opt = "-deletemetadata" client_pid_arg = f"-pid={pid}" format_id = f"-formatid={namespace}" From ff29ec1688d2e5a75648abf88a27820c0b3b1376 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 24 Sep 2024 15:36:25 -0700 
Subject: [PATCH 393/420] Clean-up docstrings for '_load_properties' and '_write_properties' in filehashstore --- src/hashstore/filehashstore.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 1f6b5e4f..02d2a550 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -197,10 +197,10 @@ def _load_properties( """Get and return the contents of the current HashStore configuration. :return: HashStore properties with the following keys (and values): - - ``store_depth`` (int): Depth when sharding an object's hex digest. - - ``store_width`` (int): Width of directories when sharding an object's hex digest. - - ``store_algorithm`` (str): Hash algo used for calculating the object's hex digest. - - ``store_metadata_namespace`` (str): Namespace for the HashStore's system metadata. + - store_depth (int): Depth when sharding an object's hex digest. + - store_width (int): Width of directories when sharding an object's hex digest. + - store_algorithm (str): Hash algo used for calculating the object's hex digest. + - store_metadata_namespace (str): Namespace for the HashStore's system metadata. :rtype: dict """ if not os.path.exists(hashstore_yaml_path): @@ -229,12 +229,11 @@ def _write_properties(self, properties: Dict[str, Union[str, int]]) -> None: """Writes 'hashstore.yaml' to FileHashStore's root directory with the respective properties object supplied. - :param properties: A Python dictionary with the following keys (and values): - - ``store_depth`` (int): Depth when sharding an object's hex digest. - - ``store_width`` (int): Width of directories when sharding an object's hex digest. - - ``store_algorithm`` (str): Hash algo used for calculating the object's hex digest. - - ``store_metadata_namespace`` (str): Namespace for the HashStore's system metadata. 
- :type properties: dict + :param dict properties: A Python dictionary with the following keys (and values): + - store_depth (int): Depth when sharding an object's hex digest. + - store_width (int): Width of directories when sharding an object's hex digest. + - store_algorithm (str): Hash algo used for calculating the object's hex digest. + - store_metadata_namespace (str): Namespace for the HashStore's system metadata. """ # If hashstore.yaml already exists, must throw exception and proceed with caution if os.path.exists(self.hashstore_configuration_yaml): From d3a82a9440a172f399236e87cee63fa61c6260e1 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 24 Sep 2024 16:09:25 -0700 Subject: [PATCH 394/420] Refactor code to replace string concatenation for creating paths with using 'Path' directly and iterable/argument unpacking * --- src/hashstore/filehashstore.py | 17 +++++++++-------- tests/filehashstore/test_filehashstore.py | 15 ++++++--------- .../test_filehashstore_interface.py | 6 +++--- tests/test_hashstore_client.py | 2 +- 4 files changed, 19 insertions(+), 21 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 02d2a550..68bcbefc 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -678,8 +678,8 @@ def store_metadata( ) # Validate input parameters self._check_string(pid, "pid") - checked_format_id = self._check_arg_format_id(format_id, "store_metadata") self._check_arg_data(metadata) + checked_format_id = self._check_arg_format_id(format_id, "store_metadata") pid_doc = self._computehash(pid + checked_format_id) sync_begin_debug_msg = ( @@ -775,12 +775,13 @@ def retrieve_metadata(self, pid: str, format_id: Optional[str] = None) -> IO[byt metadata_document_name = self._computehash(pid + self.sysmeta_ns) else: metadata_document_name = self._computehash(pid + checked_format_id) - rel_path = "/".join(self._shard(metadata_directory)) - metadata_rel_path = rel_path + "/" + 
metadata_document_name - metadata_exists = self._exists(entity, metadata_rel_path) + metadata_rel_path = ( + Path(*self._shard(metadata_directory)) / metadata_document_name + ) + metadata_exists = self._exists(entity, str(metadata_rel_path)) if metadata_exists: - metadata_stream = self._open(entity, metadata_rel_path) + metadata_stream = self._open(entity, str(metadata_rel_path)) logging.info( "FileHashStore - retrieve_metadata: Retrieved metadata for pid: %s", pid ) @@ -993,7 +994,7 @@ def delete_metadata(self, pid: str, format_id: Optional[str] = None) -> None: self._check_string(pid, "pid") checked_format_id = self._check_arg_format_id(format_id, "delete_metadata") metadata_directory = self._computehash(pid) - rel_path = "/".join(self._shard(metadata_directory)) + rel_path = Path(*self._shard(metadata_directory)) if format_id is None: # Delete all metadata documents @@ -1183,7 +1184,7 @@ def _find_object(self, pid: str) -> Dict[str, str]: else: sysmeta_doc_name = self._computehash(pid + self.sysmeta_ns) metadata_directory = self._computehash(pid) - metadata_rel_path = "/".join(self._shard(metadata_directory)) + metadata_rel_path = Path(*self._shard(metadata_directory)) sysmeta_full_path = ( self._get_store_path("metadata") / metadata_rel_path @@ -1878,7 +1879,7 @@ def _put_metadata( # Get target and related paths (permanent location) metadata_directory = self._computehash(pid) metadata_document_name = metadata_doc_name - rel_path = "/".join(self._shard(metadata_directory)) + rel_path = Path(*self._shard(metadata_directory)) full_path = self._get_store_path("metadata") / rel_path / metadata_document_name # Move metadata to target path diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index 4c27f41d..6d2f7825 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -1059,7 +1059,7 @@ def test_put_metadata_stored_path(pids, store): # Manually calculate expected path 
metadata_directory = store._computehash(pid) - rel_path = "/".join(store._shard(metadata_directory)) + rel_path = Path(*store._shard(metadata_directory)) full_path = ( store._get_store_path("metadata") / rel_path / metadata_document_name ) @@ -1667,8 +1667,7 @@ def test_exists_object_with_sharded_path(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store._store_and_validate_data(pid, path) - object_metadata_shard = store._shard(object_metadata.cid) - object_metadata_shard_path = "/".join(object_metadata_shard) + object_metadata_shard_path = os.path.join(*store._shard(object_metadata.cid)) assert store._exists(entity, object_metadata_shard_path) @@ -1728,9 +1727,7 @@ def test_private_delete_metadata(pids, store): # Manually calculate expected path metadata_directory = store._computehash(pid) metadata_document_name = store._computehash(pid + format_id) - rel_path = ( - Path("/".join(store._shard(metadata_directory))) / metadata_document_name - ) + rel_path = Path(*store._shard(metadata_directory)) / metadata_document_name store._delete("metadata", rel_path) @@ -1827,7 +1824,7 @@ def test_get_hashstore_metadata_path_relative_path(pids, store): metadata_directory = store._computehash(pid) metadata_document_name = store._computehash(pid + format_id) - rel_path = "/".join(store._shard(metadata_directory)) + rel_path = Path(*store._shard(metadata_directory)) full_path_without_dir = Path(rel_path) / metadata_document_name metadata_resolved_path = store._get_hashstore_metadata_path( @@ -1848,7 +1845,7 @@ def test_get_hashstore_pid_refs_path(pids, store): resolved_pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid) pid_refs_metadata_hashid = store._computehash(pid) calculated_pid_ref_path = store.pids / Path( - "/".join(store._shard(pid_refs_metadata_hashid)) + *store._shard(pid_refs_metadata_hashid) ) assert resolved_pid_ref_abs_path == Path(calculated_pid_ref_path) @@ -1863,7 +1860,7 @@ def 
test_get_hashstore_cid_refs_path(pids, store): cid = object_metadata.cid resolved_cid_ref_abs_path = store._get_hashstore_cid_refs_path(cid) - calculated_cid_ref_path = store.cids / Path("/".join(store._shard(cid))) + calculated_cid_ref_path = store.cids / Path(*store._shard(cid)) assert resolved_cid_ref_abs_path == Path(calculated_cid_ref_path) diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 772dffc8..381e1035 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -960,7 +960,7 @@ def test_store_metadata(pids, store): # Manually calculate expected path metadata_directory = store._computehash(pid) metadata_document_name = store._computehash(pid + format_id) - rel_path = "/".join(store._shard(metadata_directory)) + rel_path = Path(*store._shard(metadata_directory)) full_path = ( store._get_store_path("metadata") / rel_path / metadata_document_name ) @@ -976,7 +976,7 @@ def test_store_metadata_one_pid_multiple_docs_correct_location(store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename metadata_directory = store._computehash(pid) - rel_path = "/".join(store._shard(metadata_directory)) + rel_path = Path(*store._shard(metadata_directory)) format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" format_id3 = "http://ns.dataone.org/service/types/v3.0" format_id4 = "http://ns.dataone.org/service/types/v4.0" @@ -1008,7 +1008,7 @@ def test_store_metadata_default_format_id(pids, store): # Manually calculate expected path metadata_directory = store._computehash(pid) metadata_document_name = store._computehash(pid + format_id) - rel_path = "/".join(store._shard(metadata_directory)) + rel_path = Path(*store._shard(metadata_directory)) full_path = ( store._get_store_path("metadata") / rel_path / metadata_document_name ) diff --git a/tests/test_hashstore_client.py 
b/tests/test_hashstore_client.py index f1e8f159..ba0bd566 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -139,7 +139,7 @@ def test_store_metadata(capsys, store, pids): metadata_directory = store._computehash(pid) metadata_document_name = store._computehash(pid + namespace) - rel_path = "/".join(store._shard(metadata_directory)) + rel_path = Path(*store._shard(metadata_directory)) full_path = ( store._get_store_path("metadata") / rel_path / metadata_document_name ) From dccb2603725e0a934448aa0c9d90d7b65419b201 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 24 Sep 2024 16:16:31 -0700 Subject: [PATCH 395/420] Clean-up usage of string casting in 'delete_object' --- src/hashstore/filehashstore.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 68bcbefc..11162b69 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -931,7 +931,7 @@ def delete_object(self, pid: str) -> None: return except OrphanPidRefsFileFound: # Delete pid refs file - pid_ref_abs_path = str(self._get_hashstore_pid_refs_path(pid)) + pid_ref_abs_path = self._get_hashstore_pid_refs_path(pid) objects_to_delete.append( self._rename_path_for_deletion(pid_ref_abs_path) ) @@ -948,10 +948,10 @@ def delete_object(self, pid: str) -> None: ) # Remove pid from cid refs file pid_refs_cid = self._read_small_file_content(pid_ref_abs_path) - cid_ref_abs_str = str(self._get_hashstore_cid_refs_path(pid_refs_cid)) + cid_ref_abs_path = self._get_hashstore_cid_refs_path(pid_refs_cid) # Remove if the pid refs is found - if self._is_string_in_refs_file(pid, Path(cid_ref_abs_str)): - self._update_refs_file(Path(cid_ref_abs_str), pid, "remove") + if self._is_string_in_refs_file(pid, cid_ref_abs_path): + self._update_refs_file(cid_ref_abs_path, pid, "remove") # Remove metadata files if they exist self.delete_metadata(pid) # Remove all files confirmed for deletion 
@@ -959,7 +959,7 @@ def delete_object(self, pid: str) -> None: return except PidNotFoundInCidRefsFile: # Add pid refs file to be permanently deleted - pid_ref_abs_path = str(self._get_hashstore_pid_refs_path(pid)) + pid_ref_abs_path = self._get_hashstore_pid_refs_path(pid) objects_to_delete.append( self._rename_path_for_deletion(pid_ref_abs_path) ) From 91e30990ec4ec0cbb305b56f3a1c1e22c0404ee0 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 24 Sep 2024 16:20:31 -0700 Subject: [PATCH 396/420] Remove redundant usage of 'rtype' in docstrings since we have added type hints --- src/hashstore/filehashstore.py | 29 +---------------------------- 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 11162b69..f500cbe0 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -201,7 +201,6 @@ def _load_properties( - store_width (int): Width of directories when sharding an object's hex digest. - store_algorithm (str): Hash algo used for calculating the object's hex digest. - store_metadata_namespace (str): Namespace for the HashStore's system metadata. - :rtype: dict """ if not os.path.exists(hashstore_yaml_path): exception_string = ( @@ -305,7 +304,6 @@ def _build_hashstore_yaml_string( :param str store_metadata_namespace: Namespace for the HashStore's system metadata. :return: A YAML string representing the configuration for a HashStore. - :rtype: str """ hashstore_configuration_yaml = f""" # Default configuration variables for HashStore @@ -411,7 +409,6 @@ def _validate_properties( :raises ValueError: If value is missing for a required key. :return: The given properties object (that has been validated). - :rtype: dict """ if not isinstance(properties, dict): exception_string = ( @@ -1150,7 +1147,7 @@ def _find_object(self, pid: str) -> Dict[str, str]: :param str pid: Authority-based or persistent identifier of the object. 
- :return: obj_info_dict (dict): + :return: obj_info_dict: - cid: content identifier - cid_object_path: path to the object - cid_refs_path: path to the cid refs file @@ -1866,7 +1863,6 @@ def _put_metadata( :param str metadata_doc_name: Metadata document name :return: Address of the metadata document. - :rtype: Path """ logging.debug( "FileHashStore - _put_metadata: Request to put metadata for pid: %s", pid @@ -1921,7 +1917,6 @@ def _mktmpmetadata(self, stream: "Stream") -> str: :param Stream stream: Metadata stream. :return: Path/name of temporary file created and written into. - :rtype: str """ # Create temporary file in .../{store_path}/tmp tmp_root_path = self._get_store_path("metadata") / "tmp" @@ -2035,7 +2030,6 @@ def _write_refs_file(self, path: Path, ref_id: str, ref_type: str) -> str: :param str ref_type: 'cid' or 'pid' :return: tmp_file_path - Path to the tmp refs file - :rtype: string """ logging.debug( "FileHashStore - _write_refs_file: Writing id (%s) into a tmp file in: %s", @@ -2125,7 +2119,6 @@ def _is_string_in_refs_file(ref_id: str, refs_file_path: Path) -> bool: :param path refs_file_path: Path to the refs file :return: pid_found - :rtype: boolean """ with open(refs_file_path, "r", encoding="utf8") as ref_file: # Confirm that pid is not currently already tagged @@ -2359,7 +2352,6 @@ def _check_arg_algorithms_and_checksum( :return: Hashlib-compatible string or 'None' for additional_algorithm and checksum_algorithm. - :rtype: str """ additional_algorithm_checked = None if additional_algorithm != self.algorithm and additional_algorithm is not None: @@ -2382,7 +2374,6 @@ def _check_arg_format_id(self, format_id: str, method: str) -> str: :param str method: Calling method for logging purposes. :return: Valid metadata namespace. - :rtype: str """ if format_id and not format_id.strip(): exception_string = f"FileHashStore - {method}: Format_id cannot be empty." 
@@ -2404,7 +2395,6 @@ def _refine_algorithm_list( :param str checksum_algorithm: Checksum algorithm. :return: De-duplicated list of hash algorithms. - :rtype: set """ algorithm_list_to_calculate = self.default_algo_list if checksum_algorithm is not None: @@ -2437,7 +2427,6 @@ def _clean_algorithm(self, algorithm_string: str) -> str: :param str algorithm_string: Algorithm to validate. :return: `hashlib` supported algorithm string. - :rtype: str """ count = 0 for char in algorithm_string: @@ -2471,7 +2460,6 @@ def _computehash( :param str algorithm: Algorithm of hex digest to generate. :return: Hex digest. - :rtype: str """ if algorithm is None: hashobj = hashlib.new(self.algorithm) @@ -2500,7 +2488,6 @@ def _shard(self, checksum: str) -> List[str]: :return: A list where each element is a token of fixed width, with any leftover characters as the last element. - :rtype: list """ def compact(items: List[Any]) -> List[Any]: @@ -2527,7 +2514,6 @@ def _count(self, entity: str) -> int: :param str entity: Desired entity type (ex. "objects", "metadata"). :return: Number of files in the directory. - :rtype: int """ count = 0 if entity == "objects": @@ -2557,7 +2543,6 @@ def _exists(self, entity: str, file: str) -> bool: :param str file: The name of the file to check. :return: True if the file exists. - :rtype: bool """ if entity == "objects": try: @@ -2581,7 +2566,6 @@ def _open( :param str mode: Mode to open file in. Defaults to 'rb'. :return: An `io` stream dependent on the `mode`. - :rtype: io.BufferedReader """ realpath = None if entity == "objects": @@ -2649,7 +2633,6 @@ def _get_store_path(self, entity: str) -> Path: Note, "cid" and "pid" are refs specific directories. :return: Path to requested store entity type - :rtype: Path """ if entity == "objects": return Path(self.objects) @@ -2672,7 +2655,6 @@ def _build_hashstore_data_object_path(self, hash_id: str) -> str: :param str hash_id: A hash ID to build a file path for. 
:return: An absolute file path for the specified hash ID. - :rtype: str """ paths = self._shard(hash_id) root_dir = self._get_store_path("objects") @@ -2685,7 +2667,6 @@ def _get_hashstore_data_object_path(self, cid_or_relative_path: str) -> Path: :param str cid_or_relative_path: Content identifier or relative path in '/objects' to check :return: Path to the data object referenced by the pid - :rtype: Path """ expected_abs_data_obj_path = self._build_hashstore_data_object_path( cid_or_relative_path @@ -2715,7 +2696,6 @@ def _get_hashstore_metadata_path(self, metadata_relative_path: str) -> Path: to check :return: Path to the data object referenced by the pid - :rtype: Path """ # Form the absolute path to the metadata file expected_abs_metadata_path = os.path.join(self.metadata, metadata_relative_path) @@ -2738,7 +2718,6 @@ def _get_hashstore_pid_refs_path(self, pid: str) -> Path: :param str pid: Persistent or authority-based identifier :return: Path to pid reference file - :rtype: Path """ # The pid refs file is named after the hash of the pid using the store's algorithm hash_id = self._computehash(pid, self.algorithm) @@ -2753,7 +2732,6 @@ def _get_hashstore_cid_refs_path(self, cid: str) -> Path: :param str cid: Content identifier :return: Path to cid reference file - :rtype: Path """ root_dir = self._get_store_path("cid") # The content identifier is to be split into directories as is supplied @@ -2940,7 +2918,6 @@ def _read_small_file_content(path_to_file: Path): :param path path_to_file: Path to the file to read :return: Content of the given file - :rtype: str """ with open(path_to_file, "r", encoding="utf8") as opened_path: content = opened_path.read() @@ -2953,7 +2930,6 @@ def _rename_path_for_deletion(path: Union[Path, str]) -> str: :param Path path: Path to file to rename :return: Path to the renamed file - :rtype: str """ if isinstance(path, str): path = Path(path) @@ -2971,7 +2947,6 @@ def _get_file_paths(directory: Union[str, Path]) -> 
Optional[List[Path]]: :raises FileNotFoundError: If the directory doesn't exist :return: file_paths - File paths of the given directory or None if directory doesn't exist - :rtype: List """ if os.path.exists(directory): files = os.listdir(directory) @@ -2991,7 +2966,6 @@ def _check_arg_data(data: Union[str, os.PathLike, io.BufferedReader]) -> bool: :type data: str, os.PathLike, io.BufferedReader :return: True if valid. - :rtype: bool """ if ( not isinstance(data, str) @@ -3058,7 +3032,6 @@ def _cast_to_bytes(text: any) -> bytes: :param Any text: String to convert. :return: Bytes with utf-8 encoding. - :rtype: bytes """ if not isinstance(text, bytes): text = bytes(text, "utf8") From b44662b4218bba5f0cefbb37f579f513813a7cd7 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 24 Sep 2024 16:25:41 -0700 Subject: [PATCH 397/420] Fix incorrect usage of 'os.path.exists' when checking for files by using 'os.path.isfile' --- src/hashstore/filehashstore.py | 44 +++++++++++++++++----------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index f500cbe0..43d4609e 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -153,7 +153,7 @@ def __init__(self, properties=None): self.width = prop_store_width self.sysmeta_ns = prop_store_metadata_namespace # Write 'hashstore.yaml' to store path - if not os.path.exists(self.hashstore_configuration_yaml): + if not os.path.isfile(self.hashstore_configuration_yaml): # pylint: disable=W1201 logging.debug( "FileHashStore - HashStore does not exist & configuration file not found." @@ -202,7 +202,7 @@ def _load_properties( - store_algorithm (str): Hash algo used for calculating the object's hex digest. - store_metadata_namespace (str): Namespace for the HashStore's system metadata. 
""" - if not os.path.exists(hashstore_yaml_path): + if not os.path.isfile(hashstore_yaml_path): exception_string = ( "FileHashStore - load_properties: hashstore.yaml not found" + " in store root path." @@ -235,7 +235,7 @@ def _write_properties(self, properties: Dict[str, Union[str, int]]) -> None: - store_metadata_namespace (str): Namespace for the HashStore's system metadata. """ # If hashstore.yaml already exists, must throw exception and proceed with caution - if os.path.exists(self.hashstore_configuration_yaml): + if os.path.isfile(self.hashstore_configuration_yaml): exception_string = ( "FileHashStore - write_properties: configuration file 'hashstore.yaml'" + " already exists." @@ -357,7 +357,7 @@ def _verify_hashstore_properties( :param dict properties: HashStore properties. :param str prop_store_path: Store path to check. """ - if os.path.exists(self.hashstore_configuration_yaml): + if os.path.isfile(self.hashstore_configuration_yaml): logging.debug( "FileHashStore - Config found (hashstore.yaml) at {%s}. Verifying properties.", self.hashstore_configuration_yaml, @@ -473,7 +473,7 @@ def lookup_algo(algo_to_translate): } return dataone_algo_translation[algo_to_translate] - if not os.path.exists(self.hashstore_configuration_yaml): + if not os.path.isfile(self.hashstore_configuration_yaml): exception_string = ( "FileHashStore - set_default_algorithms: hashstore.yaml not found" + " in store root path." 
@@ -1160,13 +1160,13 @@ def _find_object(self, pid: str) -> Dict[str, str]: self._check_string(pid, "pid") pid_ref_abs_path = self._get_hashstore_pid_refs_path(pid) - if os.path.exists(pid_ref_abs_path): + if os.path.isfile(pid_ref_abs_path): # Read the file to get the cid from the pid reference pid_refs_cid = self._read_small_file_content(pid_ref_abs_path) # Confirm that the cid reference file exists cid_ref_abs_path = self._get_hashstore_cid_refs_path(pid_refs_cid) - if os.path.exists(cid_ref_abs_path): + if os.path.isfile(cid_ref_abs_path): # Check that the pid is actually found in the cid reference file if self._is_string_in_refs_file(pid, cid_ref_abs_path): # Object must also exist in order to return the cid retrieved @@ -1196,7 +1196,7 @@ def _find_object(self, pid: str) -> Dict[str, str]: "pid_refs_path": pid_ref_abs_path, "sysmeta_path": ( sysmeta_full_path - if os.path.exists(sysmeta_full_path) + if os.path.isfile(sysmeta_full_path) else "Does not exist." ), } @@ -1482,7 +1482,7 @@ def _move_and_get_checksums( finally: # Ensure that the tmp file has been removed, the data object already exists, so it # is redundant. No exception is thrown so 'store_object' can proceed to tag object - if os.path.exists(tmp_file_name): + if os.path.isfile(tmp_file_name): self._delete("tmp", tmp_file_name) return object_cid, tmp_file_size, hex_digests @@ -1566,12 +1566,12 @@ def _write_to_tmp_file_and_get_hex_digests( + " Keyboard interruption by user." 
) logging.error(exception_string) - if os.path.exists(tmp.name): + if os.path.isfile(tmp.name): os.remove(tmp.name) finally: if not tmp_file_completion_flag: try: - if os.path.exists(tmp.name): + if os.path.isfile(tmp.name): os.remove(tmp.name) # pylint: disable=W0718 except Exception as err: @@ -1597,7 +1597,7 @@ def _mktmpfile(self, path: Path) -> IO[bytes]: # Delete tmp file if python interpreter crashes or thread is interrupted def delete_tmp_file(): - if os.path.exists(tmp.name): + if os.path.isfile(tmp.name): os.remove(tmp.name) atexit.register(delete_tmp_file) @@ -1631,7 +1631,7 @@ def _store_hashstore_refs_files(self, pid: str, cid: str) -> None: self._create_path(Path(os.path.dirname(pid_refs_path))) self._create_path(Path(os.path.dirname(cid_refs_path))) - if os.path.exists(pid_refs_path) and os.path.exists(cid_refs_path): + if os.path.isfile(pid_refs_path) and os.path.isfile(cid_refs_path): # If both reference files exist, we confirm that reference files are where they # are expected to be and throw an exception to inform the client that everything # is in place - and include other issues for context @@ -1654,7 +1654,7 @@ def _store_hashstore_refs_files(self, pid: str, cid: str) -> None: logging.error(rev_msg) raise HashStoreRefsAlreadyExists(err_msg) - elif os.path.exists(pid_refs_path) and not os.path.exists( + elif os.path.isfile(pid_refs_path) and not os.path.isfile( cid_refs_path ): # If pid refs exists, the pid has already been claimed and cannot be tagged we @@ -1666,7 +1666,7 @@ def _store_hashstore_refs_files(self, pid: str, cid: str) -> None: logging.error(error_msg) raise PidRefsAlreadyExistsError(error_msg) - elif not os.path.exists(pid_refs_path) and os.path.exists( + elif not os.path.isfile(pid_refs_path) and os.path.isfile( cid_refs_path ): debug_msg = ( @@ -1879,7 +1879,7 @@ def _put_metadata( full_path = self._get_store_path("metadata") / rel_path / metadata_document_name # Move metadata to target path - if os.path.exists(metadata_tmp): 
+ if os.path.isfile(metadata_tmp): try: parent = full_path.parent parent.mkdir(parents=True, exist_ok=True) @@ -1895,7 +1895,7 @@ def _put_metadata( f"FileHashStore - _put_metadata: Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) - if os.path.exists(metadata_tmp): + if os.path.isfile(metadata_tmp): # Remove tmp metadata, calling app must re-upload logging.debug( "FileHashStore - _put_metadata: Deleting metadata for pid: %s", @@ -2068,7 +2068,7 @@ def _update_refs_file( + f" at refs file: {refs_file_path}." ) logging.debug(debug_msg) - if not os.path.exists(refs_file_path): + if not os.path.isfile(refs_file_path): exception_string = ( f"FileHashStore - _update_refs_file: {refs_file_path} does not exist." + f" Cannot {update_type} ref_id: {ref_id}" @@ -2245,7 +2245,7 @@ def _verify_hashstore_references( cid_refs_path = self._get_hashstore_cid_refs_path(cid) # Check that reference files were created - if not os.path.exists(pid_refs_path): + if not os.path.isfile(pid_refs_path): exception_string = ( "FileHashStore - _verify_hashstore_references: Pid refs file missing: " + str(pid_refs_path) @@ -2253,7 +2253,7 @@ def _verify_hashstore_references( ) logging.error(exception_string) raise PidRefsFileNotFound(exception_string) - if not os.path.exists(cid_refs_path): + if not os.path.isfile(cid_refs_path): exception_string = ( "FileHashStore - _verify_hashstore_references: Cid refs file missing: " + str(cid_refs_path) @@ -2291,7 +2291,7 @@ def _delete_object_only(self, cid: str) -> None: """ cid_refs_abs_path = self._get_hashstore_cid_refs_path(cid) # If the refs file still exists, do not delete the object - if not os.path.exists(cid_refs_abs_path): + if not os.path.isfile(cid_refs_abs_path): sync_begin_debug_msg = ( f"FileHashStore - delete_object: Cid ({cid}) to locked list." 
) @@ -2598,7 +2598,7 @@ def _delete(self, entity: str, file: Union[str, Path]) -> None: except FileNotFoundError: # Swallow file not found exceptions for metadata realpath = None - elif os.path.exists(file): + elif os.path.isfile(file): # Check if the given path is an absolute path realpath = file else: From db5b8cd27466d62750db499d3446b59288318c00 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 24 Sep 2024 16:41:37 -0700 Subject: [PATCH 398/420] Rename variables to resolve some linting typo warnings --- src/hashstore/filehashstore.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 43d4609e..c1b062f7 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -65,8 +65,8 @@ class FileHashStore(HashStore): "store_metadata_namespace", ] # Permissions settings for writing files and creating directories - fmode = 0o664 - dmode = 0o755 + f_mode = 0o664 + d_mode = 0o755 # The other algorithm list consists of additional algorithms that can be included # for calculating when storing objects, in addition to the default list. other_algo_list = [ @@ -1603,12 +1603,12 @@ def delete_tmp_file(): atexit.register(delete_tmp_file) # Ensure tmp file is created with desired permissions - if self.fmode is not None: - oldmask = os.umask(0) + if self.f_mode is not None: + old_mask = os.umask(0) try: - os.chmod(tmp.name, self.fmode) + os.chmod(tmp.name, self.f_mode) finally: - os.umask(oldmask) + os.umask(old_mask) return tmp def _store_hashstore_refs_files(self, pid: str, cid: str) -> None: @@ -2462,13 +2462,13 @@ def _computehash( :return: Hex digest. 
""" if algorithm is None: - hashobj = hashlib.new(self.algorithm) + hash_obj = hashlib.new(self.algorithm) else: check_algorithm = self._clean_algorithm(algorithm) - hashobj = hashlib.new(check_algorithm) + hash_obj = hashlib.new(check_algorithm) for data in stream: - hashobj.update(self._cast_to_bytes(data)) - hex_digest = hashobj.hexdigest() + hash_obj.update(self._cast_to_bytes(data)) + hex_digest = hash_obj.hexdigest() return hex_digest def _shard(self, checksum: str) -> List[str]: @@ -2622,7 +2622,7 @@ def _create_path(self, path: Path) -> None: :raises AssertionError: If the path already exists but is not a directory. """ try: - os.makedirs(path, self.dmode) + os.makedirs(path, self.d_mode) except FileExistsError: assert os.path.isdir(path), f"expected {path} to be a directory" From f060c3e41ba5360843f1f62dc0f3978031dcce9f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 25 Sep 2024 05:43:48 -0700 Subject: [PATCH 399/420] Update 'Stream' class constructor doc string to also include 'Path' and clean-up pytest --- src/hashstore/filehashstore.py | 2 +- tests/filehashstore/test_filehashstore.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index c1b062f7..c7d727dd 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -3052,7 +3052,7 @@ class Stream: set its position back to ``0``. 
""" - def __init__(self, obj: Union[IO[bytes], str]): + def __init__(self, obj: Union[IO[bytes], str, Path]): if hasattr(obj, "read"): pos = obj.tell() elif os.path.isfile(obj): diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index 6d2f7825..aa4b8529 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -1917,11 +1917,11 @@ def test_stream_reads_path_object(pids): for pid in pids.keys(): path = Path(test_dir + pid.replace("/", "_")) obj_stream = Stream(path) - hashobj = hashlib.new("sha256") + hash_obj = hashlib.new("sha256") for data in obj_stream: - hashobj.update(data) + hash_obj.update(data) obj_stream.close() - hex_digest = hashobj.hexdigest() + hex_digest = hash_obj.hexdigest() assert pids[pid]["sha256"] == hex_digest From 6daa01f9a94b2fd2aeade26b595cd85f733e77ca Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 25 Sep 2024 05:48:09 -0700 Subject: [PATCH 400/420] Clean-up linting warnings in 'hashstoreclient' and 'filehashstore' pytest module --- src/hashstore/hashstoreclient.py | 17 ++++++++++++----- tests/filehashstore/test_filehashstore.py | 1 - 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/hashstore/hashstoreclient.py b/src/hashstore/hashstoreclient.py index 9575a908..b1c1cc67 100644 --- a/src/hashstore/hashstoreclient.py +++ b/src/hashstore/hashstoreclient.py @@ -193,7 +193,8 @@ def __init__(self): help="Flag to delete a metadata document from a HashStore", ) - def load_store_properties(self, hashstore_yaml): + @staticmethod + def load_store_properties(hashstore_yaml): """Get and return the contents of the current HashStore config file. 
:return: HashStore properties with the following keys (and values): @@ -291,6 +292,7 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num, skip_obj_size) logging.info(info_msg) # Get list of objects to store from metacat db + checked_obj_list = None if obj_type == self.OBJ_TYPE: checked_obj_list = self.metacatdb.refine_list_for_objects( metacat_obj_list, "store" @@ -373,6 +375,7 @@ def retrieve_and_validate_from_hashstore( # Get list of objects to store from metacat db logging.info("HashStore Client - Refining object list for %s", obj_type) + checked_obj_list = None if obj_type == self.OBJ_TYPE: checked_obj_list = self.metacatdb.refine_list_for_objects( metacat_obj_list, "retrieve" @@ -469,6 +472,7 @@ def delete_objects_from_list(self, origin_dir, obj_type, num, skip_obj_size): ) # Get list of objects to store from metacat db + checked_obj_list = None if obj_type == self.OBJ_TYPE: checked_obj_list = self.metacatdb.refine_list_for_objects( metacat_obj_list, "delete" @@ -597,8 +601,9 @@ def get_object_metadata_list(self, origin_directory, num, skip_obj_size=None): limit_query = f" LIMIT {num}" query = f"""SELECT identifier.guid, identifier.docid, identifier.rev, systemmetadata.object_format, systemmetadata.checksum, - systemmetadata.checksum_algorithm, systemmetadata.size FROM identifier INNER JOIN systemmetadata - ON identifier.guid = systemmetadata.guid ORDER BY identifier.guid{limit_query};""" + systemmetadata.checksum_algorithm, systemmetadata.size FROM identifier INNER JOIN + systemmetadata ON identifier.guid = systemmetadata.guid ORDER BY + identifier.guid{limit_query};""" cursor.execute(query) # Fetch all rows from the result set @@ -639,7 +644,8 @@ def get_object_metadata_list(self, origin_directory, num, skip_obj_size=None): return object_metadata_list - def refine_list_for_objects(self, metacat_obj_list, action): + @staticmethod + def refine_list_for_objects(metacat_obj_list, action): """Refine a list of objects by checking for file existence 
and removing duplicates. :param List metacat_obj_list: List of tuple objects representing rows from Metacat database. @@ -681,7 +687,8 @@ def refine_list_for_objects(self, metacat_obj_list, action): return refined_object_list - def refine_list_for_metadata(self, metacat_obj_list, action): + @staticmethod + def refine_list_for_metadata(metacat_obj_list, action): """Refine a list of metadata by checking for file existence and removing duplicates. :param List metacat_obj_list: List of tuple objects representing rows from metacat db. diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index aa4b8529..d21b0f10 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -364,7 +364,6 @@ def test_store_and_validate_data_files_path(pids, store): def test_store_and_validate_data_files_string(pids, store): """Test _store_and_validate_data accepts string for the path arg.""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store._store_and_validate_data(pid, path) From 27b7060f3cbfdb73893fb5535e6c3abc8787fb37 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 25 Sep 2024 05:52:31 -0700 Subject: [PATCH 401/420] Resolve linting warning for intentional pytest with 'noinspection PyTypeChecker' --- tests/filehashstore/test_filehashstore.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index d21b0f10..0b0e94c3 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -1940,6 +1940,7 @@ def test_stream_returns_to_original_position_on_close(pids): input_stream.close() +# noinspection PyTypeChecker def test_stream_raises_error_for_invalid_object(): """Test that a stream raises ValueError for an invalid input object.""" with pytest.raises(ValueError): From 
efd3cb5b1ee6002a6f6dbc80bc1d8e053a0c7b0b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 26 Sep 2024 12:24:01 -0700 Subject: [PATCH 402/420] Refactor '_build_hashstore_yaml_string' to create yaml content portion with 'yaml' library, and then join separately with a comments string to minimize yaml gotchas --- src/hashstore/filehashstore.py | 76 +++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index c7d727dd..8e2f095a 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -290,7 +290,7 @@ def _write_properties(self, properties: Dict[str, Union[str, int]]) -> None: return @staticmethod - def _build_hashstore_yaml_string( + def _build_hashstore_yaml_string_old( store_depth: int, store_width: int, store_algorithm: str, @@ -341,6 +341,80 @@ def _build_hashstore_yaml_string( """ return hashstore_configuration_yaml + @staticmethod + def _build_hashstore_yaml_string( + store_depth: int, + store_width: int, + store_algorithm: str, + store_metadata_namespace: str, + ) -> str: + """Build a YAML string representing the configuration for a HashStore. + + :param int store_depth: Depth when sharding an object's hex digest. + :param int store_width: Width of directories when sharding an object's hex digest. + :param str store_algorithm: Hash algorithm used for calculating the object's hex digest. + :param str store_metadata_namespace: Namespace for the HashStore's system metadata. + + :return: A YAML string representing the configuration for a HashStore. 
+ """ + hashstore_configuration = { + "store_depth": store_depth, + "store_width": store_width, + "store_metadata_namespace": store_metadata_namespace, + "store_algorithm": store_algorithm, + "store_default_algo_list": [ + "MD5", + "SHA-1", + "SHA-256", + "SHA-384", + "SHA-512", + ], + } + + # The tabbing here is intentional otherwise the created .yaml will have extra tabs + hashstore_configuration_comments = f""" +# Default configuration variables for HashStore + +############### HashStore Config Notes ############### +############### Directory Structure ############### +# store_depth +# - Desired amount of directories when sharding an object to form the permanent address +# - **WARNING**: DO NOT CHANGE UNLESS SETTING UP NEW HASHSTORE +# +# store_width +# - Width of directories created when sharding an object to form the permanent address +# - **WARNING**: DO NOT CHANGE UNLESS SETTING UP NEW HASHSTORE +# +# Example: +# Below, objects are shown listed in directories that are 3 levels deep (DIR_DEPTH=3), +# with each directory consisting of 2 characters (DIR_WIDTH=2). +# /var/filehashstore/objects +# ├── 7f +# │ └── 5c +# │ └── c1 +# │ └── 8f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6 + +############### Format of the Metadata ############### +# store_metadata_namespace +# - The default metadata format (ex. system metadata) + +############### Hash Algorithms ############### +# store_algorithm +# - Hash algorithm to use when calculating object's hex digest for the permanent address +# +# store_default_algo_list +# - Algorithm values supported by python hashlib 3.9.0+ for File Hash Store (FHS) +# - The default algorithm list includes the hash algorithms calculated when storing an +# - object to disk and returned to the caller after successful storage. 
+
+"""
+
+        hashstore_yaml_with_comments = hashstore_configuration_comments + yaml.dump(
+            hashstore_configuration, sort_keys=False
+        )
+
+        return hashstore_yaml_with_comments
+
     def _verify_hashstore_properties(
         self, properties: Dict[str, Union[str, int]], prop_store_path: str
     ) -> None:

From d35e9ac6cffaba82af761b61ea872cb8feb0b67f Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Thu, 26 Sep 2024 15:38:34 -0700
Subject: [PATCH 403/420] Refactor the init process by declaring
 parallelization variables after properties have been checked

---
 src/hashstore/filehashstore.py | 104 ++++++++++++++++++---------------
 1 file changed, 57 insertions(+), 47 deletions(-)

diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py
index 8e2f095a..9992656e 100644
--- a/src/hashstore/filehashstore.py
+++ b/src/hashstore/filehashstore.py
@@ -80,53 +80,6 @@ class FileHashStore(HashStore):
     ]
 
     def __init__(self, properties=None):
-        # Variables to orchestrate parallelization
-        # Check to see whether a multiprocessing or threading sync lock should be used
-        self.use_multiprocessing = os.getenv("USE_MULTIPROCESSING", "False") == "True"
-        if self.use_multiprocessing == "True":
-            # Create multiprocessing synchronization variables
-            # Synchronization values for object locked pids
-            self.object_pid_lock_mp = multiprocessing.Lock()
-            self.object_pid_condition_mp = multiprocessing.Condition(
-                self.object_pid_lock_mp
-            )
-            self.object_locked_pids_mp = multiprocessing.Manager().list()
-            # Synchronization values for object locked cids
-            self.object_cid_lock_mp = multiprocessing.Lock()
-            self.object_cid_condition_mp = multiprocessing.Condition(
-                self.object_cid_lock_mp
-            )
-            self.object_locked_cids_mp = multiprocessing.Manager().list()
-            # Synchronization values for metadata locked documents
-            self.metadata_lock_mp = multiprocessing.Lock()
-            self.metadata_condition_mp = multiprocessing.Condition(
-                self.metadata_lock_mp
-            )
-            self.metadata_locked_docs_mp = 
multiprocessing.Manager().list() - # Synchronization values for reference locked pids - self.reference_pid_lock_mp = multiprocessing.Lock() - self.reference_pid_condition_mp = multiprocessing.Condition( - self.reference_pid_lock_mp - ) - self.reference_locked_pids_mp = multiprocessing.Manager().list() - else: - # Create threading synchronization variables - # Synchronization values for object locked pids - self.object_pid_lock_th = threading.Lock() - self.object_pid_condition_th = threading.Condition(self.object_pid_lock_th) - self.object_locked_pids_th = [] - # Synchronization values for object locked cids - self.object_cid_lock_th = threading.Lock() - self.object_cid_condition_th = threading.Condition(self.object_cid_lock_th) - self.object_locked_cids_th = [] - # Synchronization values for metadata locked documents - self.metadata_lock_th = threading.Lock() - self.metadata_condition_th = threading.Condition(self.metadata_lock_th) - self.metadata_locked_docs_th = [] - # Synchronization values for reference locked pids - self.reference_pid_lock_th = threading.Lock() - self.reference_pid_condition_th = threading.Condition(self.metadata_lock_th) - self.reference_locked_pids_th = [] # Now check properties if properties: # Validate properties against existing configuration if present @@ -176,6 +129,63 @@ def __init__(self, properties=None): self._create_path(self.refs / "tmp") self._create_path(self.refs / "pids") self._create_path(self.refs / "cids") + + # Variables to orchestrate parallelization + # Check to see whether a multiprocessing or threading sync lock should be used + self.use_multiprocessing = ( + os.getenv("USE_MULTIPROCESSING", "False") == "True" + ) + if self.use_multiprocessing == "True": + # Create multiprocessing synchronization variables + # Synchronization values for object locked pids + self.object_pid_lock_mp = multiprocessing.Lock() + self.object_pid_condition_mp = multiprocessing.Condition( + self.object_pid_lock_mp + ) + 
self.object_locked_pids_mp = multiprocessing.Manager().list() + # Synchronization values for object locked cids + self.object_cid_lock_mp = multiprocessing.Lock() + self.object_cid_condition_mp = multiprocessing.Condition( + self.object_cid_lock_mp + ) + self.object_locked_cids_mp = multiprocessing.Manager().list() + # Synchronization values for metadata locked documents + self.metadata_lock_mp = multiprocessing.Lock() + self.metadata_condition_mp = multiprocessing.Condition( + self.metadata_lock_mp + ) + self.metadata_locked_docs_mp = multiprocessing.Manager().list() + # Synchronization values for reference locked pids + self.reference_pid_lock_mp = multiprocessing.Lock() + self.reference_pid_condition_mp = multiprocessing.Condition( + self.reference_pid_lock_mp + ) + self.reference_locked_pids_mp = multiprocessing.Manager().list() + else: + # Create threading synchronization variables + # Synchronization values for object locked pids + self.object_pid_lock_th = threading.Lock() + self.object_pid_condition_th = threading.Condition( + self.object_pid_lock_th + ) + self.object_locked_pids_th = [] + # Synchronization values for object locked cids + self.object_cid_lock_th = threading.Lock() + self.object_cid_condition_th = threading.Condition( + self.object_cid_lock_th + ) + self.object_locked_cids_th = [] + # Synchronization values for metadata locked documents + self.metadata_lock_th = threading.Lock() + self.metadata_condition_th = threading.Condition(self.metadata_lock_th) + self.metadata_locked_docs_th = [] + # Synchronization values for reference locked pids + self.reference_pid_lock_th = threading.Lock() + self.reference_pid_condition_th = threading.Condition( + self.metadata_lock_th + ) + self.reference_locked_pids_th = [] + logging.debug( "FileHashStore - Initialization success. 
Store root: %s", self.root ) From dedd3a537c9b18192e20f6b0981b19d53526b49b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 26 Sep 2024 15:42:16 -0700 Subject: [PATCH 404/420] Begin clean up logging statements by adding new method '_get_logger' to get a logger instance for the 'filehashstore' module name and revise init process --- src/hashstore/filehashstore.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 9992656e..f2295fe3 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -80,6 +80,7 @@ class FileHashStore(HashStore): ] def __init__(self, properties=None): + self.fhs_logger = logging.getLogger(__name__) # Now check properties if properties: # Validate properties against existing configuration if present @@ -100,7 +101,7 @@ def __init__(self, properties=None): self._verify_hashstore_properties(properties, prop_store_path) # If no exceptions thrown, FileHashStore ready for initialization - logging.debug("FileHashStore - Initializing, properties verified.") + self.fhs_logger.debug("Initializing, properties verified.") self.root = Path(prop_store_path) self.depth = prop_store_depth self.width = prop_store_width @@ -108,8 +109,8 @@ def __init__(self, properties=None): # Write 'hashstore.yaml' to store path if not os.path.isfile(self.hashstore_configuration_yaml): # pylint: disable=W1201 - logging.debug( - "FileHashStore - HashStore does not exist & configuration file not found." + self.fhs_logger.debug( + "HashStore does not exist & configuration file not found." + " Writing configuration file." ) self._write_properties(properties) @@ -186,16 +187,13 @@ def __init__(self, properties=None): ) self.reference_locked_pids_th = [] - logging.debug( - "FileHashStore - Initialization success. Store root: %s", self.root - ) + self.fhs_logger.debug("Initialization success. 
Store root: %s", self.root) else: # Cannot instantiate or initialize FileHashStore without config exception_string = ( - "FileHashStore - HashStore properties must be supplied." - + f" Properties: {properties}" + "HashStore properties must be supplied." + f" Properties: {properties}" ) - logging.debug(exception_string) + self.fhs_logger.debug(exception_string) raise ValueError(exception_string) # Configuration and Related Methods From 71827f578044f7045696a3291af3bf2376b07b98 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 26 Sep 2024 16:07:08 -0700 Subject: [PATCH 405/420] Clean up code by renaming all 'exception_message' variables with 'err_msg' --- src/hashstore/filehashstore.py | 268 ++++++++++++++++----------------- 1 file changed, 130 insertions(+), 138 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index f2295fe3..51299cdb 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -190,11 +190,11 @@ def __init__(self, properties=None): self.fhs_logger.debug("Initialization success. Store root: %s", self.root) else: # Cannot instantiate or initialize FileHashStore without config - exception_string = ( + err_msg = ( "HashStore properties must be supplied." + f" Properties: {properties}" ) - self.fhs_logger.debug(exception_string) - raise ValueError(exception_string) + self.fhs_logger.debug(err_msg) + raise ValueError(err_msg) # Configuration and Related Methods @@ -211,12 +211,12 @@ def _load_properties( - store_metadata_namespace (str): Namespace for the HashStore's system metadata. """ if not os.path.isfile(hashstore_yaml_path): - exception_string = ( + err_msg = ( "FileHashStore - load_properties: hashstore.yaml not found" + " in store root path." 
) - logging.critical(exception_string) - raise FileNotFoundError(exception_string) + logging.critical(err_msg) + raise FileNotFoundError(err_msg) # Open file with open(hashstore_yaml_path, "r", encoding="utf-8") as hs_yaml_file: @@ -244,12 +244,12 @@ def _write_properties(self, properties: Dict[str, Union[str, int]]) -> None: """ # If hashstore.yaml already exists, must throw exception and proceed with caution if os.path.isfile(self.hashstore_configuration_yaml): - exception_string = ( + err_msg = ( "FileHashStore - write_properties: configuration file 'hashstore.yaml'" + " already exists." ) - logging.error(exception_string) - raise FileExistsError(exception_string) + logging.error(err_msg) + raise FileExistsError(err_msg) # Validate properties checked_properties = self._validate_properties(properties) @@ -265,14 +265,14 @@ def _write_properties(self, properties: Dict[str, Union[str, int]]) -> None: if store_algorithm in accepted_store_algorithms: checked_store_algorithm = store_algorithm else: - exception_string = ( + err_msg = ( f"FileHashStore - write_properties: algorithm supplied ({store_algorithm})" f" cannot be used as default for HashStore. Must be one of: " + f"{', '.join(accepted_store_algorithms)}" f" which are DataONE controlled algorithm values" ) - logging.error(exception_string) - raise ValueError(exception_string) + logging.error(err_msg) + raise ValueError(err_msg) # If given store path doesn't exist yet, create it. if not os.path.exists(self.root): @@ -455,13 +455,13 @@ def _verify_hashstore_properties( if key == "store_depth" or key == "store_width": supplied_key = int(properties[key]) if hashstore_yaml_dict[key] != supplied_key: - exception_string = ( + err_msg = ( f"FileHashStore - Given properties ({key}: {properties[key]}) does not" + f" match. 
HashStore configuration ({key}: {hashstore_yaml_dict[key]})" + f" found at: {self.hashstore_configuration_yaml}" ) - logging.critical(exception_string) - raise ValueError(exception_string) + logging.critical(err_msg) + raise ValueError(err_msg) else: if os.path.exists(prop_store_path): # Check if HashStore exists and throw exception if found @@ -470,14 +470,14 @@ def _verify_hashstore_properties( os.path.isdir(os.path.join(prop_store_path, sub)) for sub in subfolders ): - exception_string = ( + err_msg = ( "FileHashStore - Unable to initialize HashStore. `hashstore.yaml` is not" + " present but conflicting HashStore directory exists. Please delete" + " '/objects', '/metadata' and/or '/refs' at the store path or supply" + " a new path." ) - logging.critical(exception_string) - raise RuntimeError(exception_string) + logging.critical(err_msg) + raise RuntimeError(err_msg) def _validate_properties( self, properties: Dict[str, Union[str, int]] @@ -493,33 +493,33 @@ def _validate_properties( :return: The given properties object (that has been validated). """ if not isinstance(properties, dict): - exception_string = ( + err_msg = ( "FileHashStore - _validate_properties: Invalid argument -" + " expected a dictionary." ) - logging.debug(exception_string) - raise ValueError(exception_string) + logging.debug(err_msg) + raise ValueError(err_msg) # New dictionary for validated properties checked_properties = {} for key in self.property_required_keys: if key not in properties: - exception_string = ( + err_msg = ( "FileHashStore - _validate_properties: Missing required" + f" key: {key}." ) - logging.debug(exception_string) - raise KeyError(exception_string) + logging.debug(err_msg) + raise KeyError(err_msg) value = properties.get(key) if value is None: - exception_string = ( + err_msg = ( "FileHashStore - _validate_properties: Value for key:" + f" {key} is none." 
) - logging.debug(exception_string) - raise ValueError(exception_string) + logging.debug(err_msg) + raise ValueError(err_msg) # Add key and values to checked_properties if key == "store_depth" or key == "store_width": @@ -527,13 +527,13 @@ def _validate_properties( try: checked_properties[key] = int(value) except Exception as err: - exception_string = ( + err_msg = ( "FileHashStore - _validate_properties: Unexpected exception when" " attempting to ensure store depth and width are integers. Details: " + str(err) ) - logging.debug(exception_string) - raise ValueError(exception_string) + logging.debug(err_msg) + raise ValueError(err_msg) else: checked_properties[key] = value @@ -556,12 +556,12 @@ def lookup_algo(algo_to_translate): return dataone_algo_translation[algo_to_translate] if not os.path.isfile(self.hashstore_configuration_yaml): - exception_string = ( + err_msg = ( "FileHashStore - set_default_algorithms: hashstore.yaml not found" + " in store root path." ) - logging.critical(exception_string) - raise FileNotFoundError(exception_string) + logging.critical(err_msg) + raise FileNotFoundError(err_msg) with open( self.hashstore_configuration_yaml, "r", encoding="utf-8" @@ -662,12 +662,12 @@ def store_object( pid, ) except Exception as err: - exception_string = ( + err_msg = ( f"FileHashStore - store_object: failed to store object for pid: {pid}." + " Reference files will not be created or tagged. Unexpected error: " + str(err) ) - logging.error(exception_string) + logging.error(err_msg) raise err finally: # Release pid @@ -710,12 +710,12 @@ def delete_if_invalid_object( self._check_string(checksum_algorithm, "checksum_algorithm") self._check_integer(expected_file_size) if object_metadata is None or not isinstance(object_metadata, ObjectMetadata): - exception_string = ( + err_msg = ( "FileHashStore - verify_object: 'object_metadata' cannot be None." + " Must be a 'ObjectMetadata' object." 
) - logging.error(exception_string) - raise ValueError(exception_string) + logging.error(err_msg) + raise ValueError(err_msg) else: logging.info( "FileHashStore - verify_object: Called to verify object with id: %s", @@ -829,11 +829,9 @@ def retrieve_object(self, pid: str) -> IO[bytes]: ) obj_stream = self._open(entity, object_cid) else: - exception_string = ( - f"FileHashStore - retrieve_object: No object found for pid: {pid}" - ) - logging.error(exception_string) - raise ValueError(exception_string) + err_msg = f"FileHashStore - retrieve_object: No object found for pid: {pid}" + logging.error(err_msg) + raise ValueError(err_msg) logging.info( "FileHashStore - retrieve_object: Retrieved object for pid: %s", pid ) @@ -866,11 +864,11 @@ def retrieve_metadata(self, pid: str, format_id: Optional[str] = None) -> IO[byt ) return metadata_stream else: - exception_string = ( + err_msg = ( f"FileHashStore - retrieve_metadata: No metadata found for pid: {pid}" ) - logging.error(exception_string) - raise ValueError(exception_string) + logging.error(err_msg) + raise ValueError(err_msg) def delete_object(self, pid: str) -> None: logging.debug( @@ -1205,11 +1203,9 @@ def get_hex_digest(self, pid: str, algorithm: str) -> str: algorithm = self._clean_algorithm(algorithm) object_cid = self._find_object(pid).get("cid") if not self._exists(entity, object_cid): - exception_string = ( - f"FileHashStore - get_hex_digest: No object found for pid: {pid}" - ) - logging.error(exception_string) - raise ValueError(exception_string) + err_msg = f"FileHashStore - get_hex_digest: No object found for pid: {pid}" + logging.error(err_msg) + raise ValueError(err_msg) cid_stream = self._open(entity, object_cid) hex_digest = self._computehash(cid_stream, algorithm=algorithm) @@ -1404,11 +1400,11 @@ def _store_data_only(self, data: Union[str, bytes]) -> "ObjectMetadata": return object_metadata # pylint: disable=W0718 except Exception as err: - exception_string = ( + err_msg = ( "FileHashStore - 
_store_data_only: failed to store object." + f" Unexpected {err=}, {type(err)=}" ) - logging.error(exception_string) + logging.error(err_msg) raise err def _move_and_get_checksums( @@ -1483,11 +1479,11 @@ def _move_and_get_checksums( shutil.move(tmp_file_name, abs_file_path) except Exception as err: # Revert storage process - exception_string = ( + err_msg = ( "FileHashStore - _move_and_get_checksums:" + f" Unexpected Error: {err}" ) - logging.warning(exception_string) + logging.warning(err_msg) if os.path.isfile(abs_file_path): # Check to see if object exists before determining whether to delete debug_msg = ( @@ -1498,12 +1494,12 @@ def _move_and_get_checksums( pid_checksum = self.get_hex_digest(pid, self.algorithm) if pid_checksum == hex_digests.get(self.algorithm): # If the checksums match, return and log warning - exception_string = ( + err_msg = ( "FileHashStore - _move_and_get_checksums: Object exists at:" + f" {abs_file_path} but an unexpected issue has been encountered." + " Reference files will not be created and/or tagged." ) - logging.warning(exception_string) + logging.warning(err_msg) raise err else: debug_msg = ( @@ -1545,22 +1541,22 @@ def _move_and_get_checksums( ) except NonMatchingObjSize as nmose: # If any exception is thrown during validation, we do not tag. - exception_string = ( + err_msg = ( f"FileHashStore - _move_and_get_checksums: Object already exists for pid: {pid}" + " , deleting temp file. Reference files will not be created and/or tagged" + f" due to an issue with the supplied pid object metadata. {str(nmose)}" ) - logging.debug(exception_string) - raise NonMatchingObjSize(exception_string) from nmose + logging.debug(err_msg) + raise NonMatchingObjSize(err_msg) from nmose except NonMatchingChecksum as nmce: # If any exception is thrown during validation, we do not tag. - exception_string = ( + err_msg = ( f"FileHashStore - _move_and_get_checksums: Object already exists for pid: {pid}" + " , deleting temp file. 
Reference files will not be created and/or tagged" + f" due to an issue with the supplied pid object metadata. {str(nmce)}" ) - logging.debug(exception_string) - raise NonMatchingChecksum(exception_string) from nmce + logging.debug(err_msg) + raise NonMatchingChecksum(err_msg) from nmce finally: # Ensure that the tmp file has been removed, the data object already exists, so it # is redundant. No exception is thrown so 'store_object' can proceed to tag object @@ -1635,19 +1631,19 @@ def _write_to_tmp_file_and_get_hex_digests( return hex_digest_dict, tmp.name, tmp_file_size # pylint: disable=W0718 except Exception as err: - exception_string = ( + err_msg = ( "FileHashStore - _write_to_tmp_file_and_get_hex_digests:" + f" Unexpected {err=}, {type(err)=}" ) - logging.error(exception_string) + logging.error(err_msg) # pylint: disable=W0707,W0719 - raise Exception(exception_string) + raise Exception(err_msg) except KeyboardInterrupt: - exception_string = ( + err_msg = ( "FileHashStore - _write_to_tmp_file_and_get_hex_digests:" + " Keyboard interruption by user." ) - logging.error(exception_string) + logging.error(err_msg) if os.path.isfile(tmp.name): os.remove(tmp.name) finally: @@ -1657,12 +1653,12 @@ def _write_to_tmp_file_and_get_hex_digests( os.remove(tmp.name) # pylint: disable=W0718 except Exception as err: - exception_string = ( + err_msg = ( "FileHashStore - _write_to_tmp_file_and_get_hex_digests:" + f"Unexpected {err=} while attempting to" + f" delete tmp file: {tmp.name}, {type(err)=}" ) - logging.error(exception_string) + logging.error(err_msg) def _mktmpfile(self, path: Path) -> IO[bytes]: """Create a temporary file at the given path ready to be written. 
@@ -1973,10 +1969,10 @@ def _put_metadata( ) return full_path except Exception as err: - exception_string = ( + err_msg = ( f"FileHashStore - _put_metadata: Unexpected {err=}, {type(err)=}" ) - logging.error(exception_string) + logging.error(err_msg) if os.path.isfile(metadata_tmp): # Remove tmp metadata, calling app must re-upload logging.debug( @@ -1986,12 +1982,12 @@ def _put_metadata( self._delete("metadata", metadata_tmp) raise else: - exception_string = ( + err_msg = ( f"FileHashStore - _put_metadata: Attempt to move metadata for pid: {pid}" + f", but metadata temp file not found: {metadata_tmp}" ) - logging.error(exception_string) - raise FileNotFoundError(exception_string) + logging.error(err_msg) + raise FileNotFoundError(err_msg) def _mktmpmetadata(self, stream: "Stream") -> str: """Create a named temporary file with `stream` (metadata). @@ -2129,11 +2125,11 @@ def _write_refs_file(self, path: Path, ref_id: str, ref_type: str) -> str: return tmp_file_path except Exception as err: - exception_string = ( + err_msg = ( "FileHashStore - _write_refs_file: failed to write cid refs file for pid:" + f" {ref_id} into path: {path}. Unexpected {err=}, {type(err)=}" ) - logging.error(exception_string) + logging.error(err_msg) raise err def _update_refs_file( @@ -2151,12 +2147,12 @@ def _update_refs_file( ) logging.debug(debug_msg) if not os.path.isfile(refs_file_path): - exception_string = ( + err_msg = ( f"FileHashStore - _update_refs_file: {refs_file_path} does not exist." 
+ f" Cannot {update_type} ref_id: {ref_id}" ) - logging.error(exception_string) - raise FileNotFoundError(exception_string) + logging.error(err_msg) + raise FileNotFoundError(err_msg) try: if update_type == "add": pid_found = self._is_string_in_refs_file(ref_id, refs_file_path) @@ -2186,11 +2182,11 @@ def _update_refs_file( ) logging.debug(debug_msg) except Exception as err: - exception_string = ( + err_msg = ( f"FileHashStore - _update_refs_file: failed to {update_type} for ref_id: {ref_id}" + f" at refs file: {refs_file_path}. Unexpected {err=}, {type(err)=}" ) - logging.error(exception_string) + logging.error(err_msg) raise err @staticmethod @@ -2235,22 +2231,22 @@ def _verify_object_information( """ if file_size_to_validate is not None and file_size_to_validate > 0: if file_size_to_validate != tmp_file_size: - exception_string = ( + err_msg = ( "FileHashStore - _verify_object_information: Object file size calculated: " + f" {tmp_file_size} does not match with expected size:" + f" {file_size_to_validate}." 
) if pid is not None: self._delete(entity, tmp_file_name) - exception_string_for_pid = ( - exception_string + err_msg_for_pid = ( + err_msg + f" Tmp file deleted and file not stored for pid: {pid}" ) - logging.debug(exception_string_for_pid) - raise NonMatchingObjSize(exception_string_for_pid) + logging.debug(err_msg_for_pid) + raise NonMatchingObjSize(err_msg_for_pid) else: - logging.debug(exception_string) - raise NonMatchingObjSize(exception_string) + logging.debug(err_msg) + raise NonMatchingObjSize(err_msg) if checksum_algorithm is not None and checksum is not None: if checksum_algorithm not in hex_digests: # Check to see if it is a supported algorithm @@ -2269,19 +2265,19 @@ def _verify_object_information( cid_stream, algorithm=checksum_algorithm ) if hex_digest_calculated != checksum: - exception_string = ( + err_msg = ( "FileHashStore - _verify_object_information: checksum_algorithm" + f" ({checksum_algorithm}) cannot be found in the default hex digests" + f" dict, but is supported. New checksum calculated: " f"{hex_digest_calculated}, does not match what has been provided: " + checksum ) - logging.debug(exception_string) - raise NonMatchingChecksum(exception_string) + logging.debug(err_msg) + raise NonMatchingChecksum(err_msg) else: hex_digest_stored = hex_digests[checksum_algorithm] if hex_digest_stored != checksum.lower(): - exception_string = ( + err_msg = ( "FileHashStore - _verify_object_information: Hex digest and checksum" + f" do not match - file not stored for pid: {pid}. Algorithm:" + f" {checksum_algorithm}. Checksum provided: {checksum} !=" @@ -2290,14 +2286,14 @@ def _verify_object_information( if pid is not None: # Delete the tmp file self._delete(entity, tmp_file_name) - exception_string_for_pid = ( - exception_string + f" Tmp file ({tmp_file_name}) deleted." + err_msg_for_pid = ( + err_msg + f" Tmp file ({tmp_file_name}) deleted." 
) - logging.debug(exception_string_for_pid) - raise NonMatchingChecksum(exception_string_for_pid) + logging.debug(err_msg_for_pid) + raise NonMatchingChecksum(err_msg_for_pid) else: - logging.debug(exception_string) - raise NonMatchingChecksum(exception_string) + logging.debug(err_msg) + raise NonMatchingChecksum(err_msg) def _verify_hashstore_references( self, @@ -2328,42 +2324,42 @@ def _verify_hashstore_references( # Check that reference files were created if not os.path.isfile(pid_refs_path): - exception_string = ( + err_msg = ( "FileHashStore - _verify_hashstore_references: Pid refs file missing: " + str(pid_refs_path) + f" . Additional Context: {additional_log_string}" ) - logging.error(exception_string) - raise PidRefsFileNotFound(exception_string) + logging.error(err_msg) + raise PidRefsFileNotFound(err_msg) if not os.path.isfile(cid_refs_path): - exception_string = ( + err_msg = ( "FileHashStore - _verify_hashstore_references: Cid refs file missing: " + str(cid_refs_path) + f" . Additional Context: {additional_log_string}" ) - logging.error(exception_string) - raise CidRefsFileNotFound(exception_string) + logging.error(err_msg) + raise CidRefsFileNotFound(err_msg) # Check the content of the reference files # Start with the cid retrieved_cid = self._read_small_file_content(pid_refs_path) if retrieved_cid != cid: - exception_string = ( + err_msg = ( "FileHashStore - _verify_hashstore_references: Pid refs file exists" + f" ({pid_refs_path}) but cid ({cid}) does not match." + f" Additional Context: {additional_log_string}" ) - logging.error(exception_string) - raise PidRefsContentError(exception_string) + logging.error(err_msg) + raise PidRefsContentError(err_msg) # Then the pid pid_found = self._is_string_in_refs_file(pid, cid_refs_path) if not pid_found: - exception_string = ( + err_msg = ( "FileHashStore - _verify_hashstore_references: Cid refs file exists" + f" ({cid_refs_path}) but pid ({pid}) not found." 
+ f" Additional Context: {additional_log_string}" ) - logging.error(exception_string) - raise CidRefsContentError(exception_string) + logging.error(err_msg) + raise CidRefsContentError(err_msg) def _delete_object_only(self, cid: str) -> None: """Attempt to delete an object based on the given content identifier (cid). If the object @@ -2458,9 +2454,9 @@ def _check_arg_format_id(self, format_id: str, method: str) -> str: :return: Valid metadata namespace. """ if format_id and not format_id.strip(): - exception_string = f"FileHashStore - {method}: Format_id cannot be empty." - logging.error(exception_string) - raise ValueError(exception_string) + err_msg = f"FileHashStore - {method}: Format_id cannot be empty." + logging.error(err_msg) + raise ValueError(err_msg) elif format_id is None: # Use default value set by hashstore config checked_format_id = self.sysmeta_ns @@ -2523,12 +2519,12 @@ def _clean_algorithm(self, algorithm_string: str) -> str: cleaned_string not in self.default_algo_list and cleaned_string not in self.other_algo_list ): - exception_string = ( + err_msg = ( "FileHashStore: _clean_algorithm: Algorithm not supported:" + cleaned_string ) - logging.error(exception_string) - raise UnsupportedAlgorithm(exception_string) + logging.error(err_msg) + raise UnsupportedAlgorithm(err_msg) return cleaned_string def _computehash( @@ -2691,10 +2687,8 @@ def _delete(self, entity: str, file: Union[str, Path]) -> None: os.remove(realpath) except Exception as err: - exception_string = ( - f"FileHashStore - delete(): Unexpected {err=}, {type(err)=}" - ) - logging.error(exception_string) + err_msg = f"FileHashStore - delete(): Unexpected {err=}, {type(err)=}" + logging.error(err_msg) raise err def _create_path(self, path: Path) -> None: @@ -3054,19 +3048,19 @@ def _check_arg_data(data: Union[str, os.PathLike, io.BufferedReader]) -> bool: and not isinstance(data, Path) and not isinstance(data, io.BufferedIOBase) ): - exception_string = ( + err_msg = ( "FileHashStore - 
_validate_arg_data: Data must be a path, string or buffered" + f" stream type. Data type supplied: {type(data)}" ) - logging.error(exception_string) - raise TypeError(exception_string) + logging.error(err_msg) + raise TypeError(err_msg) if isinstance(data, str): if data.strip() == "": - exception_string = ( + err_msg = ( "FileHashStore - _validate_arg_data: Data string cannot be empty." ) - logging.error(exception_string) - raise TypeError(exception_string) + logging.error(err_msg) + raise TypeError(err_msg) return True @staticmethod @@ -3078,18 +3072,16 @@ def _check_integer(file_size: int) -> None: """ if file_size is not None: if not isinstance(file_size, int): - exception_string = ( + err_msg = ( "FileHashStore - _check_integer: size given must be an integer." + f" File size: {file_size}. Arg Type: {type(file_size)}." ) - logging.error(exception_string) - raise TypeError(exception_string) + logging.error(err_msg) + raise TypeError(err_msg) if file_size < 1: - exception_string = ( - "FileHashStore - _check_integer: size given must be > 0" - ) - logging.error(exception_string) - raise ValueError(exception_string) + err_msg = "FileHashStore - _check_integer: size given must be > 0" + logging.error(err_msg) + raise ValueError(err_msg) @staticmethod def _check_string(string: str, arg: str) -> None: @@ -3101,12 +3093,12 @@ def _check_string(string: str, arg: str) -> None: """ if string is None or string.strip() == "" or any(ch.isspace() for ch in string): method = inspect.stack()[1].function - exception_string = ( + err_msg = ( f"FileHashStore - {method}: {arg} cannot be None" + f" or empty, {arg}: {string}." 
) - logging.error(exception_string) - raise ValueError(exception_string) + logging.error(err_msg) + raise ValueError(err_msg) @staticmethod def _cast_to_bytes(text: any) -> bytes: From 6180301eb5d89605eaf74bd018f9c0fa117a4fa9 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 26 Sep 2024 16:14:57 -0700 Subject: [PATCH 406/420] Clean-up logging in 'filehashstore' init related methods and delete redundant method --- src/hashstore/filehashstore.py | 128 +++++++-------------------------- 1 file changed, 26 insertions(+), 102 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 51299cdb..d6e9eb0c 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -211,10 +211,7 @@ def _load_properties( - store_metadata_namespace (str): Namespace for the HashStore's system metadata. """ if not os.path.isfile(hashstore_yaml_path): - err_msg = ( - "FileHashStore - load_properties: hashstore.yaml not found" - + " in store root path." - ) + err_msg = "'hashstore.yaml' not found in store root path." logging.critical(err_msg) raise FileNotFoundError(err_msg) @@ -227,9 +224,7 @@ def _load_properties( for key in hashstore_required_prop_keys: if key != "store_path": hashstore_yaml_dict[key] = yaml_data[key] - logging.debug( - "FileHashStore - load_properties: Successfully retrieved 'hashstore.yaml' properties." - ) + logging.debug("Successfully retrieved 'hashstore.yaml' properties.") return hashstore_yaml_dict def _write_properties(self, properties: Dict[str, Union[str, int]]) -> None: @@ -244,10 +239,7 @@ def _write_properties(self, properties: Dict[str, Union[str, int]]) -> None: """ # If hashstore.yaml already exists, must throw exception and proceed with caution if os.path.isfile(self.hashstore_configuration_yaml): - err_msg = ( - "FileHashStore - write_properties: configuration file 'hashstore.yaml'" - + " already exists." - ) + err_msg = "Configuration file 'hashstore.yaml' already exists." 
logging.error(err_msg) raise FileExistsError(err_msg) # Validate properties @@ -266,9 +258,8 @@ def _write_properties(self, properties: Dict[str, Union[str, int]]) -> None: checked_store_algorithm = store_algorithm else: err_msg = ( - f"FileHashStore - write_properties: algorithm supplied ({store_algorithm})" - f" cannot be used as default for HashStore. Must be one of: " - + f"{', '.join(accepted_store_algorithms)}" + f"Algorithm supplied ({store_algorithm}) cannot be used as default for" + f" HashStore. Must be one of: {', '.join(accepted_store_algorithms)}" f" which are DataONE controlled algorithm values" ) logging.error(err_msg) @@ -292,63 +283,10 @@ def _write_properties(self, properties: Dict[str, Union[str, int]]) -> None: hs_yaml_file.write(hashstore_configuration_yaml) logging.debug( - "FileHashStore - write_properties: Configuration file written to: %s", - self.hashstore_configuration_yaml, + "Configuration file written to: %s", self.hashstore_configuration_yaml ) return - @staticmethod - def _build_hashstore_yaml_string_old( - store_depth: int, - store_width: int, - store_algorithm: str, - store_metadata_namespace: str, - ) -> str: - """Build a YAML string representing the configuration for a HashStore. - - :param int store_depth: Depth when sharding an object's hex digest. - :param int store_width: Width of directories when sharding an object's hex digest. - :param str store_algorithm: Hash algorithm used for calculating the object's hex digest. - :param str store_metadata_namespace: Namespace for the HashStore's system metadata. - - :return: A YAML string representing the configuration for a HashStore. 
- """ - hashstore_configuration_yaml = f""" - # Default configuration variables for HashStore - - ############### Directory Structure ############### - # Desired amount of directories when sharding an object to form the permanent address - store_depth: {store_depth} # WARNING: DO NOT CHANGE UNLESS SETTING UP NEW HASHSTORE - # Width of directories created when sharding an object to form the permanent address - store_width: {store_width} # WARNING: DO NOT CHANGE UNLESS SETTING UP NEW HASHSTORE - # Example: - # Below, objects are shown listed in directories that are 3 levels deep (DIR_DEPTH=3), - # with each directory consisting of 2 characters (DIR_WIDTH=2). - # /var/filehashstore/objects - # ├── 7f - # │ └── 5c - # │ └── c1 - # │ └── 8f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6 - - ############### Format of the Metadata ############### - # The default metadata format - store_metadata_namespace: "{store_metadata_namespace}" - - ############### Hash Algorithms ############### - # Hash algorithm to use when calculating object's hex digest for the permanent address - store_algorithm: "{store_algorithm}" - # Algorithm values supported by python hashlib 3.9.0+ for File Hash Store (FHS) - # The default algorithm list includes the hash algorithms calculated when storing an - # object to disk and returned to the caller after successful storage. - store_default_algo_list: - - "MD5" - - "SHA-1" - - "SHA-256" - - "SHA-384" - - "SHA-512" - """ - return hashstore_configuration_yaml - @staticmethod def _build_hashstore_yaml_string( store_depth: int, @@ -440,8 +378,8 @@ def _verify_hashstore_properties( :param str prop_store_path: Store path to check. """ if os.path.isfile(self.hashstore_configuration_yaml): - logging.debug( - "FileHashStore - Config found (hashstore.yaml) at {%s}. Verifying properties.", + self.fhs_logger.debug( + "Config found (hashstore.yaml) at {%s}. 
Verifying properties.", self.hashstore_configuration_yaml, ) # If 'hashstore.yaml' is found, verify given properties before init @@ -456,11 +394,11 @@ def _verify_hashstore_properties( supplied_key = int(properties[key]) if hashstore_yaml_dict[key] != supplied_key: err_msg = ( - f"FileHashStore - Given properties ({key}: {properties[key]}) does not" - + f" match. HashStore configuration ({key}: {hashstore_yaml_dict[key]})" + f"Given properties ({key}: {properties[key]}) does not match." + + f" HashStore configuration ({key}: {hashstore_yaml_dict[key]})" + f" found at: {self.hashstore_configuration_yaml}" ) - logging.critical(err_msg) + self.fhs_logger.critical(err_msg) raise ValueError(err_msg) else: if os.path.exists(prop_store_path): @@ -471,12 +409,11 @@ def _verify_hashstore_properties( for sub in subfolders ): err_msg = ( - "FileHashStore - Unable to initialize HashStore. `hashstore.yaml` is not" - + " present but conflicting HashStore directory exists. Please delete" - + " '/objects', '/metadata' and/or '/refs' at the store path or supply" - + " a new path." + "Unable to initialize HashStore. `hashstore.yaml` is not present but " + "conflicting HashStore directory exists. Please delete '/objects', " + "'/metadata' and/or '/refs' at the store path or supply a new path." ) - logging.critical(err_msg) + self.fhs_logger.critical(err_msg) raise RuntimeError(err_msg) def _validate_properties( @@ -493,11 +430,8 @@ def _validate_properties( :return: The given properties object (that has been validated). """ if not isinstance(properties, dict): - err_msg = ( - "FileHashStore - _validate_properties: Invalid argument -" - + " expected a dictionary." - ) - logging.debug(err_msg) + err_msg = "Invalid argument expected a dictionary." 
+ self.fhs_logger.error(err_msg) raise ValueError(err_msg) # New dictionary for validated properties @@ -505,20 +439,14 @@ def _validate_properties( for key in self.property_required_keys: if key not in properties: - err_msg = ( - "FileHashStore - _validate_properties: Missing required" - + f" key: {key}." - ) - logging.debug(err_msg) + err_msg = f"Missing required key: {key}." + self.fhs_logger.error(err_msg) raise KeyError(err_msg) value = properties.get(key) if value is None: - err_msg = ( - "FileHashStore - _validate_properties: Value for key:" - + f" {key} is none." - ) - logging.debug(err_msg) + err_msg = f"Value for key: {key} is none." + self.fhs_logger.error(err_msg) raise ValueError(err_msg) # Add key and values to checked_properties @@ -528,11 +456,10 @@ def _validate_properties( checked_properties[key] = int(value) except Exception as err: err_msg = ( - "FileHashStore - _validate_properties: Unexpected exception when" - " attempting to ensure store depth and width are integers. Details: " - + str(err) + "Unexpected exception when attempting to ensure store depth and width " + f"are integers. Details: {err}" ) - logging.debug(err_msg) + self.fhs_logger.error(err_msg) raise ValueError(err_msg) else: checked_properties[key] = value @@ -556,11 +483,8 @@ def lookup_algo(algo_to_translate): return dataone_algo_translation[algo_to_translate] if not os.path.isfile(self.hashstore_configuration_yaml): - err_msg = ( - "FileHashStore - set_default_algorithms: hashstore.yaml not found" - + " in store root path." - ) - logging.critical(err_msg) + err_msg = "hashstore.yaml not found in store root path." 
+ self.fhs_logger.critical(err_msg) raise FileNotFoundError(err_msg) with open( From 692703b735b186a965b13b1a373538de827c44a0 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 26 Sep 2024 16:40:22 -0700 Subject: [PATCH 407/420] Clean-up logging in 'filehashstore' interface methods and add missing logging messages in 'delete_object' --- src/hashstore/filehashstore.py | 324 ++++++++++++++------------------- 1 file changed, 132 insertions(+), 192 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index d6e9eb0c..34523608 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -517,17 +517,14 @@ def store_object( ) -> "ObjectMetadata": if pid is None and self._check_arg_data(data): # If no pid is supplied, store the object only without tagging - logging.debug("FileHashStore - store_object: Request to store data only.") + logging.debug("Request to store data only received.") object_metadata = self._store_data_only(data) - logging.info( - "FileHashStore - store_object: Successfully stored object for cid: %s", - object_metadata.cid, + self.fhs_logger.info( + "Successfully stored object for cid: %s", object_metadata.cid ) else: # Else the object will be stored and tagged - logging.debug( - "FileHashStore - store_object: Request to store object for pid: %s", pid - ) + self.fhs_logger.debug("Request to store object for pid: %s", pid) # Validate input parameters self._check_string(pid, "pid") self._check_arg_data(data) @@ -539,34 +536,26 @@ def store_object( additional_algorithm, checksum, checksum_algorithm ) - sync_begin_debug_msg = ( - f"FileHashStore - store_object: Adding pid ({pid}) to locked list." - ) - err_msg = ( - f"FileHashStore - store_object: Duplicate object request encountered for pid: " - f"{pid}" + ". Already in progress." - ) + sync_begin_debug_msg = f"Adding pid ({pid}) to locked list." + err_msg = f"Duplicate object request encountered for pid: {pid}. Already in progress." 
if self.use_multiprocessing: with self.object_pid_condition_mp: # Wait for the pid to release if it's in use if pid in self.object_locked_pids_mp: - logging.error(err_msg) + self.fhs_logger.error(err_msg) raise StoreObjectForPidAlreadyInProgress(err_msg) # Modify object_locked_pids consecutively - logging.debug(sync_begin_debug_msg) + self.fhs_logger.debug(sync_begin_debug_msg) self.object_locked_pids_mp.append(pid) else: with self.object_pid_condition_th: if pid in self.object_locked_pids_th: logging.error(err_msg) raise StoreObjectForPidAlreadyInProgress(err_msg) - logging.debug(sync_begin_debug_msg) + self.fhs_logger.debug(sync_begin_debug_msg) self.object_locked_pids_th.append(pid) try: - logging.debug( - "FileHashStore - store_object: Attempting to store object for pid: %s", - pid, - ) + self.fhs_logger.debug("Attempting to store object for pid: %s", pid) object_metadata = self._store_and_validate_data( pid, data, @@ -575,23 +564,16 @@ def store_object( checksum_algorithm=checksum_algorithm_checked, file_size_to_validate=expected_object_size, ) - logging.debug( - "FileHashStore - store_object: Attempting to tag object for pid: %s", - pid, - ) + self.fhs_logger.debug("Attempting to tag object for pid: %s", pid) cid = object_metadata.cid self.tag_object(pid, cid) - logging.info( - "FileHashStore - store_object: Successfully stored object for pid: %s", - pid, - ) + self.fhs_logger.info("Successfully stored object for pid: %s", pid) except Exception as err: err_msg = ( - f"FileHashStore - store_object: failed to store object for pid: {pid}." - + " Reference files will not be created or tagged. Unexpected error: " - + str(err) + f"failed to store object for pid: {pid}. Reference files will not be created " + f"or tagged. 
Unexpected error: {err})" ) - logging.error(err_msg) + self.fhs_logger.error(err_msg) raise err finally: # Release pid @@ -600,27 +582,19 @@ def store_object( return object_metadata def tag_object(self, pid: str, cid: str) -> None: - logging.debug( - "FileHashStore - tag_object: Tagging object cid: %s with pid: %s.", - cid, - pid, - ) + logging.debug("Tagging object cid: %s with pid: %s.", cid, pid) self._check_string(pid, "pid") self._check_string(cid, "cid") try: self._store_hashstore_refs_files(pid, cid) except HashStoreRefsAlreadyExists as hrae: - err_msg = ( - f"FileHashStore - tag_object: reference files for pid: {pid} and {cid} " - "already exist. " + str(hrae) - ) + err_msg = f"Reference files for pid: {pid} and {cid} already exist. Details: {hrae}" + self.fhs_logger.error(err_msg) raise HashStoreRefsAlreadyExists(err_msg) except PidRefsAlreadyExistsError as praee: - err_msg = ( - f"FileHashStore - tag_object: A pid can only reference one cid. " - + str(praee) - ) + err_msg = f"A pid can only reference one cid. Details: {praee}" + self.fhs_logger.error(err_msg) raise PidRefsAlreadyExistsError(err_msg) def delete_if_invalid_object( @@ -635,15 +609,13 @@ def delete_if_invalid_object( self._check_integer(expected_file_size) if object_metadata is None or not isinstance(object_metadata, ObjectMetadata): err_msg = ( - "FileHashStore - verify_object: 'object_metadata' cannot be None." - + " Must be a 'ObjectMetadata' object." + "'object_metadata' cannot be None. Must be a 'ObjectMetadata' object." 
) - logging.error(err_msg) + self.fhs_logger.error(err_msg) raise ValueError(err_msg) else: - logging.info( - "FileHashStore - verify_object: Called to verify object with id: %s", - object_metadata.cid, + self.fhs_logger.info( + "Called to verify object with id: %s", object_metadata.cid ) object_metadata_hex_digests = object_metadata.hex_digests object_metadata_file_size = object_metadata.obj_size @@ -668,17 +640,14 @@ def delete_if_invalid_object( except NonMatchingChecksum as mmce: self._delete_object_only(object_metadata.cid) raise mmce - logging.info( - "FileHashStore - verify_object: object has been validated for cid: %s", - object_metadata.cid, + self.fhs_logger.info( + "Object has been validated for cid: %s", object_metadata.cid ) def store_metadata( self, pid: str, metadata: Union[str, bytes], format_id: Optional[str] = None ) -> str: - logging.debug( - "FileHashStore - store_metadata: Request to store metadata for pid: %s", pid - ) + self.fhs_logger.debug("Request to store metadata for pid: %s", pid) # Validate input parameters self._check_string(pid, "pid") self._check_arg_data(metadata) @@ -686,60 +655,57 @@ def store_metadata( pid_doc = self._computehash(pid + checked_format_id) sync_begin_debug_msg = ( - f"FileHashStore - store_metadata: Adding pid: {pid} to locked list, " - + f"with format_id: {checked_format_id} with doc name: {pid_doc}" + f" Adding pid: {pid} to locked list, with format_id: {checked_format_id} with doc " + f"name: {pid_doc}" ) sync_wait_msg = ( - f"FileHashStore - store_metadata: Pid: {pid} is locked for format_id:" - + f" {checked_format_id} with doc name: {pid_doc}. Waiting." + f"Pid: {pid} is locked for format_id: {checked_format_id} with doc name: {pid_doc}. " + f"Waiting." 
) if self.use_multiprocessing: with self.metadata_condition_mp: # Wait for the pid to release if it's in use while pid_doc in self.metadata_locked_docs_mp: - logging.debug(sync_wait_msg) + self.fhs_logger.debug(sync_wait_msg) self.metadata_condition_mp.wait() # Modify metadata_locked_docs consecutively - logging.debug(sync_begin_debug_msg) + self.fhs_logger.debug(sync_begin_debug_msg) self.metadata_locked_docs_mp.append(pid_doc) else: with self.metadata_condition_th: while pid_doc in self.metadata_locked_docs_th: - logging.debug(sync_wait_msg) + self.fhs_logger.debug(sync_wait_msg) self.metadata_condition_th.wait() - logging.debug(sync_begin_debug_msg) + self.fhs_logger.debug(sync_begin_debug_msg) self.metadata_locked_docs_th.append(pid_doc) try: metadata_cid = self._put_metadata(metadata, pid, pid_doc) info_msg = ( - "FileHashStore - store_metadata: Successfully stored metadata for" - + f" pid: {pid} with format_id: {checked_format_id}" + f"Successfully stored metadata for pid: {pid} with format_id: " + + checked_format_id ) - logging.info(info_msg) + self.fhs_logger.info(info_msg) return str(metadata_cid) finally: # Release pid end_sync_debug_msg = ( - f"FileHashStore - store_metadata: Releasing pid doc ({pid_doc})" - + f" from locked list for pid: {pid} with format_id: {checked_format_id}" + f"Releasing pid doc ({pid_doc}) from locked list for pid: {pid} with format_id: " + + checked_format_id ) if self.use_multiprocessing: with self.metadata_condition_mp: - logging.debug(end_sync_debug_msg) + self.fhs_logger.debug(end_sync_debug_msg) self.metadata_locked_docs_mp.remove(pid_doc) self.metadata_condition_mp.notify() else: with self.metadata_condition_th: - logging.debug(end_sync_debug_msg) + self.fhs_logger.debug(end_sync_debug_msg) self.metadata_locked_docs_th.remove(pid_doc) self.metadata_condition_th.notify() def retrieve_object(self, pid: str) -> IO[bytes]: - logging.debug( - "FileHashStore - retrieve_object: Request to retrieve object for pid: %s", - pid, - ) 
+ self.fhs_logger.debug("Request to retrieve object for pid: %s", pid) self._check_string(pid, "pid") object_info_dict = self._find_object(pid) @@ -747,26 +713,20 @@ def retrieve_object(self, pid: str) -> IO[bytes]: entity = "objects" if object_cid: - logging.debug( - "FileHashStore - retrieve_object: Metadata exists for pid: %s, retrieving object.", - pid, + self.fhs_logger.debug( + "Metadata exists for pid: %s, retrieving object.", pid ) obj_stream = self._open(entity, object_cid) else: - err_msg = f"FileHashStore - retrieve_object: No object found for pid: {pid}" - logging.error(err_msg) + err_msg = f"No object found for pid: {pid}" + self.fhs_logger.error(err_msg) raise ValueError(err_msg) - logging.info( - "FileHashStore - retrieve_object: Retrieved object for pid: %s", pid - ) + self.fhs_logger.info("Retrieved object for pid: %s", pid) return obj_stream def retrieve_metadata(self, pid: str, format_id: Optional[str] = None) -> IO[bytes]: - logging.debug( - "FileHashStore - retrieve_metadata: Request to retrieve metadata for pid: %s", - pid, - ) + self.fhs_logger.debug("Request to retrieve metadata for pid: %s", pid) self._check_string(pid, "pid") checked_format_id = self._check_arg_format_id(format_id, "retrieve_metadata") @@ -783,21 +743,15 @@ def retrieve_metadata(self, pid: str, format_id: Optional[str] = None) -> IO[byt if metadata_exists: metadata_stream = self._open(entity, str(metadata_rel_path)) - logging.info( - "FileHashStore - retrieve_metadata: Retrieved metadata for pid: %s", pid - ) + self.fhs_logger.info("Retrieved metadata for pid: %s", pid) return metadata_stream else: - err_msg = ( - f"FileHashStore - retrieve_metadata: No metadata found for pid: {pid}" - ) - logging.error(err_msg) + err_msg = f"No metadata found for pid: {pid}" + self.fhs_logger.error(err_msg) raise ValueError(err_msg) def delete_object(self, pid: str) -> None: - logging.debug( - "FileHashStore - delete_object: Request to delete object for id: %s", pid - ) + 
self.fhs_logger.debug("Request to delete object for id: %s", pid) self._check_string(pid, "pid") objects_to_delete = [] @@ -805,27 +759,23 @@ def delete_object(self, pid: str) -> None: # Storing and deleting objects are synchronized together # Duplicate store object requests for a pid are rejected, but deleting an object # will wait for a pid to be released if it's found to be in use before proceeding. - sync_begin_debug_msg = ( - f"FileHashStore - delete_object: Pid ({pid}) to locked list." - ) - sync_wait_msg = ( - f"FileHashStore - delete_object: Pid ({pid}) is locked. Waiting." - ) + sync_begin_debug_msg = f"Pid ({pid}) to locked list." + sync_wait_msg = f"Pid ({pid}) is locked. Waiting." if self.use_multiprocessing: with self.object_pid_condition_mp: # Wait for the pid to release if it's in use while pid in self.object_locked_pids_mp: - logging.debug(sync_wait_msg) + self.fhs_logger.debug(sync_wait_msg) self.object_pid_condition_mp.wait() # Modify object_locked_pids consecutively - logging.debug(sync_begin_debug_msg) + self.fhs_logger.debug(sync_begin_debug_msg) self.object_locked_pids_mp.append(pid) else: with self.object_pid_condition_th: while pid in self.object_locked_pids_th: - logging.debug(sync_wait_msg) + self.fhs_logger.debug(sync_wait_msg) self.object_pid_condition_th.wait() - logging.debug(sync_begin_debug_msg) + self.fhs_logger.debug(sync_begin_debug_msg) self.object_locked_pids_th.append(pid) try: @@ -839,28 +789,23 @@ def delete_object(self, pid: str) -> None: # Proceed with next steps - cid has been retrieved without any issues # We must synchronize here based on the `cid` because multiple threads may # try to access the `cid_reference_file` - sync_begin_debug_msg = ( - f"FileHashStore - delete_object: Cid ({cid}) to locked list." - ) - sync_wait_msg = ( - f"FileHashStore - delete_object: Cid ({cid}) is locked." - + " Waiting." - ) + sync_begin_debug_msg = f"Cid ({cid}) to locked list." + sync_wait_msg = f"Cid ({cid}) is locked. Waiting." 
if self.use_multiprocessing: with self.object_cid_condition_mp: # Wait for the cid to release if it's in use while cid in self.object_locked_cids_mp: - logging.debug(sync_wait_msg) + self.fhs_logger.debug(sync_wait_msg) self.object_cid_condition_mp.wait() # Modify reference_locked_cids consecutively - logging.debug(sync_begin_debug_msg) + self.fhs_logger.debug(sync_begin_debug_msg) self.object_locked_cids_mp.append(cid) else: with self.object_cid_condition_th: while cid in self.object_locked_cids_th: - logging.debug(sync_wait_msg) + self.fhs_logger.debug(sync_wait_msg) self.object_cid_condition_th.wait() - logging.debug(sync_begin_debug_msg) + self.fhs_logger.debug(sync_begin_debug_msg) self.object_locked_cids_th.append(cid) try: @@ -875,10 +820,10 @@ def delete_object(self, pid: str) -> None: # Delete cid reference file and object only if the cid refs file is empty if os.path.getsize(cid_ref_abs_path) == 0: debug_msg = ( - "FileHashStore - delete_object: cid_refs_file is empty (size == 0):" - + f" {cid_ref_abs_path} - deleting cid refs file and data object." + f"Cid reference file is empty (size == 0): {cid_ref_abs_path} - " + + "deleting cid reference file and data object." 
) - logging.debug(debug_msg) + self.fhs_logger.debug(debug_msg) objects_to_delete.append( self._rename_path_for_deletion(cid_ref_abs_path) ) @@ -893,36 +838,32 @@ def delete_object(self, pid: str) -> None: self.delete_metadata(pid) info_string = ( - "FileHashStore - delete_object: Successfully deleted references," - + f" metadata and object associated with pid: {pid}" + f"Successfully deleted references, metadata and object associated" + + f" with pid: {pid}" ) - logging.info(info_string) + self.fhs_logger.info(info_string) return finally: # Release cid - end_sync_debug_msg = ( - f"FileHashStore - delete_object: Releasing cid ({cid})" - + " from locked list" - ) + end_sync_debug_msg = f"Releasing cid ({cid}) from locked list" if self.use_multiprocessing: with self.object_cid_condition_mp: - logging.debug(end_sync_debug_msg) + self.fhs_logger.debug(end_sync_debug_msg) self.object_locked_cids_mp.remove(cid) self.object_cid_condition_mp.notify() else: with self.object_cid_condition_th: - logging.debug(end_sync_debug_msg) + self.fhs_logger.debug(end_sync_debug_msg) self.object_locked_cids_th.remove(cid) self.object_cid_condition_th.notify() except PidRefsDoesNotExist: warn_msg = ( - "FileHashStore - delete_object: pid refs file does not exist for pid: " - + pid - + ". Skipping object deletion. Deleting pid metadata documents." + f"Pid reference file does not exist for pid: {pid} Skipping object deletion. " + + "Deleting pid metadata documents." ) - logging.warning(warn_msg) + self.fhs_logger.warning(warn_msg) # Remove metadata files if they exist self.delete_metadata(pid) @@ -931,6 +872,12 @@ def delete_object(self, pid: str) -> None: self._delete_marked_files(objects_to_delete) return except OrphanPidRefsFileFound: + warn_msg = ( + f"Orphan pid reference file found for pid: {pid}. Skipping object deletion. " + + "Deleting pid reference file and related metadata documents." 
+ ) + self.fhs_logger.warning(warn_msg) + # Delete pid refs file pid_ref_abs_path = self._get_hashstore_pid_refs_path(pid) objects_to_delete.append( @@ -942,6 +889,13 @@ def delete_object(self, pid: str) -> None: self._delete_marked_files(objects_to_delete) return except RefsFileExistsButCidObjMissing: + warn_msg = ( + f"Reference files exist for pid: {pid}, but the data object is missing. " + + "Deleting pid reference file & related metadata documents. Handling cid " + + "reference file." + ) + self.fhs_logger.warning(warn_msg) + # Add pid refs file to be permanently deleted pid_ref_abs_path = self._get_hashstore_pid_refs_path(pid) objects_to_delete.append( @@ -959,6 +913,12 @@ def delete_object(self, pid: str) -> None: self._delete_marked_files(objects_to_delete) return except PidNotFoundInCidRefsFile: + warn_msg = ( + f"Pid {pid} not found in cid reference file. Deleting pid reference " + + "file and related metadata documents." + ) + self.fhs_logger.warning(warn_msg) + # Add pid refs file to be permanently deleted pid_ref_abs_path = self._get_hashstore_pid_refs_path(pid) objects_to_delete.append( @@ -971,27 +931,21 @@ def delete_object(self, pid: str) -> None: return finally: # Release pid - end_sync_debug_msg = ( - f"FileHashStore - delete_object: Releasing pid ({pid})" - + " from locked list" - ) + end_sync_debug_msg = f"Releasing pid ({pid}) from locked list" if self.use_multiprocessing: with self.object_pid_condition_mp: - logging.debug(end_sync_debug_msg) + self.fhs_logger.debug(end_sync_debug_msg) self.object_locked_pids_mp.remove(pid) self.object_pid_condition_mp.notify() else: # Release pid with self.object_pid_condition_th: - logging.debug(end_sync_debug_msg) + self.fhs_logger.debug(end_sync_debug_msg) self.object_locked_pids_th.remove(pid) self.object_pid_condition_th.notify() def delete_metadata(self, pid: str, format_id: Optional[str] = None) -> None: - logging.debug( - "FileHashStore - delete_metadata: Request to delete metadata for pid: %s", - 
pid, - ) + self.fhs_logger.debug("Request to delete metadata for pid: %s", pid) self._check_string(pid, "pid") checked_format_id = self._check_arg_format_id(format_id, "delete_metadata") metadata_directory = self._computehash(pid) @@ -1010,28 +964,28 @@ def delete_metadata(self, pid: str, format_id: Optional[str] = None) -> None: # Synchronize based on doc name # Wait for the pid to release if it's in use sync_begin_debug_msg = ( - f"FileHashStore - delete_metadata: Adding pid: {pid} to locked list, " - + f"with format_id: {checked_format_id} with doc name: {pid_doc}" + f"Adding pid: {pid} to locked list, with format_id: {checked_format_id} " + + f"with doc name: {pid_doc}" ) sync_wait_msg = ( - f"FileHashStore - delete_metadata: Pid: {pid} is locked for format_id:" - + f" {checked_format_id} with doc name: {pid_doc}. Waiting." + f"Pid: {pid} is locked for format_id: {checked_format_id} with doc name:" + + f" {pid_doc}. Waiting." ) if self.use_multiprocessing: with self.metadata_condition_mp: # Wait for the pid to release if it's in use while pid in self.metadata_locked_docs_mp: - logging.debug(sync_wait_msg) + self.fhs_logger.debug(sync_wait_msg) self.metadata_condition_mp.wait() # Modify metadata_locked_docs consecutively - logging.debug(sync_begin_debug_msg) + self.fhs_logger.debug(sync_begin_debug_msg) self.metadata_locked_docs_mp.append(pid_doc) else: with self.metadata_condition_th: while pid in self.metadata_locked_docs_th: - logging.debug(sync_wait_msg) + self.fhs_logger.debug(sync_wait_msg) self.metadata_condition_th.wait() - logging.debug(sync_begin_debug_msg) + self.fhs_logger.debug(sync_begin_debug_msg) self.metadata_locked_docs_th.append(pid_doc) try: # Mark metadata doc for deletion @@ -1039,87 +993,76 @@ def delete_metadata(self, pid: str, format_id: Optional[str] = None) -> None: finally: # Release pid end_sync_debug_msg = ( - f"FileHashStore - delete_metadata: Releasing pid doc ({pid_doc})" - + f" from locked list for pid: {pid} with format_id:" - 
+ checked_format_id + f"Releasing pid doc ({pid_doc}) from locked list for pid: {pid} with " + + f"format_id: {checked_format_id}" ) if self.use_multiprocessing: with self.metadata_condition_mp: - logging.debug(end_sync_debug_msg) + self.fhs_logger.debug(end_sync_debug_msg) self.metadata_locked_docs_mp.remove(pid_doc) self.metadata_condition_mp.notify() else: with self.metadata_condition_th: - logging.debug(end_sync_debug_msg) + self.fhs_logger.debug(end_sync_debug_msg) self.metadata_locked_docs_th.remove(pid_doc) self.metadata_condition_th.notify() # Delete metadata objects self._delete_marked_files(objects_to_delete) - info_string = ( - "FileHashStore - delete_metadata: Successfully deleted all metadata" - + f"for pid: {pid}", - ) - logging.info(info_string) + info_string = f"Successfully deleted all metadata for pid: {pid}" + self.fhs_logger.info(info_string) else: # Delete a specific metadata file pid_doc = self._computehash(pid + checked_format_id) # Wait for the pid to release if it's in use sync_begin_debug_msg = ( - f"FileHashStore - delete_metadata: Adding pid: {pid} to locked list, " - + f"with format_id: {checked_format_id} with doc name: {pid_doc}" + f"Adding pid: {pid} to locked list, with format_id: {checked_format_id} with doc " + + f"name: {pid_doc}" ) sync_wait_msg = ( - f"FileHashStore - delete_metadata: Pid: {pid} is locked for format_id:" - + f" {checked_format_id} with doc name: {pid_doc}. Waiting." + f"Pid: {pid} is locked for format_id: {checked_format_id} with doc name:" + + f" {pid_doc}. Waiting." 
) if self.use_multiprocessing: with self.metadata_condition_mp: # Wait for the pid to release if it's in use while pid in self.metadata_locked_docs_mp: - logging.debug(sync_wait_msg) + self.fhs_logger.debug(sync_wait_msg) self.metadata_condition_mp.wait() # Modify metadata_locked_docs consecutively - logging.debug(sync_begin_debug_msg) + self.fhs_logger.debug(sync_begin_debug_msg) self.metadata_locked_docs_mp.append(pid_doc) else: with self.metadata_condition_th: while pid in self.metadata_locked_docs_th: - logging.debug(sync_wait_msg) + self.fhs_logger.debug(sync_wait_msg) self.metadata_condition_th.wait() - logging.debug(sync_begin_debug_msg) + self.fhs_logger.debug(sync_begin_debug_msg) self.metadata_locked_docs_th.append(pid_doc) try: full_path_without_directory = Path(self.metadata / rel_path / pid_doc) self._delete("metadata", full_path_without_directory) - info_string = ( - "FileHashStore - delete_metadata: Successfully deleted metadata for pid:" - + f" {pid} for format_id: {format_id}" - ) - logging.info(info_string) + info_string = f"Successfully deleted metadata for pid: {pid} for format_id: {format_id}" + self.fhs_logger.info(info_string) finally: # Release pid end_sync_debug_msg = ( - f"FileHashStore - delete_metadata: Releasing pid doc ({pid_doc})" - + f" from locked list for pid: {pid} with format_id:" - + checked_format_id + f"Releasing pid doc ({pid_doc}) from locked list for pid: {pid} with " + f"format_id: {checked_format_id}" ) if self.use_multiprocessing: with self.metadata_condition_mp: - logging.debug(end_sync_debug_msg) + self.fhs_logger.debug(end_sync_debug_msg) self.metadata_locked_docs_mp.remove(pid_doc) self.metadata_condition_mp.notify() else: with self.metadata_condition_th: - logging.debug(end_sync_debug_msg) + self.fhs_logger.debug(end_sync_debug_msg) self.metadata_locked_docs_th.remove(pid_doc) self.metadata_condition_th.notify() def get_hex_digest(self, pid: str, algorithm: str) -> str: - logging.debug( - "FileHashStore - 
get_hex_digest: Request to get hex digest for object with pid: %s", - pid, - ) + self.fhs_logger.debug("Request to get hex digest for object with pid: %s", pid) self._check_string(pid, "pid") self._check_string(algorithm, "algorithm") @@ -1127,16 +1070,13 @@ def get_hex_digest(self, pid: str, algorithm: str) -> str: algorithm = self._clean_algorithm(algorithm) object_cid = self._find_object(pid).get("cid") if not self._exists(entity, object_cid): - err_msg = f"FileHashStore - get_hex_digest: No object found for pid: {pid}" - logging.error(err_msg) + err_msg = f"No object found for pid: {pid}" + self.fhs_logger.error(err_msg) raise ValueError(err_msg) cid_stream = self._open(entity, object_cid) hex_digest = self._computehash(cid_stream, algorithm=algorithm) - info_string = ( - f"FileHashStore - get_hex_digest: Successfully calculated hex digest for pid: {pid}." - + f" Hex Digest: {hex_digest}", - ) + info_string = f"Successfully calculated hex digest for pid: {pid}. Hex Digest: {hex_digest}" logging.info(info_string) return hex_digest From 47f6556f4edef297506256bacf75ffd3533118d2 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 26 Sep 2024 16:57:06 -0700 Subject: [PATCH 408/420] Fix bug in 'delete_object' where synchronization call was not made in try block, and another bug where a cid was not locked during an exception scenario --- src/hashstore/filehashstore.py | 121 +++++++++++++-------------------- 1 file changed, 47 insertions(+), 74 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 34523608..a4267546 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -761,27 +761,13 @@ def delete_object(self, pid: str) -> None: # will wait for a pid to be released if it's found to be in use before proceeding. sync_begin_debug_msg = f"Pid ({pid}) to locked list." sync_wait_msg = f"Pid ({pid}) is locked. Waiting." 
- if self.use_multiprocessing: - with self.object_pid_condition_mp: - # Wait for the pid to release if it's in use - while pid in self.object_locked_pids_mp: - self.fhs_logger.debug(sync_wait_msg) - self.object_pid_condition_mp.wait() - # Modify object_locked_pids consecutively - self.fhs_logger.debug(sync_begin_debug_msg) - self.object_locked_pids_mp.append(pid) - else: - with self.object_pid_condition_th: - while pid in self.object_locked_pids_th: - self.fhs_logger.debug(sync_wait_msg) - self.object_pid_condition_th.wait() - self.fhs_logger.debug(sync_begin_debug_msg) - self.object_locked_pids_th.append(pid) try: # Before we begin deletion process, we look for the `cid` by calling # `find_object` which will throw custom exceptions if there is an issue with # the reference files, which help us determine the path to proceed with. + self._synchronize_object_locked_pids(pid) + try: object_info_dict = self._find_object(pid) cid = object_info_dict.get("cid") @@ -789,24 +775,7 @@ def delete_object(self, pid: str) -> None: # Proceed with next steps - cid has been retrieved without any issues # We must synchronize here based on the `cid` because multiple threads may # try to access the `cid_reference_file` - sync_begin_debug_msg = f"Cid ({cid}) to locked list." - sync_wait_msg = f"Cid ({cid}) is locked. Waiting." 
- if self.use_multiprocessing: - with self.object_cid_condition_mp: - # Wait for the cid to release if it's in use - while cid in self.object_locked_cids_mp: - self.fhs_logger.debug(sync_wait_msg) - self.object_cid_condition_mp.wait() - # Modify reference_locked_cids consecutively - self.fhs_logger.debug(sync_begin_debug_msg) - self.object_locked_cids_mp.append(cid) - else: - with self.object_cid_condition_th: - while cid in self.object_locked_cids_th: - self.fhs_logger.debug(sync_wait_msg) - self.object_cid_condition_th.wait() - self.fhs_logger.debug(sync_begin_debug_msg) - self.object_locked_cids_th.append(cid) + self._synchronize_object_locked_cids(cid) try: cid_ref_abs_path = object_info_dict.get("cid_refs_path") @@ -846,31 +815,8 @@ def delete_object(self, pid: str) -> None: finally: # Release cid - end_sync_debug_msg = f"Releasing cid ({cid}) from locked list" - if self.use_multiprocessing: - with self.object_cid_condition_mp: - self.fhs_logger.debug(end_sync_debug_msg) - self.object_locked_cids_mp.remove(cid) - self.object_cid_condition_mp.notify() - else: - with self.object_cid_condition_th: - self.fhs_logger.debug(end_sync_debug_msg) - self.object_locked_cids_th.remove(cid) - self.object_cid_condition_th.notify() - - except PidRefsDoesNotExist: - warn_msg = ( - f"Pid reference file does not exist for pid: {pid} Skipping object deletion. " - + "Deleting pid metadata documents." - ) - self.fhs_logger.warning(warn_msg) + self._release_object_locked_cids(cid) - # Remove metadata files if they exist - self.delete_metadata(pid) - - # Remove all files confirmed for deletion - self._delete_marked_files(objects_to_delete) - return except OrphanPidRefsFileFound: warn_msg = ( f"Orphan pid reference file found for pid: {pid}. Skipping object deletion. 
" @@ -903,10 +849,16 @@ def delete_object(self, pid: str) -> None: ) # Remove pid from cid refs file pid_refs_cid = self._read_small_file_content(pid_ref_abs_path) - cid_ref_abs_path = self._get_hashstore_cid_refs_path(pid_refs_cid) - # Remove if the pid refs is found - if self._is_string_in_refs_file(pid, cid_ref_abs_path): - self._update_refs_file(cid_ref_abs_path, pid, "remove") + try: + self._synchronize_object_locked_cids(pid_refs_cid) + + cid_ref_abs_path = self._get_hashstore_cid_refs_path(pid_refs_cid) + # Remove if the pid refs is found + if self._is_string_in_refs_file(pid, cid_ref_abs_path): + self._update_refs_file(cid_ref_abs_path, pid, "remove") + finally: + self._release_object_locked_cids(pid_refs_cid) + # Remove metadata files if they exist self.delete_metadata(pid) # Remove all files confirmed for deletion @@ -931,18 +883,7 @@ def delete_object(self, pid: str) -> None: return finally: # Release pid - end_sync_debug_msg = f"Releasing pid ({pid}) from locked list" - if self.use_multiprocessing: - with self.object_pid_condition_mp: - self.fhs_logger.debug(end_sync_debug_msg) - self.object_locked_pids_mp.remove(pid) - self.object_pid_condition_mp.notify() - else: - # Release pid - with self.object_pid_condition_th: - self.fhs_logger.debug(end_sync_debug_msg) - self.object_locked_pids_th.remove(pid) - self.object_pid_condition_th.notify() + self._release_object_locked_pids(pid) def delete_metadata(self, pid: str, format_id: Optional[str] = None) -> None: self.fhs_logger.debug("Request to delete metadata for pid: %s", pid) @@ -2681,6 +2622,38 @@ def _get_hashstore_cid_refs_path(self, cid: str) -> Path: # Synchronization Methods + def _synchronize_object_locked_pids(self, pid: str) -> None: + """Threads must work with 'pid's one identifier at a time to ensure thread safety when + handling requests to store, delete or tag pids. 
+ + :param str pid: Persistent or authority-based identifier + """ + if self.use_multiprocessing: + with self.object_pid_condition_mp: + # Wait for the pid to release if it's being tagged + while pid in self.object_locked_pids_mp: + logging.debug( + f"_synchronize_object_locked_pids: Pid ({pid}) is locked. Waiting." + ) + self.object_pid_condition_mp.wait() + self.object_locked_pids_mp.append(pid) + logging.debug( + f"_synchronize_object_locked_pids: Synchronizing object_locked_pids_mp for" + + f" pid: {pid}" + ) + else: + with self.object_pid_condition_th: + while pid in self.object_locked_pids_th: + logging.debug( + f"_synchronize_object_locked_pids: Pid ({pid}) is locked. Waiting." + ) + self.object_pid_condition_th.wait() + self.object_locked_pids_th.append(pid) + logging.debug( + f"_synchronize_object_locked_pids: Synchronizing object_locked_pids_th for" + + f" cid: {pid}" + ) + def _release_object_locked_pids(self, pid: str) -> None: """Remove the given persistent identifier from 'object_locked_pids' and notify other waiting threads or processes.
From f70243e57d40ca74f4a7cc2d7ad55b4cd460f3cd Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 26 Sep 2024 17:23:23 -0700 Subject: [PATCH 409/420] Fix typo in logging message in '_synchronize_object_locked_pids' --- src/hashstore/filehashstore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index a4267546..0213e499 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2651,7 +2651,7 @@ def _synchronize_object_locked_pids(self, pid: str) -> None: self.object_locked_pids_th.append(pid) logging.debug( f"_synchronize_object_locked_pids: Synchronizing object_locked_pids_th for" - + f" cid: {pid}" + + f" pid: {pid}" ) def _release_object_locked_pids(self, pid: str) -> None: From 276c6e74769ed9dbcc16b92ee85d32dd2f9a2aeb Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 27 Sep 2024 09:15:54 -0700 Subject: [PATCH 410/420] Clean-up logging in 'filehashstore' supporting and core methods part. 
1 --- src/hashstore/filehashstore.py | 397 ++++++++++++--------------------- 1 file changed, 148 insertions(+), 249 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 0213e499..14fb9cb5 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1037,9 +1037,7 @@ def _find_object(self, pid: str) -> Dict[str, str]: - pid_refs_path: path to the pid refs file - sysmeta_path: path to the sysmeta file """ - logging.debug( - "FileHashStore - find_object: Request to find object for for pid: %s", pid - ) + self.fhs_logger.debug("Request to find object for pid: %s", pid) self._check_string(pid, "pid") pid_ref_abs_path = self._get_hashstore_pid_refs_path(pid) @@ -1055,11 +1053,10 @@ def _find_object(self, pid: str) -> Dict[str, str]: # Object must also exist in order to return the cid retrieved if not self._exists("objects", pid_refs_cid): err_msg = ( - f"FileHashStore - find_object: Refs file found for pid ({pid}) at" - + str(pid_ref_abs_path) + f"Reference file found for pid ({pid}) at {pid_ref_abs_path}" + f", but object referenced does not exist, cid: {pid_refs_cid}" ) - logging.error(err_msg) + self.fhs_logger.error(err_msg) raise RefsFileExistsButCidObjMissing(err_msg) else: sysmeta_doc_name = self._computehash(pid + self.sysmeta_ns) @@ -1087,25 +1084,23 @@ def _find_object(self, pid: str) -> Dict[str, str]: else: # If not, it is an orphan pid refs file err_msg = ( - "FileHashStore - find_object: pid refs file exists with cid: " - + f"{pid_refs_cid} for pid: {pid} but is missing from cid refs file:" - + str(cid_ref_abs_path) + f"Pid reference file exists with cid: {pid_refs_cid} for pid: {pid} but " + f"is missing from cid refs file: {cid_ref_abs_path}" ) - logging.error(err_msg) + self.fhs_logger.error(err_msg) raise PidNotFoundInCidRefsFile(err_msg) else: err_msg = ( - f"FileHashStore - find_object: pid refs file exists with cid: {pid_refs_cid}" - + f", but cid refs file not found:
{cid_ref_abs_path} for pid: {pid}" + f"Pid reference file exists with cid: {pid_refs_cid} but cid reference file " + + f"not found: {cid_ref_abs_path} for pid: {pid}" ) - logging.error(err_msg) + self.fhs_logger.error(err_msg) raise OrphanPidRefsFileFound(err_msg) else: err_msg = ( - f"FileHashStore - find_object: pid refs file not found for pid ({pid}): " - + str(pid_ref_abs_path) + f"Pid reference file not found for pid ({pid}): {pid_ref_abs_path}" ) - logging.error(err_msg) + self.fhs_logger.error(err_msg) raise PidRefsDoesNotExist(err_msg) def _store_and_validate_data( @@ -1134,9 +1129,7 @@ def _store_and_validate_data( """ stream = Stream(file) - logging.debug( - "FileHashStore - put_object: Request to put object for pid: %s", pid - ) + self.fhs_logger.debug("Request to put object for pid: %s", pid) with closing(stream): ( object_cid, @@ -1154,10 +1147,7 @@ def _store_and_validate_data( object_metadata = ObjectMetadata( pid, object_cid, obj_file_size, hex_digest_dict ) - logging.debug( - "FileHashStore - put_object: Successfully put object for pid: %s", - pid, - ) + self.fhs_logger.debug("Successfully put object for pid: %s", pid) return object_metadata def _store_data_only(self, data: Union[str, bytes]) -> "ObjectMetadata": @@ -1174,9 +1164,7 @@ def _store_data_only(self, data: Union[str, bytes]) -> "ObjectMetadata": :return: ObjectMetadata - object that contains the object ID, object file size, and hex digest dictionary. """ - logging.debug( - "FileHashStore - _store_data_only: Request to store data object only." 
- ) + self.fhs_logger.debug("Request to store data object only.") try: # Ensure the data is a stream @@ -1198,18 +1186,12 @@ def _store_data_only(self, data: Union[str, bytes]) -> "ObjectMetadata": ) # The permanent address of the data stored is based on the data's checksum cid = hex_digest_dict.get(self.algorithm) - logging.debug( - "FileHashStore - _store_data_only: Successfully stored object with cid: %s", - cid, - ) + self.fhs_logger.debug("Successfully stored object with cid: %s", cid) return object_metadata # pylint: disable=W0718 except Exception as err: - err_msg = ( - "FileHashStore - _store_data_only: failed to store object." - + f" Unexpected {err=}, {type(err)=}" - ) - logging.error(err_msg) + err_msg = f"Failed to store object. Unexpected {err=}, {type(err)=}" + self.fhs_logger.error(err_msg) raise err def _move_and_get_checksums( @@ -1239,11 +1221,8 @@ def _move_and_get_checksums( :return: tuple - Object ID, object file size, and hex digest dictionary. """ - debug_msg = ( - "FileHashStore - _move_and_get_checksums: Creating temp" - + f" file and calculating checksums for pid: {pid}" - ) - logging.debug(debug_msg) + debug_msg = f"Creating temp file and calculating checksums for pid: {pid}" + self.fhs_logger.debug(debug_msg) ( hex_digests, tmp_file_name, @@ -1251,10 +1230,7 @@ def _move_and_get_checksums( ) = self._write_to_tmp_file_and_get_hex_digests( stream, additional_algorithm, checksum_algorithm ) - logging.debug( - "FileHashStore - _move_and_get_checksums: Temp file created: %s", - tmp_file_name, - ) + self.fhs_logger.debug("Temp file created: %s", tmp_file_name) # Objects are stored with their content identifier based on the store algorithm object_cid = hex_digests.get(self.algorithm) @@ -1276,60 +1252,46 @@ def _move_and_get_checksums( ) self._create_path(Path(os.path.dirname(abs_file_path))) try: - debug_msg = ( - "FileHashStore - _move_and_get_checksums: Moving temp file to permanent" - + f" location: {abs_file_path}", - ) - 
logging.debug(debug_msg) + debug_msg = f"Moving temp file to permanent location: {abs_file_path}" + self.fhs_logger.debug(debug_msg) shutil.move(tmp_file_name, abs_file_path) except Exception as err: # Revert storage process - err_msg = ( - "FileHashStore - _move_and_get_checksums:" - + f" Unexpected Error: {err}" - ) - logging.warning(err_msg) + err_msg = f" Unexpected Error: {err}" + self.fhs_logger.warning(err_msg) if os.path.isfile(abs_file_path): # Check to see if object exists before determining whether to delete debug_msg = ( - "FileHashStore - _move_and_get_checksums: Permanent file" - + f" found during exception, checking hex digest for pid: {pid}" + f"Permanent file found, checking hex digest for pid: {pid}" ) - logging.debug(debug_msg) + self.fhs_logger.debug(debug_msg) pid_checksum = self.get_hex_digest(pid, self.algorithm) if pid_checksum == hex_digests.get(self.algorithm): # If the checksums match, return and log warning err_msg = ( - "FileHashStore - _move_and_get_checksums: Object exists at:" - + f" {abs_file_path} but an unexpected issue has been encountered." - + " Reference files will not be created and/or tagged." + f"Object exists at: {abs_file_path} but an unexpected issue has been " + + "encountered. Reference files will not be created and/or tagged." ) - logging.warning(err_msg) + self.fhs_logger.warning(err_msg) raise err else: debug_msg = ( - "FileHashStore - _move_and_get_checksums: Object exists at" - + f"{abs_file_path} but the pid object checksum provided does not" - + " match what has been calculated. Deleting object. References will" - + " not be created and/or tagged.", + f"Object exists at {abs_file_path} but the pid object checksum " + + "provided does not match what has been calculated. Deleting object. 
" + + "References will not be created and/or tagged.", ) - logging.debug(debug_msg) + self.fhs_logger.debug(debug_msg) self._delete("objects", abs_file_path) raise err else: - logging.debug( - "FileHashStore - _move_and_get_checksums: Deleting temporary file: %s", - tmp_file_name, - ) + self.fhs_logger.debug("Deleting temporary file: %s", tmp_file_name) self._delete("tmp", tmp_file_name) err_msg = ( f"Object has not been stored for pid: {pid} - an unexpected error has " - f"occurred when moving tmp file to: {object_cid}. Reference files will " - f"not be created and/or tagged. Error: {err}" - ) - logging.warning( - "FileHashStore - _move_and_get_checksums: %s", err_msg + + f"occurred when moving tmp file to: {object_cid}. Reference files will " + + f"not be created and/or tagged. Error: {err}" ) + self.fhs_logger.warning(err_msg) raise else: # If the data object already exists, do not move the file but attempt to verify it @@ -1347,20 +1309,20 @@ def _move_and_get_checksums( except NonMatchingObjSize as nmose: # If any exception is thrown during validation, we do not tag. err_msg = ( - f"FileHashStore - _move_and_get_checksums: Object already exists for pid: {pid}" - + " , deleting temp file. Reference files will not be created and/or tagged" - + f" due to an issue with the supplied pid object metadata. {str(nmose)}" + f"Object already exists for pid: {pid}, deleting temp file. Reference files " + + "will not be created and/or tagged due to an issue with the supplied pid " + + f"object metadata. {str(nmose)}" ) - logging.debug(err_msg) + self.fhs_logger.debug(err_msg) raise NonMatchingObjSize(err_msg) from nmose except NonMatchingChecksum as nmce: # If any exception is thrown during validation, we do not tag. err_msg = ( - f"FileHashStore - _move_and_get_checksums: Object already exists for pid: {pid}" - + " , deleting temp file. Reference files will not be created and/or tagged" - + f" due to an issue with the supplied pid object metadata. 
{str(nmce)}" + f"Object already exists for pid: {pid}, deleting temp file. Reference files " + + "will not be created and/or tagged due to an issue with the supplied pid " + + f"object metadata. {str(nmce)}" ) - logging.debug(err_msg) + self.fhs_logger.debug(err_msg) raise NonMatchingChecksum(err_msg) from nmce finally: # Ensure that the tmp file has been removed, the data object already exists, so it @@ -1397,10 +1359,8 @@ def _write_to_tmp_file_and_get_hex_digests( tmp_root_path = self._get_store_path("objects") / "tmp" tmp = self._mktmpfile(tmp_root_path) - logging.debug( - "FileHashStore - _write_to_tmp_file_and_get_hex_digests: tmp file created:" - + " %s, calculating hex digests.", - tmp.name, + self.fhs_logger.debug( + "Tmp file created: %s, calculating hex digests.", tmp.name ) tmp_file_completion_flag = False @@ -1416,10 +1376,8 @@ def _write_to_tmp_file_and_get_hex_digests( for hash_algorithm in hash_algorithms: hash_algorithm.update(self._cast_to_bytes(data)) - logging.debug( - "FileHashStore - _write_to_tmp_file_and_get_hex_digests: Object stream" - + " successfully written to tmp file: %s", - tmp.name, + self.fhs_logger.debug( + "Object stream successfully written to tmp file: %s", tmp.name ) hex_digest_list = [ @@ -1430,25 +1388,17 @@ def _write_to_tmp_file_and_get_hex_digests( # Ready for validation and atomic move tmp_file_completion_flag = True - logging.debug( - "FileHashStore - _write_to_tmp_file_and_get_hex_digests: Hex digests calculated." 
- ) + self.fhs_logger.debug("Hex digests calculated.") return hex_digest_dict, tmp.name, tmp_file_size # pylint: disable=W0718 except Exception as err: - err_msg = ( - "FileHashStore - _write_to_tmp_file_and_get_hex_digests:" - + f" Unexpected {err=}, {type(err)=}" - ) - logging.error(err_msg) + err_msg = f"Unexpected {err=}, {type(err)=}" + self.fhs_logger.error(err_msg) # pylint: disable=W0707,W0719 raise Exception(err_msg) except KeyboardInterrupt: - err_msg = ( - "FileHashStore - _write_to_tmp_file_and_get_hex_digests:" - + " Keyboard interruption by user." - ) - logging.error(err_msg) + err_msg = "Keyboard interruption by user." + self.fhs_logger.error(err_msg) if os.path.isfile(tmp.name): os.remove(tmp.name) finally: @@ -1459,11 +1409,10 @@ def _write_to_tmp_file_and_get_hex_digests( # pylint: disable=W0718 except Exception as err: err_msg = ( - "FileHashStore - _write_to_tmp_file_and_get_hex_digests:" - + f"Unexpected {err=} while attempting to" - + f" delete tmp file: {tmp.name}, {type(err)=}" + f"Unexpected {err=} while attempting to delete tmp file: " + + f"{tmp.name}, {type(err)=}" ) - logging.error(err_msg) + self.fhs_logger.error(err_msg) def _mktmpfile(self, path: Path) -> IO[bytes]: """Create a temporary file at the given path ready to be written. @@ -1519,8 +1468,7 @@ def _store_hashstore_refs_files(self, pid: str, cid: str) -> None: # are expected to be and throw an exception to inform the client that everything # is in place - and include other issues for context err_msg = ( - f"FileHashStore - store_hashstore_refs_files: Object with cid: {cid}" - f" already exists and is tagged with pid: {pid}." + f"Object with cid: {cid} exists and is tagged with pid: {pid}." 
) try: self._verify_hashstore_references( @@ -1530,11 +1478,11 @@ def _store_hashstore_refs_files(self, pid: str, cid: str) -> None: cid_refs_path, "Refs file already exists, verifying.", ) - logging.error(err_msg) + self.fhs_logger.error(err_msg) raise HashStoreRefsAlreadyExists(err_msg) except Exception as e: rev_msg = err_msg + " " + str(e) - logging.error(rev_msg) + self.fhs_logger.error(rev_msg) raise HashStoreRefsAlreadyExists(err_msg) elif os.path.isfile(pid_refs_path) and not os.path.isfile( @@ -1542,21 +1490,18 @@ def _store_hashstore_refs_files(self, pid: str, cid: str) -> None: ): # If pid refs exists, the pid has already been claimed and cannot be tagged we # throw an exception immediately - error_msg = ( - f"FileHashStore - store_hashstore_refs_files: Pid refs file already exists" - f" for pid: {pid}." - ) - logging.error(error_msg) + error_msg = f"Pid refs file already exists for pid: {pid}." + self.fhs_logger.error(error_msg) raise PidRefsAlreadyExistsError(error_msg) elif not os.path.isfile(pid_refs_path) and os.path.isfile( cid_refs_path ): debug_msg = ( - f"FileHashStore - store_hashstore_refs_files: pid refs file does not exist" - f" for pid {pid} but cid refs file found at: {cid_refs_path} for cid: {cid}" + f"Pid reference file does not exist for pid {pid} but cid refs file " + + f"found at: {cid_refs_path} for cid: {cid}" ) - logging.debug(debug_msg) + self.fhs_logger.debug(debug_msg) # Move the pid refs file pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") shutil.move(pid_tmp_file_path, pid_refs_path) @@ -1570,11 +1515,8 @@ def _store_hashstore_refs_files(self, pid: str, cid: str) -> None: cid_refs_path, f"Updated existing cid refs file: {cid_refs_path} with pid: {pid}", ) - info_msg = ( - "FileHashStore - store_hashstore_refs_files: Successfully updated " - f"cid: {cid} with pid: {pid}" - ) - logging.info(info_msg) + info_msg = f"Successfully updated cid: {cid} with pid: {pid}" + self.fhs_logger.info(info_msg) return # 
Move both files after checking the existing status of refs files @@ -1586,11 +1528,8 @@ def _store_hashstore_refs_files(self, pid: str, cid: str) -> None: self._verify_hashstore_references( pid, cid, pid_refs_path, cid_refs_path, log_msg ) - info_msg = ( - "FileHashStore - store_hashstore_refs_files: Successfully updated " - f"cid: {cid} with pid: {pid}" - ) - logging.info(info_msg) + info_msg = f"Successfully updated cid: {cid} with pid: {pid}" + self.fhs_logger.info(info_msg) except ( HashStoreRefsAlreadyExists, @@ -1598,11 +1537,13 @@ def _store_hashstore_refs_files(self, pid: str, cid: str) -> None: ) as expected_exceptions: raise expected_exceptions - except Exception as unexpected_exception: + except Exception as ue: # For all other unexpected exceptions, we are to revert the tagging process as # much as possible. No exceptions from the reverting process will be thrown. + err_msg = f"Unexpected exception: {ue}, reverting tagging process (untag obj)." + self.fhs_logger.error(err_msg) self._untag_object(pid, cid) - raise unexpected_exception + raise ue finally: # Release cid @@ -1647,8 +1588,8 @@ def _untag_object(self, pid: str, cid: str) -> None: ) # Remove all files confirmed for deletion self._delete_marked_files(untag_obj_delete_list) - info_msg = f"_untag_object: Untagged pid: {pid} with cid: {cid}" - logging.info(info_msg) + info_msg = f"Untagged pid: {pid} with cid: {cid}" + self.fhs_logger.info(info_msg) except OrphanPidRefsFileFound as oprff: # `find_object` throws this exception when the cid refs file doesn't exist, @@ -1664,11 +1605,10 @@ def _untag_object(self, pid: str, cid: str) -> None: self._delete_marked_files(untag_obj_delete_list) warn_msg = ( - f"_untag_object: Cid refs file does not exist for pid: {pid}." - + " Deleted orphan pid refs file. Additional info: " - + str(oprff) + f"Cid refs file does not exist for pid: {pid}. Deleted orphan pid refs file. 
" + f"Additional info: {oprff}" ) - logging.warning(warn_msg) + self.fhs_logger.warning(warn_msg) except RefsFileExistsButCidObjMissing as rfebcom: # `find_object` throws this exception when both pid/cid refs files exist but the @@ -1690,11 +1630,11 @@ def _untag_object(self, pid: str, cid: str) -> None: self._delete_marked_files(untag_obj_delete_list) warn_msg = ( - f"_untag_object: data object for cid: {cid_read}. does not exist, but pid and cid " - f"references files found for pid: {pid}, Deleted pid and cid refs files. " - f"Additional info: " + str(rfebcom) + f"data object for cid: {cid_read}. does not exist, but pid and cid references " + + f"files found for pid: {pid}, Deleted pid and cid refs files. " + + f"Additional info: {rfebcom}" ) - logging.warning(warn_msg) + self.fhs_logger.warning(warn_msg) except PidNotFoundInCidRefsFile as pnficrf: # `find_object` throws this exception when both the pid and cid refs file exists @@ -1710,11 +1650,10 @@ def _untag_object(self, pid: str, cid: str) -> None: self._delete_marked_files(untag_obj_delete_list) warn_msg = ( - f"_untag_object: pid not found in expected cid refs file for pid: {pid}. " - + "Deleted orphan pid refs file. Additional info: " - + str(pnficrf) + f"Pid not found in expected cid refs file for pid: {pid}. Deleted orphan pid refs " + f"file. Additional info: {pnficrf}" ) - logging.warning(warn_msg) + self.fhs_logger.warning(warn_msg) except PidRefsDoesNotExist as prdne: # `find_object` throws this exception if the pid refs file is not found @@ -1730,10 +1669,10 @@ def _untag_object(self, pid: str, cid: str) -> None: self._delete_marked_files(untag_obj_delete_list) warn_msg = ( - f"Pid refs file not found, removed pid from cid refs file for cid: {cid}" - + str(prdne) + "Pid refs file not found, removed pid from cid reference file for cid:" + + f" {cid}. 
Additional info: {prdne}" ) - logging.warning(warn_msg) + self.fhs_logger.warning(warn_msg) def _put_metadata( self, metadata: Union[str, bytes], pid: str, metadata_doc_name: str @@ -1747,9 +1686,7 @@ def _put_metadata( :return: Address of the metadata document. """ - logging.debug( - "FileHashStore - _put_metadata: Request to put metadata for pid: %s", pid - ) + self.fhs_logger.debug("Request to put metadata for pid: %s", pid) # Create metadata tmp file and write to it metadata_stream = Stream(metadata) with closing(metadata_stream): @@ -1768,30 +1705,22 @@ def _put_metadata( parent.mkdir(parents=True, exist_ok=True) # Metadata will be replaced if it exists shutil.move(metadata_tmp, full_path) - logging.debug( - "FileHashStore - _put_metadata: Successfully put metadata for pid: %s", - pid, - ) + self.fhs_logger.debug("Successfully put metadata for pid: %s", pid) return full_path except Exception as err: - err_msg = ( - f"FileHashStore - _put_metadata: Unexpected {err=}, {type(err)=}" - ) - logging.error(err_msg) + err_msg = f"Unexpected {err=}, {type(err)=}" + self.fhs_logger.error(err_msg) if os.path.isfile(metadata_tmp): # Remove tmp metadata, calling app must re-upload - logging.debug( - "FileHashStore - _put_metadata: Deleting metadata for pid: %s", - pid, - ) + self.fhs_logger.debug("Deleting metadata for pid: %s", pid) self._delete("metadata", metadata_tmp) raise else: err_msg = ( - f"FileHashStore - _put_metadata: Attempt to move metadata for pid: {pid}" - + f", but metadata temp file not found: {metadata_tmp}" + f"Attempted to move metadata for pid: {pid}, but metadata temp file not found:" + + f" {metadata_tmp}" ) - logging.error(err_msg) + self.fhs_logger.error(err_msg) raise FileNotFoundError(err_msg) def _mktmpmetadata(self, stream: "Stream") -> str: @@ -1806,18 +1735,12 @@ def _mktmpmetadata(self, stream: "Stream") -> str: tmp = self._mktmpfile(tmp_root_path) # tmp is a file-like object that is already opened for writing by default - logging.debug( - 
"FileHashStore - _mktmpmetadata: Writing stream to tmp metadata file: %s", - tmp.name, - ) + self.fhs_logger.debug("Writing stream to tmp metadata file: %s", tmp.name) with tmp as tmp_file: for data in stream: tmp_file.write(self._cast_to_bytes(data)) - logging.debug( - "FileHashStore - _mktmpmetadata: Successfully written to tmp metadata file: %s", - tmp.name, - ) + self.fhs_logger.debug("Successfully written to tmp metadata file: %s", tmp.name) return tmp.name # FileHashStore Utility & Supporting Methods @@ -1836,7 +1759,7 @@ def _delete_marked_files(delete_list: list[str]) -> None: warn_msg = f"Unable to remove {obj} in given delete_list. " + str(e) logging.warning(warn_msg) else: - raise ValueError("delete_marked_files: list cannot be None") + raise ValueError("list cannot be None") def _mark_pid_refs_file_for_deletion( self, pid: str, delete_list: List[str], pid_refs_path: Path @@ -1851,11 +1774,8 @@ def _mark_pid_refs_file_for_deletion( delete_list.append(self._rename_path_for_deletion(pid_refs_path)) except Exception as e: - err_msg = ( - f"Unable to delete pid refs file: {pid_refs_path} for pid: {pid}. " - + str(e) - ) - logging.error(err_msg) + err_msg = f"Unable to delete pid refs file: {pid_refs_path} for pid: {pid}. Details: {e}" + self.fhs_logger.error(err_msg) def _remove_pid_and_handle_cid_refs_deletion( self, pid: str, delete_list: List[str], cid_refs_path: Path @@ -1879,7 +1799,7 @@ def _remove_pid_and_handle_cid_refs_deletion( f"Unable to delete remove pid from cid refs file: {cid_refs_path} for pid:" f" {pid}. 
" + str(e) ) - logging.error(err_msg) + self.fhs_logger.error(err_msg) def _validate_and_check_cid_lock( self, pid: str, cid: str, cid_to_check: str @@ -1899,6 +1819,7 @@ def _validate_and_check_cid_lock( f"_validate_and_check_cid_lock: cid provided: {cid_to_check} does not " f"match untag request for cid: {cid} and pid: {pid}" ) + self.fhs_logger.error(err_msg) raise ValueError(err_msg) self._check_object_locked_cids(cid) @@ -1914,11 +1835,7 @@ def _write_refs_file(self, path: Path, ref_id: str, ref_type: str) -> str: :return: tmp_file_path - Path to the tmp refs file """ - logging.debug( - "FileHashStore - _write_refs_file: Writing id (%s) into a tmp file in: %s", - ref_id, - path, - ) + self.fhs_logger.debug("Writing id (%s) into a tmp file in: %s", ref_id, path) try: with self._mktmpfile(path) as tmp_file: tmp_file_path = tmp_file.name @@ -1931,10 +1848,10 @@ def _write_refs_file(self, path: Path, ref_id: str, ref_type: str) -> str: except Exception as err: err_msg = ( - "FileHashStore - _write_refs_file: failed to write cid refs file for pid:" - + f" {ref_id} into path: {path}. Unexpected {err=}, {type(err)=}" + f"Failed to write cid refs file for pid: {ref_id} into path: {path}. " + + f"Unexpected error: {err=}, {type(err)=}" ) - logging.error(err_msg) + self.fhs_logger.error(err_msg) raise err def _update_refs_file( @@ -1946,17 +1863,14 @@ def _update_refs_file( :param str ref_id: Authority-based or persistent identifier of the object. :param str update_type: 'add' or 'remove' """ - debug_msg = ( - f"FileHashStore - _update_refs_file: Updating ({update_type}) for ref_id: {ref_id}" - + f" at refs file: {refs_file_path}." - ) - logging.debug(debug_msg) + debug_msg = f"Updating ({update_type}) for ref_id: {ref_id} at refs file: {refs_file_path}." + self.fhs_logger.debug(debug_msg) if not os.path.isfile(refs_file_path): err_msg = ( - f"FileHashStore - _update_refs_file: {refs_file_path} does not exist." 
- + f" Cannot {update_type} ref_id: {ref_id}" + f"Refs file: {refs_file_path} does not exist." + + f"Cannot {update_type} ref_id: {ref_id}" ) - logging.error(err_msg) + self.fhs_logger.error(err_msg) raise FileNotFoundError(err_msg) try: if update_type == "add": @@ -1982,16 +1896,16 @@ def _update_refs_file( ref_file.writelines(new_pid_lines) ref_file.truncate() debug_msg = ( - f"FileHashStore - _update_refs_file: Update ({update_type}) for ref_id: {ref_id}" - + f" completed on refs file: {refs_file_path}." + f"Update ({update_type}) for ref_id: {ref_id} " + + f"completed on refs file: {refs_file_path}." ) - logging.debug(debug_msg) + self.fhs_logger.debug(debug_msg) except Exception as err: err_msg = ( - f"FileHashStore - _update_refs_file: failed to {update_type} for ref_id: {ref_id}" + f"Failed to {update_type} for ref_id: {ref_id}" + f" at refs file: {refs_file_path}. Unexpected {err=}, {type(err)=}" ) - logging.error(err_msg) + self.fhs_logger.error(err_msg) raise err @staticmethod @@ -2037,20 +1951,18 @@ def _verify_object_information( if file_size_to_validate is not None and file_size_to_validate > 0: if file_size_to_validate != tmp_file_size: err_msg = ( - "FileHashStore - _verify_object_information: Object file size calculated: " - + f" {tmp_file_size} does not match with expected size:" - + f" {file_size_to_validate}." + f"Object file size calculated: {tmp_file_size} does not match with expected " + f"size: {file_size_to_validate}." 
) if pid is not None: self._delete(entity, tmp_file_name) err_msg_for_pid = ( - err_msg - + f" Tmp file deleted and file not stored for pid: {pid}" + f"{err_msg} Tmp file deleted and file not stored for pid: {pid}" ) - logging.debug(err_msg_for_pid) + self.fhs_logger.debug(err_msg_for_pid) raise NonMatchingObjSize(err_msg_for_pid) else: - logging.debug(err_msg) + self.fhs_logger.debug(err_msg) raise NonMatchingObjSize(err_msg) if checksum_algorithm is not None and checksum is not None: if checksum_algorithm not in hex_digests: @@ -2071,21 +1983,19 @@ def _verify_object_information( ) if hex_digest_calculated != checksum: err_msg = ( - "FileHashStore - _verify_object_information: checksum_algorithm" - + f" ({checksum_algorithm}) cannot be found in the default hex digests" - + f" dict, but is supported. New checksum calculated: " - f"{hex_digest_calculated}, does not match what has been provided: " + f"Checksum_algorithm ({checksum_algorithm}) cannot be found in the " + + "default hex digests dict, but is supported. New checksum calculated: " + + f"{hex_digest_calculated}, does not match what has been provided: " + checksum ) - logging.debug(err_msg) + self.fhs_logger.debug(err_msg) raise NonMatchingChecksum(err_msg) else: hex_digest_stored = hex_digests[checksum_algorithm] if hex_digest_stored != checksum.lower(): err_msg = ( - "FileHashStore - _verify_object_information: Hex digest and checksum" - + f" do not match - file not stored for pid: {pid}. Algorithm:" - + f" {checksum_algorithm}. Checksum provided: {checksum} !=" + f"Hex digest and checksum do not match - file not stored for pid: {pid}. " + + f"Algorithm: {checksum_algorithm}. Checksum provided: {checksum} !=" + f" HexDigest: {hex_digest_stored}." ) if pid is not None: @@ -2094,10 +2004,10 @@ def _verify_object_information( err_msg_for_pid = ( err_msg + f" Tmp file ({tmp_file_name}) deleted." 
) - logging.debug(err_msg_for_pid) + self.fhs_logger.error(err_msg_for_pid) raise NonMatchingChecksum(err_msg_for_pid) else: - logging.debug(err_msg) + self.fhs_logger.error(err_msg) raise NonMatchingChecksum(err_msg) def _verify_hashstore_references( @@ -2117,11 +2027,8 @@ def _verify_hashstore_references( :param path cid_refs_path: Path to cid refs file :param str additional_log_string: String to append to exception statement """ - debug_msg = ( - f"FileHashStore - _verify_hashstore_references: verifying pid ({pid})" - + f" and cid ({cid}) refs files. Additional Note: {additional_log_string}" - ) - logging.debug(debug_msg) + debug_msg = f"Verifying pid ({pid}) and cid ({cid}) refs files. Note: {additional_log_string}" + self.fhs_logger.debug(debug_msg) if pid_refs_path is None: pid_refs_path = self._get_hashstore_pid_refs_path(pid) if cid_refs_path is None: @@ -2129,41 +2036,33 @@ def _verify_hashstore_references( # Check that reference files were created if not os.path.isfile(pid_refs_path): - err_msg = ( - "FileHashStore - _verify_hashstore_references: Pid refs file missing: " - + str(pid_refs_path) - + f" . Additional Context: {additional_log_string}" - ) - logging.error(err_msg) + err_msg = f" Pid refs file missing: {pid_refs_path}. Note: {additional_log_string}" + self.fhs_logger.error(err_msg) raise PidRefsFileNotFound(err_msg) if not os.path.isfile(cid_refs_path): err_msg = ( - "FileHashStore - _verify_hashstore_references: Cid refs file missing: " - + str(cid_refs_path) - + f" . Additional Context: {additional_log_string}" + f"Cid refs file missing: {cid_refs_path}. 
Note: {additional_log_string}" ) - logging.error(err_msg) + self.fhs_logger.error(err_msg) raise CidRefsFileNotFound(err_msg) # Check the content of the reference files # Start with the cid retrieved_cid = self._read_small_file_content(pid_refs_path) if retrieved_cid != cid: err_msg = ( - "FileHashStore - _verify_hashstore_references: Pid refs file exists" - + f" ({pid_refs_path}) but cid ({cid}) does not match." - + f" Additional Context: {additional_log_string}" + f"Pid refs file exists ({pid_refs_path}) but cid ({cid}) does not match." + + f" Note: {additional_log_string}" ) - logging.error(err_msg) + self.fhs_logger.error(err_msg) raise PidRefsContentError(err_msg) # Then the pid pid_found = self._is_string_in_refs_file(pid, cid_refs_path) if not pid_found: err_msg = ( - "FileHashStore - _verify_hashstore_references: Cid refs file exists" - + f" ({cid_refs_path}) but pid ({pid}) not found." - + f" Additional Context: {additional_log_string}" + f"Cid refs file exists ({cid_refs_path}) but pid ({pid}) not found." 
+ + f" Note: {additional_log_string}" ) - logging.error(err_msg) + self.fhs_logger.error(err_msg) raise CidRefsContentError(err_msg) def _delete_object_only(self, cid: str) -> None: @@ -2185,17 +2084,17 @@ def _delete_object_only(self, cid: str) -> None: with self.object_cid_condition_mp: # Wait for the cid to release if it's in use while cid in self.object_locked_cids_mp: - logging.debug(sync_wait_msg) + self.fhs_logger.debug(sync_wait_msg) self.object_cid_condition_mp.wait() # Modify reference_locked_cids consecutively - logging.debug(sync_begin_debug_msg) + self.fhs_logger.debug(sync_begin_debug_msg) self.object_locked_cids_mp.append(cid) else: with self.object_cid_condition_th: while cid in self.object_locked_cids_th: - logging.debug(sync_wait_msg) + self.fhs_logger.debug(sync_wait_msg) self.object_cid_condition_th.wait() - logging.debug(sync_begin_debug_msg) + self.fhs_logger.debug(sync_begin_debug_msg) self.object_locked_cids_th.append(cid) try: @@ -2208,12 +2107,12 @@ def _delete_object_only(self, cid: str) -> None: ) if self.use_multiprocessing: with self.object_cid_condition_mp: - logging.debug(end_sync_debug_msg) + self.fhs_logger.debug(end_sync_debug_msg) self.object_locked_cids_mp.remove(cid) self.object_cid_condition_mp.notify() else: with self.object_cid_condition_th: - logging.debug(end_sync_debug_msg) + self.fhs_logger.debug(end_sync_debug_msg) self.object_locked_cids_th.remove(cid) self.object_cid_condition_th.notify() From d6c3c694194a36ae4429368db53ce188ab9e0875 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 27 Sep 2024 09:22:16 -0700 Subject: [PATCH 411/420] Refactor '_delete_object_only', add missing logging statements and fix potential dead lock due to sync being outside try block --- src/hashstore/filehashstore.py | 57 +++++++++------------------------- 1 file changed, 15 insertions(+), 42 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 14fb9cb5..7cda82c1 100644 --- 
a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2071,50 +2071,23 @@ def _delete_object_only(self, cid: str) -> None: :param str cid: Content identifier """ - cid_refs_abs_path = self._get_hashstore_cid_refs_path(cid) - # If the refs file still exists, do not delete the object - if not os.path.isfile(cid_refs_abs_path): - sync_begin_debug_msg = ( - f"FileHashStore - delete_object: Cid ({cid}) to locked list." - ) - sync_wait_msg = ( - f"FileHashStore - delete_object: Cid ({cid}) is locked. Waiting." - ) - if self.use_multiprocessing: - with self.object_cid_condition_mp: - # Wait for the cid to release if it's in use - while cid in self.object_locked_cids_mp: - self.fhs_logger.debug(sync_wait_msg) - self.object_cid_condition_mp.wait() - # Modify reference_locked_cids consecutively - self.fhs_logger.debug(sync_begin_debug_msg) - self.object_locked_cids_mp.append(cid) - else: - with self.object_cid_condition_th: - while cid in self.object_locked_cids_th: - self.fhs_logger.debug(sync_wait_msg) - self.object_cid_condition_th.wait() - self.fhs_logger.debug(sync_begin_debug_msg) - self.object_locked_cids_th.append(cid) + try: + cid_refs_abs_path = self._get_hashstore_cid_refs_path(cid) + # If the refs file still exists, do not delete the object + self._synchronize_object_locked_cids(cid) + if os.path.isfile(cid_refs_abs_path): + debug_msg = ( + f"Cid reference file exists for: {cid}, skipping delete request." 
+ ) + self.fhs_logger.debug(debug_msg) - try: + else: self._delete("objects", cid) - finally: - # Release cid - end_sync_debug_msg = ( - f"FileHashStore - delete_object: Releasing cid ({cid})" - + " from locked list" - ) - if self.use_multiprocessing: - with self.object_cid_condition_mp: - self.fhs_logger.debug(end_sync_debug_msg) - self.object_locked_cids_mp.remove(cid) - self.object_cid_condition_mp.notify() - else: - with self.object_cid_condition_th: - self.fhs_logger.debug(end_sync_debug_msg) - self.object_locked_cids_th.remove(cid) - self.object_cid_condition_th.notify() + info_msg = f"Deleted object only for cid: {cid}" + self.fhs_logger.info(info_msg) + + finally: + self._release_object_locked_cids(cid) def _check_arg_algorithms_and_checksum( self, From dbd57de2b019749f9fb09a98b6fa763337c2ae62 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 27 Sep 2024 10:24:24 -0700 Subject: [PATCH 412/420] Clean-up logging in 'filehashstore' remaining supporting and core methods, and optimize sync method logging calls, add missing logging statements --- src/hashstore/filehashstore.py | 139 ++++++++++++--------------------- 1 file changed, 50 insertions(+), 89 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 7cda82c1..380e521a 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2132,7 +2132,7 @@ def _check_arg_format_id(self, format_id: str, method: str) -> str: """ if format_id and not format_id.strip(): err_msg = f"FileHashStore - {method}: Format_id cannot be empty." 
- logging.error(err_msg) + self.fhs_logger.error(err_msg) raise ValueError(err_msg) elif format_id is None: # Use default value set by hashstore config @@ -2156,19 +2156,19 @@ def _refine_algorithm_list( self._clean_algorithm(checksum_algorithm) if checksum_algorithm in self.other_algo_list: debug_additional_other_algo_str = ( - f"FileHashStore - _refine_algorithm_list: checksum algo: {checksum_algorithm}" - + " found in other_algo_lists, adding to list of algorithms to calculate." + f"Checksum algo: {checksum_algorithm} found in other_algo_lists, adding to " + + f"list of algorithms to calculate." ) - logging.debug(debug_additional_other_algo_str) + self.fhs_logger.debug(debug_additional_other_algo_str) algorithm_list_to_calculate.append(checksum_algorithm) if additional_algorithm is not None: self._clean_algorithm(additional_algorithm) if additional_algorithm in self.other_algo_list: debug_additional_other_algo_str = ( - f"FileHashStore - _refine_algorithm_list: addit algo: {additional_algorithm}" - + " found in other_algo_lists, adding to list of algorithms to calculate." + f"Additional algo: {additional_algorithm} found in other_algo_lists, " + + f"adding to list of algorithms to calculate." 
) - logging.debug(debug_additional_other_algo_str) + self.fhs_logger.debug(debug_additional_other_algo_str) algorithm_list_to_calculate.append(additional_algorithm) # Remove duplicates @@ -2196,11 +2196,8 @@ def _clean_algorithm(self, algorithm_string: str) -> str: cleaned_string not in self.default_algo_list and cleaned_string not in self.other_algo_list ): - err_msg = ( - "FileHashStore: _clean_algorithm: Algorithm not supported:" - + cleaned_string - ) - logging.error(err_msg) + err_msg = f"Algorithm not supported: {cleaned_string}" + self.fhs_logger.error(err_msg) raise UnsupportedAlgorithm(err_msg) return cleaned_string @@ -2365,7 +2362,7 @@ def _delete(self, entity: str, file: Union[str, Path]) -> None: except Exception as err: err_msg = f"FileHashStore - delete(): Unexpected {err=}, {type(err)=}" - logging.error(err_msg) + self.fhs_logger.error(err_msg) raise err def _create_path(self, path: Path) -> None: @@ -2437,9 +2434,8 @@ def _get_hashstore_data_object_path(self, cid_or_relative_path: str) -> Path: return Path(relpath) else: raise FileNotFoundError( - "FileHashStore - _get_hashstore_data_object_path: could not locate a" - + "data object in '/objects' for the supplied cid_or_relative_path: " - + cid_or_relative_path + "Could not locate a data object in '/objects' for the supplied " + + f"cid_or_relative_path: {cid_or_relative_path}" ) def _get_hashstore_metadata_path(self, metadata_relative_path: str) -> Path: @@ -2460,9 +2456,8 @@ def _get_hashstore_metadata_path(self, metadata_relative_path: str) -> Path: return Path(metadata_relative_path) else: raise FileNotFoundError( - "FileHashStore - _get_hashstore_metadata_path: could not locate a" - + "metadata object in '/metadata' for the supplied metadata_relative_path: " - + str(metadata_relative_path) + "Could not locate a metadata object in '/metadata' for the supplied " + + f"metadata_relative_path: {metadata_relative_path}" ) def _get_hashstore_pid_refs_path(self, pid: str) -> Path: @@ -2504,27 
+2499,17 @@ def _synchronize_object_locked_pids(self, pid: str) -> None: with self.object_pid_condition_mp: # Wait for the cid to release if it's being tagged while pid in self.object_locked_pids_mp: - logging.debug( - f"_synchronize_object_locked_pids: Pid ({pid}) is locked. Waiting." - ) + self.fhs_logger.debug(f"Pid ({pid}) is locked. Waiting.") self.object_pid_condition_mp.wait() self.object_locked_pids_mp.append(pid) - logging.debug( - f"_synchronize_object_locked_pids: Synchronizing object_locked_pids_mp for" - + f" pid: {pid}" - ) + self.fhs_logger.debug(f"Synchronizing object_locked_pids_mp for pid: {pid}") else: with self.object_pid_condition_th: while pid in self.object_locked_pids_th: - logging.debug( - f"_synchronize_object_locked_pids: Pid ({pid}) is locked. Waiting." - ) + self.fhs_logger.debug(f"Pid ({pid}) is locked. Waiting.") self.object_pid_condition_th.wait() self.object_locked_pids_th.append(pid) - logging.debug( - f"_synchronize_object_locked_pids: Synchronizing object_locked_pids_th for" - + f" pid: {pid}" - ) + self.fhs_logger.debug(f"Synchronizing object_locked_pids_th for pid: {pid}") def _release_object_locked_pids(self, pid: str) -> None: """Remove the given persistent identifier from 'object_locked_pids' and notify other @@ -2536,11 +2521,15 @@ def _release_object_locked_pids(self, pid: str) -> None: with self.object_pid_condition_mp: self.object_locked_pids_mp.remove(pid) self.object_pid_condition_mp.notify() + end_sync_debug_msg = f"Releasing pid ({pid}) from object_locked_pids_mp." + self.fhs_logger.debug(end_sync_debug_msg) else: # Release pid with self.object_pid_condition_th: self.object_locked_pids_th.remove(pid) self.object_pid_condition_th.notify() + end_sync_debug_msg = f"Releasing pid ({pid}) from object_locked_pids_th." 
+ self.fhs_logger.debug(end_sync_debug_msg) def _synchronize_object_locked_cids(self, cid: str) -> None: """Multiple threads may access a data object via its 'cid' or the respective 'cid @@ -2553,28 +2542,18 @@ def _synchronize_object_locked_cids(self, cid: str) -> None: with self.object_cid_condition_mp: # Wait for the cid to release if it's being tagged while cid in self.object_locked_cids_mp: - logging.debug( - f"synchronize_referenced_locked_cids: Cid ({cid}) is locked. Waiting." - ) + self.fhs_logger.debug(f"Cid ({cid}) is locked. Waiting.") self.object_cid_condition_mp.wait() # Modify reference_locked_cids consecutively self.object_locked_cids_mp.append(cid) - logging.debug( - f"synchronize_referenced_locked_cids: Synchronizing object_locked_cids_mp for" - + f" cid: {cid}" - ) + self.fhs_logger.debug(f"Synchronizing object_locked_cids_mp for cid: {cid}") else: with self.object_cid_condition_th: while cid in self.object_locked_cids_th: - logging.debug( - f"synchronize_referenced_locked_cids: Cid ({cid}) is locked. Waiting." - ) + self.fhs_logger.debug(f"Cid ({cid}) is locked. Waiting.") self.object_cid_condition_th.wait() self.object_locked_cids_th.append(cid) - logging.debug( - f"synchronize_referenced_locked_cids: Synchronizing object_locked_cids_th for" - + f" cid: {cid}" - ) + self.fhs_logger.debug(f"Synchronizing object_locked_cids_th for cid: {cid}") def _check_object_locked_cids(self, cid: str) -> None: """Check that a given content identifier is currently locked (found in the @@ -2584,13 +2563,13 @@ def _check_object_locked_cids(self, cid: str) -> None: """ if self.use_multiprocessing: if cid not in self.object_locked_cids_mp: - err_msg = f"_check_object_locked_cids: cid {cid} is not locked." - logging.error(err_msg) + err_msg = f"Cid {cid} is not locked." + self.fhs_logger.error(err_msg) raise IdentifierNotLocked(err_msg) else: if cid not in self.object_locked_cids_th: - err_msg = f"_check_object_locked_cids: cid {cid} is not locked." 
- logging.error(err_msg) + err_msg = f"Cid {cid} is not locked." + self.fhs_logger.error(err_msg) raise IdentifierNotLocked(err_msg) def _release_object_locked_cids(self, cid: str) -> None: @@ -2603,20 +2582,14 @@ def _release_object_locked_cids(self, cid: str) -> None: with self.object_cid_condition_mp: self.object_locked_cids_mp.remove(cid) self.object_cid_condition_mp.notify() - end_sync_debug_msg = ( - f"FileHashStore - _release_object_locked_cids: Releasing cid ({cid}) from" - + " object_cid_condition_mp." - ) - logging.debug(end_sync_debug_msg) + end_sync_debug_msg = f"Releasing cid ({cid}) from object_cid_condition_mp." + self.fhs_logger.debug(end_sync_debug_msg) else: with self.object_cid_condition_th: self.object_locked_cids_th.remove(cid) self.object_cid_condition_th.notify() - end_sync_debug_msg = ( - f"FileHashStore - _release_object_locked_cids: Releasing cid ({cid}) from" - + " object_cid_condition_th." - ) - logging.debug(end_sync_debug_msg) + end_sync_debug_msg = f"Releasing cid ({cid}) from object_cid_condition_th." + self.fhs_logger.debug(end_sync_debug_msg) def _synchronize_referenced_locked_pids(self, pid: str) -> None: """Multiple threads may interact with a pid (to tag, untag, delete) and these actions @@ -2628,28 +2601,22 @@ def _synchronize_referenced_locked_pids(self, pid: str) -> None: with self.reference_pid_condition_mp: # Wait for the pid to release if it's in use while pid in self.reference_locked_pids_mp: - logging.debug( - f"_synchronize_referenced_locked_pids: Pid ({pid}) is locked. Waiting." - ) + self.fhs_logger.debug(f"Pid ({pid}) is locked. 
Waiting.") self.reference_pid_condition_mp.wait() # Modify reference_locked_pids consecutively self.reference_locked_pids_mp.append(pid) - logging.debug( - f"_synchronize_referenced_locked_pids: Synchronizing reference_locked_pids_mp" - + f" for pid: {pid}" - ) + self.fhs_logger.debug( + f"Synchronizing reference_locked_pids_mp for pid: {pid}" + ) else: with self.reference_pid_condition_th: while pid in self.reference_locked_pids_th: - logging.debug( - f"_synchronize_referenced_locked_pids: Pid ({pid}) is locked. Waiting." - ) + logging.debug(f"Pid ({pid}) is locked. Waiting.") self.reference_pid_condition_th.wait() self.reference_locked_pids_th.append(pid) - logging.debug( - f"_synchronize_referenced_locked_pids: Synchronizing reference_locked_pids_th" - + f" for pid: {pid}" - ) + self.fhs_logger.debug( + f"Synchronizing reference_locked_pids_th for pid: {pid}" + ) def _check_reference_locked_pids(self, pid: str) -> None: """Check that a given persistent identifier is currently locked (found in the @@ -2659,13 +2626,13 @@ def _check_reference_locked_pids(self, pid: str) -> None: """ if self.use_multiprocessing: if pid not in self.reference_locked_pids_mp: - err_msg = f"_check_reference_locked_pids: pid {pid} is not locked." - logging.error(err_msg) + err_msg = f"Pid {pid} is not locked." + self.fhs_logger.error(err_msg) raise IdentifierNotLocked(err_msg) else: if pid not in self.reference_locked_pids_th: - err_msg = f"_check_reference_locked_pids: pid {pid} is not locked." - logging.error(err_msg) + err_msg = f"Pid {pid} is not locked." 
+ self.fhs_logger.error(err_msg) raise IdentifierNotLocked(err_msg) def _release_reference_locked_pids(self, pid: str) -> None: @@ -2678,21 +2645,15 @@ def _release_reference_locked_pids(self, pid: str) -> None: with self.reference_pid_condition_mp: self.reference_locked_pids_mp.remove(pid) self.reference_pid_condition_mp.notify() - end_sync_debug_msg = ( - f"FileHashStore - _release_reference_locked_pids: Releasing pid ({pid}) from" - + " reference_locked_pids_mp." - ) - logging.debug(end_sync_debug_msg) + end_sync_debug_msg = f"Releasing pid ({pid}) from reference_locked_pids_mp." + self.fhs_logger.debug(end_sync_debug_msg) else: # Release pid with self.reference_pid_condition_th: self.reference_locked_pids_th.remove(pid) self.reference_pid_condition_th.notify() - end_sync_debug_msg = ( - f"FileHashStore - _release_reference_locked_pids: Releasing pid ({pid}) from" - + " reference_locked_pids_th." - ) - logging.debug(end_sync_debug_msg) + end_sync_debug_msg = f"Releasing pid ({pid}) from reference_locked_pids_th." + self.fhs_logger.debug(end_sync_debug_msg) # Other Static Methods @staticmethod From 6256bcb29d94f2b72a839864b70848537e75d405 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 27 Sep 2024 11:23:19 -0700 Subject: [PATCH 413/420] Refactor and fix bug in 'store_object' where sync method was not within try block which could lead to dead lock --- src/hashstore/filehashstore.py | 96 +++++++++++++++++----------------- 1 file changed, 49 insertions(+), 47 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 380e521a..3ae0f3f1 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -536,48 +536,48 @@ def store_object( additional_algorithm, checksum, checksum_algorithm ) - sync_begin_debug_msg = f"Adding pid ({pid}) to locked list." - err_msg = f"Duplicate object request encountered for pid: {pid}. Already in progress." 
- if self.use_multiprocessing: - with self.object_pid_condition_mp: - # Wait for the pid to release if it's in use - if pid in self.object_locked_pids_mp: - self.fhs_logger.error(err_msg) - raise StoreObjectForPidAlreadyInProgress(err_msg) - # Modify object_locked_pids consecutively - self.fhs_logger.debug(sync_begin_debug_msg) - self.object_locked_pids_mp.append(pid) - else: - with self.object_pid_condition_th: - if pid in self.object_locked_pids_th: - logging.error(err_msg) - raise StoreObjectForPidAlreadyInProgress(err_msg) - self.fhs_logger.debug(sync_begin_debug_msg) - self.object_locked_pids_th.append(pid) try: - self.fhs_logger.debug("Attempting to store object for pid: %s", pid) - object_metadata = self._store_and_validate_data( - pid, - data, - additional_algorithm=additional_algorithm_checked, - checksum=checksum, - checksum_algorithm=checksum_algorithm_checked, - file_size_to_validate=expected_object_size, + err_msg = ( + f"Duplicate object request for pid: {pid}. Already in progress." 
) - self.fhs_logger.debug("Attempting to tag object for pid: %s", pid) - cid = object_metadata.cid - self.tag_object(pid, cid) - self.fhs_logger.info("Successfully stored object for pid: %s", pid) + if self.use_multiprocessing: + with self.object_pid_condition_mp: + # Raise exception immediately if pid is in use + if pid in self.object_locked_pids_mp: + self.fhs_logger.error(err_msg) + raise StoreObjectForPidAlreadyInProgress(err_msg) + else: + with self.object_pid_condition_th: + if pid in self.object_locked_pids_th: + logging.error(err_msg) + raise StoreObjectForPidAlreadyInProgress(err_msg) + + try: + self._synchronize_object_locked_pids(pid) + + self.fhs_logger.debug("Attempting to store object for pid: %s", pid) + object_metadata = self._store_and_validate_data( + pid, + data, + additional_algorithm=additional_algorithm_checked, + checksum=checksum, + checksum_algorithm=checksum_algorithm_checked, + file_size_to_validate=expected_object_size, + ) + self.fhs_logger.debug("Attempting to tag object for pid: %s", pid) + cid = object_metadata.cid + self.tag_object(pid, cid) + self.fhs_logger.info("Successfully stored object for pid: %s", pid) + finally: + # Release pid + self._release_object_locked_pids(pid) except Exception as err: err_msg = ( - f"failed to store object for pid: {pid}. Reference files will not be created " - f"or tagged. Unexpected error: {err})" + f"Failed to store object for pid: {pid}. Reference files will not be " + f"created or tagged. Unexpected error: {err})" ) self.fhs_logger.error(err_msg) raise err - finally: - # Release pid - self._release_object_locked_pids(pid) return object_metadata @@ -2521,15 +2521,13 @@ def _release_object_locked_pids(self, pid: str) -> None: with self.object_pid_condition_mp: self.object_locked_pids_mp.remove(pid) self.object_pid_condition_mp.notify() - end_sync_debug_msg = f"Releasing pid ({pid}) from object_locked_pids_mp." 
- self.fhs_logger.debug(end_sync_debug_msg) + self.fhs_logger.debug(f"Releasing pid ({pid}) from object_locked_pids_mp.") else: # Release pid with self.object_pid_condition_th: self.object_locked_pids_th.remove(pid) self.object_pid_condition_th.notify() - end_sync_debug_msg = f"Releasing pid ({pid}) from object_locked_pids_th." - self.fhs_logger.debug(end_sync_debug_msg) + self.fhs_logger.debug(f"Releasing pid ({pid}) from object_locked_pids_th.") def _synchronize_object_locked_cids(self, cid: str) -> None: """Multiple threads may access a data object via its 'cid' or the respective 'cid @@ -2582,14 +2580,16 @@ def _release_object_locked_cids(self, cid: str) -> None: with self.object_cid_condition_mp: self.object_locked_cids_mp.remove(cid) self.object_cid_condition_mp.notify() - end_sync_debug_msg = f"Releasing cid ({cid}) from object_cid_condition_mp." - self.fhs_logger.debug(end_sync_debug_msg) + self.fhs_logger.debug( + f"Releasing cid ({cid}) from object_cid_condition_mp." + ) else: with self.object_cid_condition_th: self.object_locked_cids_th.remove(cid) self.object_cid_condition_th.notify() - end_sync_debug_msg = f"Releasing cid ({cid}) from object_cid_condition_th." - self.fhs_logger.debug(end_sync_debug_msg) + self.fhs_logger.debug( + f"Releasing cid ({cid}) from object_cid_condition_th." + ) def _synchronize_referenced_locked_pids(self, pid: str) -> None: """Multiple threads may interact with a pid (to tag, untag, delete) and these actions @@ -2645,15 +2645,17 @@ def _release_reference_locked_pids(self, pid: str) -> None: with self.reference_pid_condition_mp: self.reference_locked_pids_mp.remove(pid) self.reference_pid_condition_mp.notify() - end_sync_debug_msg = f"Releasing pid ({pid}) from reference_locked_pids_mp." - self.fhs_logger.debug(end_sync_debug_msg) + self.fhs_logger.debug( + f"Releasing pid ({pid}) from reference_locked_pids_mp." 
+ ) else: # Release pid with self.reference_pid_condition_th: self.reference_locked_pids_th.remove(pid) self.reference_pid_condition_th.notify() - end_sync_debug_msg = f"Releasing pid ({pid}) from reference_locked_pids_th." - self.fhs_logger.debug(end_sync_debug_msg) + self.fhs_logger.debug( + f"Releasing pid ({pid}) from reference_locked_pids_th." + ) # Other Static Methods @staticmethod From fe5e7dbe60cfbf024b9fbb2c677f27e4c48f16d8 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 1 Oct 2024 10:20:12 -0700 Subject: [PATCH 414/420] Clean-up code/resolve linting errors for missed unused variables and for long lines --- src/hashstore/filehashstore.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 3ae0f3f1..74b9c600 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -759,8 +759,6 @@ def delete_object(self, pid: str) -> None: # Storing and deleting objects are synchronized together # Duplicate store object requests for a pid are rejected, but deleting an object # will wait for a pid to be released if it's found to be in use before proceeding. - sync_begin_debug_msg = f"Pid ({pid}) to locked list." - sync_wait_msg = f"Pid ({pid}) is locked. Waiting." 
try: # Before we begin deletion process, we look for the `cid` by calling @@ -983,7 +981,10 @@ def delete_metadata(self, pid: str, format_id: Optional[str] = None) -> None: try: full_path_without_directory = Path(self.metadata / rel_path / pid_doc) self._delete("metadata", full_path_without_directory) - info_string = f"Successfully deleted metadata for pid: {pid} for format_id: {format_id}" + info_string = ( + f"Deleted metadata for pid: {pid} for format_id: {format_id}" + ) + self.fhs_logger.info(info_string) finally: # Release pid @@ -1774,7 +1775,9 @@ def _mark_pid_refs_file_for_deletion( delete_list.append(self._rename_path_for_deletion(pid_refs_path)) except Exception as e: - err_msg = f"Unable to delete pid refs file: {pid_refs_path} for pid: {pid}. Details: {e}" + err_msg = ( + f"Unable to delete pid refs file: {pid_refs_path} for pid: {pid}. {e}" + ) self.fhs_logger.error(err_msg) def _remove_pid_and_handle_cid_refs_deletion( @@ -2027,7 +2030,9 @@ def _verify_hashstore_references( :param path cid_refs_path: Path to cid refs file :param str additional_log_string: String to append to exception statement """ - debug_msg = f"Verifying pid ({pid}) and cid ({cid}) refs files. Note: {additional_log_string}" + debug_msg = ( + f"Verifying pid ({pid}) and cid ({cid}) refs files. 
{additional_log_string}" + ) self.fhs_logger.debug(debug_msg) if pid_refs_path is None: pid_refs_path = self._get_hashstore_pid_refs_path(pid) From 070c33856daa9d33b00e5119c83c083b2b3ea613 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 1 Oct 2024 10:29:20 -0700 Subject: [PATCH 415/420] Update README.md with new citation section and placeholders for DOI information --- README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/README.md b/README.md index eeff4e9d..1667a497 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,10 @@ ## HashStore: hash-based object storage for DataONE data packages +Version: 1.1.0 +DOI: DOI_ID_PLACEHOLDER + +## Contributors + - **Author**: Dou Mok, Matthew Brooke, Jing Tao, Jeanette Clarke, Ian Nesbitt, Matthew B. Jones - **License**: [Apache 2](http://opensource.org/licenses/Apache-2.0) - [Package source code on GitHub](https://github.com/DataONEorg/hashstore) @@ -7,6 +12,16 @@ - Contact us: support@dataone.org - [DataONE discussions](https://github.com/DataONEorg/dataone/discussions) +## Citation + +Cite this software as: + +> Dou Mok, Matthew Brooke, Jing Tao, Jeanette Clarke, Ian Nesbitt, Matthew B. Jones. 2024. +> HashStore: hash-based object storage for DataONE data packages. Arctic Data Center. +> [doi-id-plchldr](doi-url-plchldr) + +## Introduction + HashStore is a server-side python package that implements an object storage file system for storing and accessing data and metadata for DataONE services. 
The package is used in DataONE system components that need direct, filesystem-based access to data objects, their system metadata, and From cf02c902fdde039b740eb3f2f1d4e5ab9cf460ca Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 1 Oct 2024 12:15:08 -0700 Subject: [PATCH 416/420] Update 'README.md' with DOI --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1667a497..d840e664 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ ## HashStore: hash-based object storage for DataONE data packages Version: 1.1.0 -DOI: DOI_ID_PLACEHOLDER +DOI: [doi:10.18739/A2ZG6G87Q](https://doi.org/10.18739/A2ZG6G87Q) ## Contributors @@ -18,7 +18,7 @@ Cite this software as: > Dou Mok, Matthew Brooke, Jing Tao, Jeanette Clarke, Ian Nesbitt, Matthew B. Jones. 2024. > HashStore: hash-based object storage for DataONE data packages. Arctic Data Center. -> [doi-id-plchldr](doi-url-plchldr) +> [doi:10.18739/A2ZG6G87Q](https://doi.org/10.18739/A2ZG6G87Q) ## Introduction From 6552748e8160ef384fbb92c76f4d54923fc3ea66 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 1 Oct 2024 16:25:48 -0700 Subject: [PATCH 417/420] Update PUBLIC API section in 'README.md' --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d840e664..f23bf7a0 100644 --- a/README.md +++ b/README.md @@ -55,12 +55,12 @@ the expected usage of HashStore. 
### Public API Methods - store_object -- verify_object - tag_object - store_metadata - retrieve_object - retrieve_metadata - delete_object +- delete_if_invalid_object - delete_metadata - get_hex_digest From 5169cd4c3697ca21faf582ec587813a92f331f87 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 1 Oct 2024 16:50:35 -0700 Subject: [PATCH 418/420] Resolve linting warning due to conflicting variable name 'store' in hashstore test module --- tests/test_hashstore.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index 34ba6e15..02d83e3b 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -23,8 +23,8 @@ def test_factory_get_hashstore_filehashstore(factory, props): module_name = "hashstore.filehashstore" class_name = "FileHashStore" # These props can be found in tests/conftest.py - store = factory.get_hashstore(module_name, class_name, props) - assert isinstance(store, FileHashStore) + this_store = factory.get_hashstore(module_name, class_name, props) + assert isinstance(this_store, FileHashStore) def test_factory_get_hashstore_unsupported_class(factory): From c9e911c4fb92ea7d2bdd18e1d21188be11c34fa4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 1 Oct 2024 17:11:44 -0700 Subject: [PATCH 419/420] Review and update 'README.md' --- README.md | 93 +++++++++++++++++++++++++++---------------------------- 1 file changed, 45 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index f23bf7a0..2ad5cc66 100644 --- a/README.md +++ b/README.md @@ -22,14 +22,13 @@ Cite this software as: ## Introduction -HashStore is a server-side python package that implements an object storage file system for storing -and accessing data and metadata for DataONE services. The package is used in DataONE system -components that need direct, filesystem-based access to data objects, their system metadata, and -extended metadata about the objects. 
This package is a core component of -the [DataONE federation](https://dataone.org), and supports large-scale object storage for a variety -of repositories, including the [KNB Data Repository](http://knb.ecoinformatics.org), -the [NSF Arctic Data Center](https://arcticdata.io/catalog/), -the [DataONE search service](https://search.dataone.org), and other repositories. +HashStore is a server-side python package that implements a hash-based object storage file system +for storing and accessing data and metadata for DataONE services. The package is used in DataONE +system components that need direct, filesystem-based access to data objects, their system +metadata, and extended metadata about the objects. This package is a core component of the +[DataONE federation](https://dataone.org), and supports large-scale object storage for a variety +of repositories, including the [KNB Data Repository](http://knb.ecoinformatics.org), the [NSF +Arctic Data Center](https://arcticdata.io/catalog/), the [DataONE search service](https://search.dataone.org), and other repositories. DataONE in general, and HashStore in particular, are open source, community projects. We [welcome contributions](https://github.com/DataONEorg/hashstore/blob/main/CONTRIBUTING.md) in @@ -39,18 +38,18 @@ contributions with us. ## Documentation -Documentation is a work in progress, and can be found on -the [Metacat repository](https://github.com/NCEAS/metacat/blob/feature-1436-storage-and-indexing/docs/user/metacat/source/storage-subsystem.rst#physical-file-layout) -as part of the storage redesign planning. Future updates will include documentation here as the +The documentation around HashStore's initial design phase can be found here in the [Metacat +repository](https://github.com/NCEAS/metacat/blob/feature-1436-storage-and-indexing/docs/user/metacat/source/storage-subsystem.rst#physical-file-layout) +as part of the storage re-design planning. 
Future updates will include documentation here as the package matures. ## HashStore Overview -HashStore is an object storage system that provides persistent file-based storage using content -hashes to de-duplicate data. The system stores both objects, references (refs) and metadata in its -respective directories and utilizes an identifier-based API for interacting with the store. -HashStore storage classes (like `filehashstore`) must implement the HashStore interface to ensure -the expected usage of HashStore. +HashStore is a hash-based object storage system that provides persistent file-based storage using +content hashes to de-duplicate data. The system stores data objects, references (refs) and +metadata in its respective directories and utilizes an identifier-based API for interacting +with the store. HashStore storage classes (like `filehashstore`) must implement the HashStore +interface to ensure the consistent and expected usage of HashStore. ### Public API Methods @@ -161,12 +160,12 @@ metadata_cid_two = hashstore.store_metadata(pid, metadata, format_id) ### Working with objects (store, retrieve, delete) -In HashStore, objects are first saved as temporary files while their content identifiers are +In HashStore, data objects begin as temporary files while their content identifiers are calculated. Once the default hash algorithm list and their hashes are generated, objects are stored -in their permanent location using the store's algorithm's corresponding hash value, the store depth -and the store width. Lastly, objects are 'tagged' with a given identifier (ex. persistent -identifier (pid)). This process produces reference files, which allow objects to be found and -retrieved with a given identifier. +in their permanent locations using the hash value of the store's configured algorithm, and +then divided accordingly based on the configured width and depth. Lastly, objects are 'tagged' +with a given identifier (ex. persistent identifier (pid)). 
This process produces reference +files, which allow objects to be found and retrieved with a given identifier. - Note 1: An identifier can only be used once - Note 2: Each object is stored once and only once using its content identifier (a checksum @@ -177,11 +176,10 @@ retrieved with a given identifier. By calling the various interface methods for `store_object`, the calling app/client can validate, store and tag an object simultaneously if the relevant data is available. In the absence of an identifier (ex. persistent identifier (pid)), `store_object` can be called to solely store an -object. The client is then expected to call `verify_object` when the relevant metadata is available -to confirm that the object has been stored as expected. The client is then expected to call -`delete_if_invalid_object` when the relevant metadata is available to confirm that the object is -what is expected. And to finalize the process (to make the object discoverable), the client -calls `tagObject``. In summary, there are two expected paths to store an object: +object. The client is then expected to call `delete_if_invalid_object` when the relevant +metadata is available to confirm that the object is what is expected. And to finalize the data-only +storage process (to make the object discoverable), the client calls `tagObject``. In summary, there +are two expected paths to store an object: ```py import io @@ -212,7 +210,7 @@ path = "/path/to/dou.test.1" input_stream = io.open(path, "rb") pid = "dou.test.1" # All-in-one process which stores, validates and tags an object -obj_info_allinone = hashstore.store_object(input_stream, pid, additional_algo, checksum, +obj_info_all_in_one = hashstore.store_object(input_stream, pid, additional_algo, checksum, checksum_algo, obj_size) # Manual Process @@ -233,11 +231,9 @@ hashstore.tag_object(pid, obj_info_manual.cid) - To delete an object and all its associated reference files, call the Public API method `delete_object`. 
-- Note, `delete_object` and `store_object` are synchronized based on a given 'pid'. An object that - is in the process of being stored based on a pid should not be deleted at the same time. +- Note, `delete_object` and `store_object` are synchronized processes based on a given `pid`. Additionally, `delete_object` further synchronizes with `tag_object` based on a `cid`. Every - object is stored once, is unique and shares one cid reference file. The API calls to access this - cid reference file must be coordinated to prevent file system locking exceptions. + object is stored once, is unique and shares one cid reference file. ###### Working with metadata (store, retrieve, delete) @@ -267,17 +263,17 @@ ex. `store_metadata(stream, pid, format_id)`). ### What are HashStore reference files? -HashStore assumes that every object to store has a respective identifier. This identifier is then -used when storing, retrieving and deleting an object. In order to facilitate this process, we create -two types of reference files: +HashStore assumes that every data object is referenced by its a respective identifier. This +identifier is then used when storing, retrieving and deleting an object. In order to facilitate +this process, we create two types of reference files: - pid (persistent identifier) reference files - cid (content identifier) reference files These reference files are implemented in HashStore underneath the hood with no expectation for -modification from the calling app/client. The one and only exception to this process when the -calling client/app does not have an identifier, and solely stores an objects raw bytes in -HashStore (calling `store_object(stream)`). +modification from the calling app/client. The one and only exception to this process is when the +calling client/app does not have an identifier available (i.e. they receive the stream to store +the data object first without any metadata, thus calling `store_object(stream)`). 
**'pid' Reference Files** @@ -286,10 +282,10 @@ HashStore (calling `store_object(stream)`). - If an identifier is not available at the time of storing an object, the calling app/client must create this association between a pid and the object it represents by calling `tag_object` separately. -- Each pid reference file contains a string that represents the content identifier of the object it - references +- Each pid reference file contains a single string that represents the content identifier of the + object it references - Like how objects are stored once and only once, there is also only one pid reference file for each - object. + data object. **'cid' Reference Files** @@ -301,11 +297,12 @@ HashStore (calling `store_object(stream)`). ## Concurrency in HashStore -HashStore is both thread and process safe, and by default synchronizes calls to store & delete -objects/metadata with Python's threading module. If you wish to use multiprocessing to parallelize -your application, please declare a global environment variable `USE_MULTIPROCESSING` as `True` -before initializing Hashstore. This will direct the relevant Public API calls to synchronize using -the Python `multiprocessing` module's locks and conditions. Please see below for example: +HashStore is both threading and multiprocessing safe, and by default synchronizes calls to store & +delete objects/metadata with Python's threading module. If you wish to use multiprocessing to +parallelize your application, please declare a global environment variable `USE_MULTIPROCESSING` +as `True` before initializing Hashstore. This will direct the relevant Public API calls to +synchronize using the Python `multiprocessing` module's locks and conditions. +Please see below for example: ```py import os @@ -324,7 +321,8 @@ build tool. To install `hashstore` locally, create a virtual environment for python 3.9+, install poetry, and then install or build the package with `poetry install` or `poetry build`, -respectively. 
+respectively. Note, installing `hashstore` with poetry will also make the `hashstore` command +available through the command line terminal (see `HashStore Client` section below for details). To run tests, navigate to the root directory and run `pytest`. The test suite contains tests that take a longer time to run (relating to the storage of large files) - to execute all tests, run @@ -334,14 +332,13 @@ take a longer time to run (relating to the storage of large files) - to execute Client API Options: -- `-getchecksum` (get_hex_digest) -- `-findobject` - `-storeobject` - `-storemetadata` - `-retrieveobject` - `-retrievemetadata` - `-deleteobject` - `-deletemetadata` +- `-getchecksum` (get_hex_digest) How to use HashStore client (command line app) From 4b491d0c872088e0bc94e433c1e28054dd305953 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 1 Oct 2024 17:14:05 -0700 Subject: [PATCH 420/420] Minor formatting fix to 'README.md' DOI underneath version --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2ad5cc66..7dd0bd32 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ ## HashStore: hash-based object storage for DataONE data packages Version: 1.1.0 -DOI: [doi:10.18739/A2ZG6G87Q](https://doi.org/10.18739/A2ZG6G87Q) +- DOI: [doi:10.18739/A2ZG6G87Q](https://doi.org/10.18739/A2ZG6G87Q) ## Contributors