diff --git a/src/aind_metadata_viz/database.py b/src/aind_metadata_viz/database.py index 74b7527..64ac043 100644 --- a/src/aind_metadata_viz/database.py +++ b/src/aind_metadata_viz/database.py @@ -6,7 +6,7 @@ import param import os import numpy as np - +import time from io import StringIO from aind_data_schema_models.modalities import ( @@ -14,13 +14,11 @@ ExpectedFiles, FileRequirement, ) -from aind_metadata_viz.metadata_helpers import ( - process_record_list, -) from aind_metadata_viz.metadata_class_map import ( first_layer_field_mapping, second_layer_field_mappings, ) +from aind_metadata_viz.utils import METASTATE_MAP API_GATEWAY_HOST = os.getenv("API_GATEWAY_HOST", "api.allenneuraldynamics-test.org") DATABASE = os.getenv("DATABASE", "metadata_index") @@ -81,8 +79,12 @@ def __init__( ): """Initialize""" # get data - self._file_data = _get_file_presence(test_mode=test_mode) + start = time.time() + self._file_data = _get_metadata(test_mode=test_mode) + print(time.time() - start) + start = time.time() self._status_data = _get_status() + print(time.time() - start) # inner join only keeps records that are in both dataframes self.data = pd.merge(self._file_data, self._status_data, on="_id", how="inner") @@ -278,29 +280,22 @@ def get_csv(self, vp_state: str = "Not Valid/Present"): sio = StringIO() df.to_csv(sio, index=False) return sio.getvalue() - - -@pn.cache(ttl=CACHE_RESET_DAY) -def _get_metadata(test_mode=False) -> pd.DataFrame: - """Get the metadata fields, modality, derived, name, location, created - - Parameters - ---------- - test_mode : bool, optional - _description_, by default False - """ def _get_status() -> pd.DataFrame: """Get the status of the metadata """ response = rds_client.read_table(RDS_TABLE_NAME) + + # replace values using the int -> string map + response.replace(METASTATE_MAP, inplace=True) + return response @pn.cache(ttl=CACHE_RESET_DAY) -def _get_file_presence(test_mode=False) -> pd.DataFrame: - """Get all and convert to data frame format 
+def _get_metadata(test_mode=False) -> pd.DataFrame: + """Get metadata about records in DocDB Parameters ---------- diff --git a/src/aind_metadata_viz/metadata_helpers.py b/src/aind_metadata_viz/metadata_helpers.py deleted file mode 100644 index 8bb8170..0000000 --- a/src/aind_metadata_viz/metadata_helpers.py +++ /dev/null @@ -1,177 +0,0 @@ -from aind_metadata_viz.metadata_class_map import ( - first_layer_field_mapping, - second_layer_field_mappings, - first_layer_versions, -) -from aind_metadata_viz.utils import MetaState, expected_files_from_modalities -from aind_data_schema_models.modalities import FileRequirement -from pydantic import ValidationError -from typing import Literal, Optional, Union - - -def _metadata_present_helper(json: str, check_present: bool = True): - """Return true if the value of a key exists and is not None, or any of - '' [] {} in a JSON object - - Parameters - ---------- - field : string - Key - object : dict - Dictionary - """ - present = json is not None and json != "" and json != [] and json != {} - - if check_present: - return "present" if present else "absent" - else: - return "absent" if present else "present" - - -def _metadata_valid_helper( - field: str, - json: str, - mapping: dict, -): - """Return true if the json data is a valid object of the particular field class - - Parameters - ---------- - json : str - json string generated from a AindCoreModel dump - """ - if "schema_version" in json: - # force the schema version to match the current one - json["schema_version"] = first_layer_versions[field] - - if field in mapping: - expected_type = mapping[field] - try: - origin_type = getattr(expected_type, "__origin__", None) - - if origin_type is list: - item_type = expected_type.__args__[0] - return all([item_type(**item_json) for item_json in json]) - elif origin_type is Optional: - # skip optional fields! 
- return True - elif origin_type is Union: - # Get all possible types in the Union - union_types = get_args(expected_type) - - for union_type in union_types: - try: - return union_type(**json) - except ValidationError: - continue - else: - return False - else: - # validate as a pydantic model - return expected_type(**json) is not None - except Exception as e: - print(e) - return False - - -def check_metadata_state(field: str, object: dict, parent: str = None) -> str: - """Get the MetaState for a specific key in a dictinoary - - Parameters - ---------- - key : str - Field to check - object : dict - {field: value} - - Returns - ------- - MetaState - _description_ - """ - # if excluded, just return that - # get the excluded fields from the class map - - if not object: - return MetaState.MISSING.value - - if ( - "data_description" in object - and object["data_description"] - and "modality" in object["data_description"] - ): - modality_map = expected_files_from_modalities( - modalities=object["data_description"]["modality"] - ) - - if field in modality_map: - file_req = modality_map[field] - else: - print( - f"Warning: field {field} had incorrect modalities, so no file requirement is defined" - ) - file_req = FileRequirement.REQUIRED - else: - # default to required - print( - f"Warning: object had no data description modalities, so no file requirement is defined" - ) - file_req = FileRequirement.REQUIRED - - # Excluded files we ignore, just return that it was excluded - if file_req == FileRequirement.EXCLUDED: - return MetaState.EXCLUDED.value - - # File is required or optional, get the mappings from field -> class - # if you're looking at a parent file's data then you need a different mapping - if parent: - class_map = second_layer_field_mappings[parent] - # we're at the top level, just check the first layer mappings - else: - class_map = first_layer_field_mapping - - # First check that the key exists at all and is not None - if field in object and object[field]: - 
value = object[field] - else: - if file_req == FileRequirement.OPTIONAL: - return MetaState.OPTIONAL.value - else: - return MetaState.MISSING.value - - # attempt validation - if _metadata_valid_helper(field, value, class_map): - return MetaState.VALID.value - - # validation failed, check if the field is present or if it's empty - - # check empty - if _metadata_present_helper(value): - return MetaState.PRESENT.value - else: - if file_req == FileRequirement.OPTIONAL: - return MetaState.OPTIONAL.value - else: - return MetaState.MISSING.value - - -def process_record_list(record_list: list, expected_fields: list, parent=None): - """Process a list of Metadata JSON records from DocDB - - For each record, check each of the expected fields and see if they are valid/present/missing/excluded - - Parameters - ---------- - data_list : list[dict] - List of metadata json records as dicts - expected_fields : list[str] - List of key fields to check - - Returns - ------- - list[{field: MetaState}] - """ - return [ - {field: check_metadata_state(field, data, parent) for field in expected_fields} - for data in record_list - ] diff --git a/src/aind_metadata_viz/utils.py b/src/aind_metadata_viz/utils.py index 7424db6..9838678 100644 --- a/src/aind_metadata_viz/utils.py +++ b/src/aind_metadata_viz/utils.py @@ -16,12 +16,23 @@ ] -class MetaState(str, Enum): - VALID = "valid" - PRESENT = "present" - OPTIONAL = "optional" - MISSING = "missing" - EXCLUDED = "excluded" +METASTATE_MAP = { + 2: "valid", + 1: "present", + 0: "optional", + -1: "missing", + -2: "excluded", + -3: "corrupt", +} + + +class MetadataState(int, Enum): + VALID = 2 # validates as its class + PRESENT = 1 # present + OPTIONAL = 0 # missing, but it's optional + MISSING = -1 # missing, and it's required + EXCLUDED = -2 # excluded for all modalities in the metadata + CORRUPT = -3 # corrupt, can't be loaded from json REMAPS = { diff --git a/tests/test_metadata_helpers.py b/tests/test_metadata_helpers.py deleted file mode 
100644 index 0858b3c..0000000 --- a/tests/test_metadata_helpers.py +++ /dev/null @@ -1,83 +0,0 @@ -"""Test the main app code""" - -import unittest -from aind_metadata_viz.metadata_helpers import ( - _metadata_present_helper, - process_present_dict, - process_record_list, -) - - -class TestApp(unittest.TestCase): - """Test main app""" - - def setUp(self) -> None: - self.dict = { - "test1": None, - "test2": "", - "test3": {}, - "test4": [], - "test5": "actual data", - "test6": 1, - "test7": {"actual key": "actual value"}, - "test8": object, - } - self.expected_fields = [ - "test1", - "test2", - "test3", - "test4", - "test5", - "test6", - "test7", - "test8", - "meow", - ] - self.expected_out = { - "test1": False, - "test2": False, - "test3": False, - "test4": False, - "test5": True, - "test6": True, - "test7": True, - "test8": True, - "meow": False, - } - - return super().setUp() - - def test_check_present(self): - """Test the check_present function""" - self.assertFalse(_metadata_present_helper("test1", self.dict)) - self.assertFalse(_metadata_present_helper("test2", self.dict)) - self.assertFalse(_metadata_present_helper("test3", self.dict)) - self.assertFalse(_metadata_present_helper("test4", self.dict)) - - self.assertTrue(_metadata_present_helper("test5", self.dict)) - self.assertTrue(_metadata_present_helper("test6", self.dict)) - self.assertTrue(_metadata_present_helper("test7", self.dict)) - self.assertTrue(_metadata_present_helper("test8", self.dict)) - - self.assertFalse( - _metadata_present_helper("test8", self.dict, check_present=False) - ) - - def test_process_present_dict(self): - """Test the process_present_dict function""" - out_test = process_present_dict(self.dict, self.expected_fields) - - self.assertEqual(self.expected_out, out_test) - - def test_process_present(self): - """Test that process runs properly on a list""" - data_list = [self.dict, self.dict] - - processed_list = process_record_list(data_list, self.expected_fields) - out_list = 
[self.expected_out, self.expected_out] - - self.assertEqual(processed_list, out_list) - - -if __name__ == "__main__": - unittest.main()