From d7d6aad4a24922ae1a6f7b313e0582562d0bbb51 Mon Sep 17 00:00:00 2001 From: Dan Birman Date: Sun, 10 Nov 2024 11:44:47 -0800 Subject: [PATCH 1/3] feat: developing tool to see validation errors for specific records --- pyproject.toml | 1 + src/aind_metadata_viz/app.py | 6 ++- src/aind_metadata_viz/hall_of_shame.py | 52 ++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 src/aind_metadata_viz/hall_of_shame.py diff --git a/pyproject.toml b/pyproject.toml index 87f82d3..54814e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ dependencies = [ 'aind-data-access-api[docdb]', 'aind-data-access-api[rds]', 'aind-data-schema-models', + 'aind-metadata-validator', 'flask', ] diff --git a/src/aind_metadata_viz/app.py b/src/aind_metadata_viz/app.py index f947b0e..e35a95d 100644 --- a/src/aind_metadata_viz/app.py +++ b/src/aind_metadata_viz/app.py @@ -2,6 +2,7 @@ import altair as alt import pandas as pd from aind_metadata_viz import database +from aind_metadata_viz.hall_of_shame import HallOfShame from aind_data_schema import __version__ as ads_version pn.extension(design="material") @@ -355,8 +356,11 @@ def build_row(selected_modality, derived_filter): derived_filter=derived_selector, ) +# Build the hall of shame +hall_of_shame = HallOfShame() + # Put everything in a column and buffer it -main_col = pn.Column(top_row, mid_plot, styles=outer_style, width=515) +main_col = pn.Column(top_row, mid_plot, hall_of_shame.panel(), styles=outer_style, width=515) pn.Row(pn.HSpacer(), left_col, pn.Spacer(width=20), main_col, pn.HSpacer(), margin=20).servable( title="Metadata Portal", diff --git a/src/aind_metadata_viz/hall_of_shame.py b/src/aind_metadata_viz/hall_of_shame.py new file mode 100644 index 0000000..e96cd45 --- /dev/null +++ b/src/aind_metadata_viz/hall_of_shame.py @@ -0,0 +1,52 @@ +"""This class builds a Panel view that displays why a particular record is not validating""" + +import panel as pn +from aind_metadata_viz.database import docdb_api_client +# from aind_metadata_validator import + + +class HallOfShame(): + + def __init__(self): + """Create a new instance of the HallOfShame view""" + + self.field_selector = pn.widgets.Select(name="Field", options=["_id", "name"]) + self.input = pn.widgets.TextInput(name="Value") + self.exact_match = pn.widgets.Checkbox(name="Exact Match", value=False) + + def get_state(self): + """Return the current state of the view""" + + if self.exact_match.value: + query = { + self.field_selector.value: self.input.value + } + else: + query = { + self.field_selector.value: {"$regex": self.input.value} + } + + records = docdb_api_client.retrieve_docdb_records( + query=query, + limit=10 + ) + + self.data = records + validate() + + def validate(self): + """Check if the first record in the data is valid""" + print(self.data[0]) + + + def panel(self): + """Build the panel view""" + + # Top row with search input and option to match exactly + pn.row(self.field_selector, self.input, self.exact_match) + + # Second row with status information + status = pn.widgets.StaticText(value=f"Found {len(self.data)} records, validating first record only.") + + # Below that display the results + return pn.widgets.StaticText(value="Todo") From b02a15882ce2fcd6196b1ad6683a45d965d9fef1 Mon Sep 17 00:00:00 2001 From: Dan Birman Date: Thu, 14 Nov 2024 14:29:00 -0800 Subject: [PATCH 2/3] feat: cleaning up view, still need to add tables for status --- src/aind_metadata_viz/app.py | 34 +++++++++-- src/aind_metadata_viz/database.py | 82 +++++++++++++++++++++----- src/aind_metadata_viz/hall_of_shame.py | 52 ---------------- src/aind_metadata_viz/temp.py | 46 --------------- 4 files changed, 94 insertions(+), 120 deletions(-) delete mode 100644 src/aind_metadata_viz/hall_of_shame.py delete mode 100644 src/aind_metadata_viz/temp.py diff --git a/src/aind_metadata_viz/app.py b/src/aind_metadata_viz/app.py index e35a95d..8b0e614 100644 --- a/src/aind_metadata_viz/app.py +++ b/src/aind_metadata_viz/app.py @@ -2,7 +2,6 @@ import altair as alt import pandas as pd from aind_metadata_viz import database -from aind_metadata_viz.hall_of_shame import HallOfShame from aind_data_schema import __version__ as ads_version pn.extension(design="material") @@ -314,6 +313,11 @@ def hd_style(text): header_pane = pn.pane.Markdown(header, styles=outer_style, width=420) + +total_md = f"

{db.get_overall_valid():1.2f}% of all metadata records are fully {hd_style('valid')}

" + +percent_total = pn.pane.Markdown(total_md, styles=outer_style, width=420) + download_pane = pn.pane.Markdown(download_md) control_col = pn.Column( @@ -331,6 +335,7 @@ def hd_style(text): # Left column (controls) left_col = pn.Column( header_pane, + percent_total, control_col, width=420, ) @@ -356,12 +361,29 @@ def build_row(selected_modality, derived_filter): derived_filter=derived_selector, ) -# Build the hall of shame -hall_of_shame = HallOfShame() - # Put everything in a column and buffer it -main_col = pn.Column(top_row, mid_plot, hall_of_shame.panel(), styles=outer_style, width=515) +main_col = pn.Column(top_row, mid_plot, styles=outer_style, width=515) + +main_row = pn.Row(pn.HSpacer(), left_col, pn.Spacer(width=20), main_col, pn.HSpacer(), margin=20) + +# Add the validator section + +validator_name_selector = pn.widgets.TextInput(name="Enter asset name to validate:", value="single-plane-ophys_655019_2023-04-03_18-17-55", width=800) +pn.state.location.sync(validator_name_selector, {"value": "validator_name"}) + +validator = database.RecordValidator(validator_name_selector.value) + + +def build_validator(validator_name): + validator.update(validator_name) + col = pn.Column(validator_name_selector, validator.panel(), width=(515+20+420), styles=outer_style) + row = pn.Row(pn.HSpacer(), col, pn.HSpacer()) + return row + + +validator_row = pn.bind(build_validator, + validator_name=validator_name_selector) -pn.Row(pn.HSpacer(), left_col, pn.Spacer(width=20), main_col, pn.HSpacer(), margin=20).servable( +pn.Column(main_row, validator_row).servable( title="Metadata Portal", ) diff --git a/src/aind_metadata_viz/database.py b/src/aind_metadata_viz/database.py index 2def1cb..b9f1c00 100644 --- a/src/aind_metadata_viz/database.py +++ b/src/aind_metadata_viz/database.py @@ -1,12 +1,14 @@ from aind_data_access_api.document_db import MetadataDbClient from aind_data_access_api.rds_tables import RDSCredentials from aind_data_access_api.rds_tables import Client +from aind_metadata_validator.metadata_validator import validate_metadata import panel as pn import pandas as pd import param import os import numpy as np -import time +import io +import logging from io import StringIO from aind_data_schema_models.modalities import ( @@ -79,12 +81,8 @@ def __init__( ): """Initialize""" # get data - start = time.time() self._file_data = _get_metadata(test_mode=test_mode) - print(time.time() - start) - start = time.time() self._status_data = _get_status() - print(time.time() - start) # inner join only keeps records that are in both dataframes self.data = pd.merge(self._file_data, self._status_data, on="_id", how="inner") @@ -159,6 +157,10 @@ def get_expected_files(self) -> tuple[list[str], list[str]]: return (expected_files_by_modality, excluded_files_by_modality) + def get_overall_valid(self): + """Get the percentage of valid records""" + return np.sum(self.data['metadata'].values=='valid') / len(self.data) * 100 + def get_file_presence(self): """Get the presence of a list of files @@ -350,15 +352,63 @@ def _get_metadata(test_mode=False) -> pd.DataFrame: ) -@pn.cache(ttl=CACHE_RESET_DAY) -def _get_all(test_mode=False): - filter = {} - limit = 0 if not test_mode else 10 - paginate_batch_size = 500 - response = docdb_api_client.retrieve_docdb_records( - filter_query=filter, - limit=limit, - paginate_batch_size=paginate_batch_size, - ) +class RecordValidator(): - return response + def __init__(self, id): + """Populate the validator with a record and run validation + + Parameters + ---------- + id : _type_ + _description_ + """ + self.update(id) + self.state = None + self.log = None + + def update(self, name): + + records = docdb_api_client.retrieve_docdb_records(filter_query={"name": name}) + print(records) + + if len(records) > 0: + self.record = records[0] + else: + self.state = None + self.log = None + return + + # Create an in-memory buffer to capture log output + log_capture_string = io.StringIO() + + # Set up a custom handler that writes to the buffer + ch = logging.StreamHandler(log_capture_string) + ch.setLevel(logging.INFO) # Adjust level as needed + + # Get the logger used in `validate_metadata` + logger = logging.getLogger() + logger.addHandler(ch) + + # run the validator, capturing any errors + self.state = validate_metadata(self.record) + + ch.flush() + self.log = log_capture_string.getvalue() + logger.removeHandler(ch) + log_capture_string.close() + + print(self.state) + print(self.log) + + def panel(self): + """Return a panel object with the validation results""" + if self.state is None: + return pn.pane.Markdown("No record was found.") + else: + state = pn.pane.Markdown(f"Validation state: {self.state}") + + log = pn.widgets.TextAreaInput(value=self.log, height=400, width=515) + log.disabled = True + + return pn.Column(state, log, width=515) + # return (self.state, self.log) diff --git a/src/aind_metadata_viz/hall_of_shame.py b/src/aind_metadata_viz/hall_of_shame.py deleted file mode 100644 index e96cd45..0000000 --- a/src/aind_metadata_viz/hall_of_shame.py +++ /dev/null @@ -1,52 +0,0 @@ -"""This class builds a Panel view that displays why a particular record is not validating""" - -import panel as pn -from aind_metadata_viz.database import docdb_api_client -# from aind_metadata_validator import - - -class HallOfShame(): - - def __init__(self): - """Create a new instance of the HallOfShame view""" - - self.field_selector = pn.widgets.Select(name="Field", options=["_id", "name"]) - self.input = pn.widgets.TextInput(name="Value") - self.exact_match = pn.widgets.Checkbox(name="Exact Match", value=False) - - def get_state(self): - """Return the current state of the view""" - - if self.exact_match.value: - query = { - self.field_selector.value: self.input.value - } - else: - query = { - self.field_selector.value: {"$regex": self.input.value} - } - - records = docdb_api_client.retrieve_docdb_records( - query=query, - limit=10 - ) - - self.data = records - validate() - - def validate(self): - """Check if the first record in the data is valid""" - print(self.data[0]) - - - def panel(self): - """Build the panel view""" - - # Top row with search input and option to match exactly - pn.row(self.field_selector, self.input, self.exact_match) - - # Second row with status information - status = pn.widgets.StaticText(value=f"Found {len(self.data)} records, validating first record only.") - - # Below that display the results - return pn.widgets.StaticText(value="Todo") diff --git a/src/aind_metadata_viz/temp.py b/src/aind_metadata_viz/temp.py deleted file mode 100644 index f51df84..0000000 --- a/src/aind_metadata_viz/temp.py +++ /dev/null @@ -1,46 +0,0 @@ -from aind_metadata_viz.metadata_helpers import * -from aind_metadata_viz.docdb import _get_all -import json -from aind_data_schema_models.modalities import ( - Modality, - ExpectedFiles, - FileRequirement, -) -from aind_metadata_viz.metadata_helpers import ( - process_record_list, -) -from aind_metadata_viz.metadata_class_map import ( - first_layer_field_mapping, - second_layer_field_mappings, -) -ALL_FILES = sorted( - [ - "data_description", - "acquisition", - "procedures", - "subject", - "instrument", - "processing", - "rig", - "session", - "quality_control", - ] -) - - -# records = _get_all() - -with open('data.json', 'r') as f: - record_list = json.loads(f.read()) - - -file_dfs = {} -# filter by file -for file in ALL_FILES: - expected_fields = second_layer_field_mappings[file] - # get field presence - field_record_list = [record[file] if file in record else None for record in record_list] - - processed = process_record_list(field_record_list, expected_fields, parent=file) - - print(processed) From 18c50d9b3c48080233c81ad72f08c9cdbbf6d173 Mon Sep 17 00:00:00 2001 From: Dan Birman Date: Thu, 14 Nov 2024 14:44:40 -0800 Subject: [PATCH 3/3] feat: functional version of hall-of-shame --- src/aind_metadata_viz/app.py | 26 +++++++------------------- src/aind_metadata_viz/database.py | 28 +++++++++++++++++----------- src/aind_metadata_viz/utils.py | 15 +++++++++++++++ 3 files changed, 39 insertions(+), 30 deletions(-) diff --git a/src/aind_metadata_viz/app.py b/src/aind_metadata_viz/app.py index 8b0e614..b84d57e 100644 --- a/src/aind_metadata_viz/app.py +++ b/src/aind_metadata_viz/app.py @@ -2,21 +2,13 @@ import altair as alt import pandas as pd from aind_metadata_viz import database +from aind_metadata_viz.utils import hd_style, AIND_COLORS from aind_data_schema import __version__ as ads_version pn.extension(design="material") pn.extension("vega") alt.themes.enable("ggplot2") -AIND_COLORS = colors = { - "dark_blue": "#003057", - "light_blue": "#2A7DE1", - "green": "#1D8649", - "yellow": "#FFB71B", - "grey": "#7C7C7F", - "red": "#FF5733", -} - # Define CSS to set the background color background_color = AIND_COLORS[pn.state.location.query_params["background"] if "background" in pn.state.location.query_params else "dark_blue"] css = f""" @@ -281,20 +273,16 @@ def update_selection(event): return pane -def hd_style(text): - return ( - f"{text}" - ) header = ( f"# Metadata Portal\n\n" "This app steps through all of the metadata stored in DocDB and determines whether every record's fields " "(and subfields) are " - f"{hd_style('valid')} for aind-data-schema v{ads_version}, " - f"{hd_style('present')} but invalid, {hd_style('optional')}, " - f"{hd_style('missing')}, or " - f"{hd_style('excluded')} for the record's modality." + f"{hd_style('valid', colors)} for aind-data-schema v{ads_version}, " + f"{hd_style('present', colors)} but invalid, {hd_style('optional', colors)}, " + f"{hd_style('missing', colors)}, or " + f"{hd_style('excluded', colors)} for the record's modality." ) download_md = """ @@ -314,7 +302,7 @@ def hd_style(text): header_pane = pn.pane.Markdown(header, styles=outer_style, width=420) -total_md = f"

{db.get_overall_valid():1.2f}% of all metadata records are fully {hd_style('valid')}

" +total_md = f"

{db.get_overall_valid():1.2f}% of all metadata records are fully {hd_style('valid', colors)}

" percent_total = pn.pane.Markdown(total_md, styles=outer_style, width=420) @@ -371,7 +359,7 @@ def build_row(selected_modality, derived_filter): validator_name_selector = pn.widgets.TextInput(name="Enter asset name to validate:", value="single-plane-ophys_655019_2023-04-03_18-17-55", width=800) pn.state.location.sync(validator_name_selector, {"value": "validator_name"}) -validator = database.RecordValidator(validator_name_selector.value) +validator = database.RecordValidator(validator_name_selector.value, colors) def build_validator(validator_name): diff --git a/src/aind_metadata_viz/database.py b/src/aind_metadata_viz/database.py index b9f1c00..e8f779c 100644 --- a/src/aind_metadata_viz/database.py +++ b/src/aind_metadata_viz/database.py @@ -20,7 +20,7 @@ first_layer_field_mapping, second_layer_field_mappings, ) -from aind_metadata_viz.utils import METASTATE_MAP +from aind_metadata_viz.utils import METASTATE_MAP, hd_style API_GATEWAY_HOST = os.getenv("API_GATEWAY_HOST", "api.allenneuraldynamics-test.org") DATABASE = os.getenv("DATABASE", "metadata_index") @@ -354,7 +354,7 @@ def _get_metadata(test_mode=False) -> pd.DataFrame: class RecordValidator(): - def __init__(self, id): + def __init__(self, id, colors): """Populate the validator with a record and run validation Parameters @@ -365,6 +365,7 @@ def __init__(self, id): self.update(id) self.state = None self.log = None + self.colors = colors def update(self, name): @@ -397,18 +398,23 @@ def update(self, name): logger.removeHandler(ch) log_capture_string.close() - print(self.state) - print(self.log) - def panel(self): """Return a panel object with the validation results""" if self.state is None: return pn.pane.Markdown("No record was found.") else: - state = pn.pane.Markdown(f"Validation state: {self.state}") - - log = pn.widgets.TextAreaInput(value=self.log, height=400, width=515) - log.disabled = True - - return pn.Column(state, log, width=515) + print(self.state["metadata"].value) + state = pn.pane.Markdown(f""" +Overall metadata: {hd_style(METASTATE_MAP[self.state["metadata"].value], self.colors)} +""") + file_state = {} + for file in ALL_FILES: + file_state[file] = hd_style(METASTATE_MAP[self.state[file].value], self.colors) + print(file_state) + df = pd.DataFrame(file_state, index=[0]) + file_state = pn.pane.DataFrame(df, width=920, escape=False) + + log = pn.pane.Markdown(self.log, width=920) + + return pn.Column(state, file_state, log, width=515) # return (self.state, self.log) diff --git a/src/aind_metadata_viz/utils.py b/src/aind_metadata_viz/utils.py index 9838678..5018df6 100644 --- a/src/aind_metadata_viz/utils.py +++ b/src/aind_metadata_viz/utils.py @@ -48,6 +48,21 @@ class MetadataState(int, Enum): "EXASPIM": "SPIM", } +AIND_COLORS = { + "dark_blue": "#003057", + "light_blue": "#2A7DE1", + "green": "#1D8649", + "yellow": "#FFB71B", + "grey": "#7C7C7F", + "red": "#FF5733", +} + + +def hd_style(text, colors): + return ( + f"{text}" + ) + def expected_files_from_modalities( modalities: list[str],