Skip to content

Commit

Permalink
Add domain frequencies to validation report (#47)
Browse files Browse the repository at this point in the history
* Hover state of "logo" el

* Add about and card component/macro

* Update 4xx templates

* Add data checklist view

* Mapping index

* Isolate alpine component for testing

* Test MappingForm component and validation logic

* Add linting, update ci action

* Update actions

* Adopt convention of using calling parenthesis when using alpine.js components in templates

* Work on create mapping form

* Update sidebar

* Stub out routes for data submissions CRUD

* Work on create mapping use case

* Add show get_by_id method to map repo

* Add use case to get a column map entity

* Trying to tighten up consistency of terminology re ColumnMap entity

* Start working on display of existing column_maps

* Fix language inconsistencies

* Tweak formatting of component

* First pass at mapping validation

* Test mapping validation logic

* Format

* Fixes after manual testing

* Create mapping happy path

* Work on show mapping

* Stub out form that can update a required field, edit an optional field, and add a new optional field

* Work on forms

* Successfully use form

* Testing and tweaking formatting

* Adjusting formatting

* Adjust button container

* Update font of data type in checklist

* Hack to allow for space at bottom of mapping form

* Use card component in data submission show template

* Pull tweaks into component files

* Update seed script, fix storage download_temp method

* Format login page

* Test, lint and format

* Show created date in mapping index

* Move required fields to static prop of ColumnMap entity

* Shift add form to show.html

* Rework edit form

* Testing workflow

* Repositioning buttons

* Update type of component

* Lint and format

* Test column map use cases

* Rename domain dir core to avoid confusion with gis terminology

* Tweak pending items

* Add domain frequencies to validation report

* Update column_map route handlers

* Remove hidden PUT fields and hook

* Display user feedback for invalid mappings

* Display updated date for mappings

* Add rudimentary required field indicator to edit form

* Initialize report to empty list

* Format

* Add updated_at to fake column map repo

* Basic test for column_map index method

* Extract file reading to application layer from controller

* Update import statement

* Test and lint frontend code

---------

Co-authored-by: Andy Kuny <[email protected]>
Co-authored-by: = <=>
  • Loading branch information
sb-2011 and akuny authored Mar 25, 2024
1 parent 45ce15c commit 7e1d160
Show file tree
Hide file tree
Showing 5 changed files with 175 additions and 13 deletions.
9 changes: 3 additions & 6 deletions nad_ch/application/dtos.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,9 @@ class DataSubmissionReportFeature:
invalid_domain_count: int = 0
valid_domain_count: int = 0
invalid_domains: List[str] = field(default_factory=list)
# TODO: Add frequency charts for each field and only take the top 10 if
# more than 10 values exist
# invalid_domain_frequencies: Dict[str, int]
# Set to True if invalid_domains & invalid_domain_frequencies doesn't contain
# a full list of unique domains found in source data
# invalid_domain_list_truncated: bool = False
domain_frequency: Dict[str, Dict[str, int]] = field(default_factory=dict)
# Set to True when too many unexpected domain values are found for a field
high_domain_cardinality: bool = False


@dataclass
Expand Down
31 changes: 24 additions & 7 deletions nad_ch/application/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import glob
from pathlib import Path
from nad_ch.core.entities import ColumnMap
from collections import Counter


class DataValidator:
Expand Down Expand Up @@ -90,7 +91,7 @@ def update_feature_details(self, gdf: GeoDataFrame):
feature_submission.populated_count += populated_count
feature_submission.null_count += null_count

# Update domain specific metrics
# Update invalid domain metrics
column_domain_dict = self.domains["domain"].get(column)
column_mapper_dict = self.domains["mapper"].get(column)
if column_domain_dict and column_mapper_dict:
Expand Down Expand Up @@ -124,12 +125,28 @@ def update_feature_details(self, gdf: GeoDataFrame):
)
feature_submission.invalid_domain_count += invalid_domain_count
feature_submission.valid_domain_count += valid_domain_count
# Can only store up to 10 invalid domains per nad field
invalid_domain_unique_count = len(invalid_domains)
remaining_slots = 10 - len(feature_submission.invalid_domains)
if invalid_domain_unique_count and remaining_slots > 0:
invalid_domains = invalid_domains[:remaining_slots]
feature_submission.invalid_domains.extend(invalid_domains)
# Can only store up to 100 invalid domains per nad field
remaining_slots = 100 - len(feature_submission.invalid_domains)
if invalid_domains and remaining_slots > 0:
feature_submission.invalid_domains.extend(
invalid_domains[:remaining_slots]
)

# Generate frequency table of fields that are domain specific only
if column_domain_dict:
domain_freq = gdf[column].value_counts().to_dict()
if feature_submission.domain_frequency:
domain_freq = dict(
Counter(feature_submission.domain_frequency)
+ Counter(domain_freq)
)
# Check whether the number of unique values in the frequency dictionary
# is more than twice the number of expected domain values
if len(domain_freq.keys()) > 2 * len(column_domain_dict.keys()):
feature_submission.high_domain_cardinality = True
# Reset domain frequency
domain_freq = {}
feature_submission.domain_frequency = domain_freq

def update_overview_details(self, gdf: GeoDataFrame):
self.report_overview.records_count += self.get_record_count(gdf)
Expand Down
2 changes: 2 additions & 0 deletions tests/application/test_dto.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ def test_to_dict_with_numpy_types():
"valid_domain_count": 90,
"invalid_domain_count": 10,
"invalid_domains": [],
"domain_frequency": {},
"high_domain_cardinality": False,
}
assert isinstance(feature_dict["populated_count"], int)
assert isinstance(feature_dict["null_count"], float)
Expand Down
66 changes: 66 additions & 0 deletions tests/application/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,72 @@ def test_update_feature_details():
assert feature.valid_domain_count == 0
assert feature.invalid_domains == ["Anycounty"]

# Domain frequency assertions
for nad_field in ("St_PreSep", "St_PreTyp", "St_PosDir"):
assert data_validator.report_features.get(nad_field).domain_frequency == {}
assert data_validator.report_features.get("State").domain_frequency == {"IN": 10}
assert data_validator.report_features.get("St_PosTyp").domain_frequency == {
"Street": 10
}
assert data_validator.report_features.get("St_PreDir").domain_frequency == {
"South": 10
}
assert data_validator.report_features.get("Placement").domain_frequency == {
"Structure - Rooftop": 10
}
assert data_validator.report_features.get("County").domain_frequency == {
"Anycounty": 10
}
assert all(
data_validator.report_features.get(field).high_domain_cardinality is False
for field in data_validator.report_features.keys()
)


def test_update_feature_details_force_high_domain_cardinality():
    """Check invalid-domain capping and the high-cardinality flag.

    Two domain-restricted columns are filled with a distinct value per row,
    except rows 10 and 20, which receive a valid domain value. The test then
    asserts the invalid/valid counts, that only the first 100 invalid values
    are retained, that the high-cardinality flag is raised for those two
    columns only, and that their frequency tables are empty.
    """
    # column name -> (prefix of the per-row unique value, valid value)
    overrides = {
        "St_PreDir": ("PreDirection", "Northeast"),
        "Placement": ("Place", "Parcel - Centroid"),
    }

    gdf = create_fake_geopandas_dataframe(num_rows=200)
    for column, (prefix, valid_value) in overrides.items():
        gdf[column] = [f"{prefix}{i}" for i in range(len(gdf))]
        gdf.loc[[10, 20], column] = valid_value

    column_map = create_fake_column_map_from_gdf(gdf)
    data_validator = DataValidator(column_map)
    data_validator.initialize_overview_details(gdf, column_map)
    data_validator.update_feature_details(gdf)
    report_features = data_validator.report_features

    # Invalid domain assertions
    for column in ("St_PreDir", "Placement"):
        feature = report_features.get(column)
        assert feature.invalid_domain_count == 198
        assert feature.valid_domain_count == 2
        assert len(feature.invalid_domains) == 100

        values = gdf[column].to_list()
        # The first 102 rows contain the two valid rows (10 and 20), so they
        # hold exactly the 100 invalid values that were saved.
        for domain in values[:102]:
            if domain not in ("Parcel - Centroid", "Northeast"):
                assert domain in feature.invalid_domains
        # Invalid values seen after the cap of 100 was reached are not saved.
        for domain in values[102:]:
            assert domain not in feature.invalid_domains

    # High domain cardinality assertions
    for name in report_features.keys():
        if name not in ("St_PreDir", "Placement"):
            assert report_features.get(name).high_domain_cardinality is False
    for name in ("St_PreDir", "Placement"):
        assert report_features.get(name).high_domain_cardinality is True
    for name in ("St_PreDir", "Placement"):
        assert report_features.get(name).domain_frequency == {}


def test_initialize_overview_details():
gdf = create_fake_geopandas_dataframe(num_rows=1)
Expand Down
80 changes: 80 additions & 0 deletions tests/test_data/baselines.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,8 @@
"invalid_domain_count": 0,
"valid_domain_count": 0,
"invalid_domains": [],
"domain_frequency": {},
"high_domain_cardinality": False,
},
{
"provided_feature_name": "ST",
Expand All @@ -321,6 +323,8 @@
"invalid_domain_count": 0,
"valid_domain_count": 0,
"invalid_domains": [],
"domain_frequency": {},
"high_domain_cardinality": False,
},
{
"provided_feature_name": "ZIP",
Expand All @@ -330,6 +334,8 @@
"invalid_domain_count": 0,
"valid_domain_count": 0,
"invalid_domains": [],
"domain_frequency": {},
"high_domain_cardinality": False,
},
{
"provided_feature_name": "RuleID",
Expand All @@ -339,6 +345,8 @@
"invalid_domain_count": 0,
"valid_domain_count": 0,
"invalid_domains": [],
"domain_frequency": {},
"high_domain_cardinality": False,
},
{
"provided_feature_name": "geometry",
Expand All @@ -348,6 +356,8 @@
"invalid_domain_count": 0,
"valid_domain_count": 0,
"invalid_domains": [],
"domain_frequency": {},
"high_domain_cardinality": False,
},
],
}
Expand Down Expand Up @@ -384,6 +394,8 @@
"invalid_domain_count": 0,
"valid_domain_count": 0,
"invalid_domains": [],
"domain_frequency": {},
"high_domain_cardinality": False,
},
{
"provided_feature_name": "ADD_SUFFIX",
Expand All @@ -393,6 +405,8 @@
"invalid_domain_count": 0,
"valid_domain_count": 0,
"invalid_domains": [],
"domain_frequency": {},
"high_domain_cardinality": False,
},
{
"provided_feature_name": "STR_DIR",
Expand All @@ -402,6 +416,15 @@
"invalid_domain_count": 6,
"valid_domain_count": 154,
"invalid_domains": ["northerns", "southerns"],
"domain_frequency": {
"N": 50,
"W": 36,
"S": 35,
"E": 33,
"northerns": 3,
"southerns": 3,
},
"high_domain_cardinality": False,
},
{
"provided_feature_name": "STR_PRETYP",
Expand All @@ -411,6 +434,13 @@
"invalid_domain_count": 0,
"valid_domain_count": 10,
"invalid_domains": [],
"domain_frequency": {
"STHY": 5,
"CALLE": 2,
"CAMINO": 2,
"NEW MEXICO HWY": 1,
},
"high_domain_cardinality": False,
},
{
"provided_feature_name": "STR_NAME",
Expand All @@ -420,6 +450,8 @@
"invalid_domain_count": 0,
"valid_domain_count": 0,
"invalid_domains": [],
"domain_frequency": {},
"high_domain_cardinality": False,
},
{
"provided_feature_name": "STR_SUFFIX",
Expand All @@ -429,6 +461,27 @@
"invalid_domain_count": 8,
"valid_domain_count": 831,
"invalid_domains": ["Drive Parkway", "Crossings Drive", "Unknown Drive"],
"domain_frequency": {
"RD": 178,
"ST": 167,
"DR": 164,
"AVE": 126,
"LN": 46,
"CT": 36,
"PL": 25,
"BLVD": 22,
"TRL": 18,
"WAY": 18,
"CIR": 14,
"LOOP": 12,
"Crossings Drive": 3,
"Drive Parkway": 3,
"PKWY": 3,
"Unknown Drive": 2,
"HWY": 1,
"RD.": 1,
},
"high_domain_cardinality": False,
},
{
"provided_feature_name": "POST_DIR",
Expand All @@ -438,6 +491,17 @@
"invalid_domain_count": 0,
"valid_domain_count": 328,
"invalid_domains": [],
"domain_frequency": {
"NE": 159,
"NW": 64,
"SE": 51,
"SW": 48,
"N": 2,
"S": 2,
"E": 1,
"W": 1,
},
"high_domain_cardinality": False,
},
{
"provided_feature_name": "ROAD_LABEL",
Expand All @@ -447,6 +511,8 @@
"invalid_domain_count": 0,
"valid_domain_count": 0,
"invalid_domains": [],
"domain_frequency": {},
"high_domain_cardinality": False,
},
{
"provided_feature_name": "COMNAME",
Expand All @@ -456,6 +522,8 @@
"invalid_domain_count": 0,
"valid_domain_count": 0,
"invalid_domains": [],
"domain_frequency": {},
"high_domain_cardinality": False,
},
{
"provided_feature_name": "MSAG_COM",
Expand All @@ -465,6 +533,8 @@
"invalid_domain_count": 0,
"valid_domain_count": 0,
"invalid_domains": [],
"domain_frequency": {},
"high_domain_cardinality": False,
},
{
"provided_feature_name": "ZIPCODE",
Expand All @@ -474,6 +544,8 @@
"invalid_domain_count": 0,
"valid_domain_count": 0,
"invalid_domains": [],
"domain_frequency": {},
"high_domain_cardinality": False,
},
{
"provided_feature_name": "DPID",
Expand All @@ -483,6 +555,8 @@
"invalid_domain_count": 0,
"valid_domain_count": 0,
"invalid_domains": [],
"domain_frequency": {},
"high_domain_cardinality": False,
},
{
"provided_feature_name": "DATE_UPD",
Expand All @@ -492,6 +566,8 @@
"invalid_domain_count": 0,
"valid_domain_count": 0,
"invalid_domains": [],
"domain_frequency": {},
"high_domain_cardinality": False,
},
{
"provided_feature_name": "last_edi_1",
Expand All @@ -501,6 +577,8 @@
"invalid_domain_count": 0,
"valid_domain_count": 0,
"invalid_domains": [],
"domain_frequency": {},
"high_domain_cardinality": False,
},
{
"provided_feature_name": "EXCEPTION",
Expand All @@ -510,6 +588,8 @@
"invalid_domain_count": 0,
"valid_domain_count": 0,
"invalid_domains": [],
"domain_frequency": {},
"high_domain_cardinality": False,
},
],
}

0 comments on commit 7e1d160

Please sign in to comment.