From 0e36df2739aaac654dba12a3c023601c7c9ad775 Mon Sep 17 00:00:00 2001
From: Ethan Cartwright
Date: Thu, 21 Dec 2023 20:13:07 -0500
Subject: [PATCH] add strip_exclusion_formatting flag

---
Review notes (free text placed after the "---" marker, before the diffstat;
this is commentary, not part of the commit message or of any hunk):

NOTE(review): this copy of the patch arrived with its line breaks collapsed.
  The one-record-per-line layout below was restored from the hunk headers'
  line counts. Leading indentation of context/added lines is inferred from
  the Python structure, not read from the target files -- TODO confirm
  against the tree (or apply with whitespace leniency).

NOTE(review): predict_infotypes reads the flag with
  global_config.get("strip_exclusion_formatting"), while perform_basic_checks
  tests config_dict.get("strip_formatting"). The two keys differ. Confirm
  which key is intended and that the config_dict handed to
  perform_basic_checks actually carries it; otherwise the exclusion set is
  normalised but the column name it is compared with is not.

NOTE(review): strip_formatting lower-cases its input and removes every
  character other than a-z, 0-9 and whitespace, so underscores are dropped
  ("email_sent" becomes "emailsent").

NOTE(review): perform_basic_checks assigns the stripped value back to
  metadata.name, i.e. it modifies the object passed in by the caller.

NOTE(review): strip_formatting calls re.sub; no "import re" appears in the
  infotype_utils.py hunks -- presumably the module already imports it, verify.

NOTE(review): the reference_input.py hunk changes the Email_Address weights
  (Name 1 -> 0.4, Values 0 -> 0.6), which the subject line does not mention.
  The "email_recieved" spelling is on an unchanged context line and has to
  stay as-is for the hunk to match.

 .../src/datahub_classify/infotype_predictor.py |  8 +++++++-
 .../src/datahub_classify/infotype_utils.py     | 11 +++++++++++
 .../src/datahub_classify/reference_input.py    |  4 ++--
 3 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/datahub-classify/src/datahub_classify/infotype_predictor.py b/datahub-classify/src/datahub_classify/infotype_predictor.py
index f736b65..44df24a 100644
--- a/datahub-classify/src/datahub_classify/infotype_predictor.py
+++ b/datahub-classify/src/datahub_classify/infotype_predictor.py
@@ -4,6 +4,7 @@
 
 from datahub_classify.constants import EXCLUDE_NAME
 from datahub_classify.helper_classes import ColumnInfo, InfotypeProposal
+from datahub_classify.infotype_utils import strip_formatting
 from datahub_classify.infotype_utils import perform_basic_checks
 
 logger = logging.getLogger(__name__)
@@ -46,6 +47,7 @@ def predict_infotypes(
     logger.debug("===========================================================")
     basic_checks_failed_columns = []
     num_cols_with_infotype_assigned = 0
+    strip_exclusion_formatting = global_config.get("strip_exclusion_formatting")
 
     for column_info in column_infos:
         logger.debug(
@@ -59,7 +61,11 @@ def predict_infotypes(
 
             # convert exclude_name list into a set for o(1) checking
             if EXCLUDE_NAME in config_dict and config_dict[EXCLUDE_NAME] is not None:
-                config_dict[EXCLUDE_NAME] = set(config_dict[EXCLUDE_NAME])
+                config_dict[EXCLUDE_NAME] = (
+                    set(config_dict[EXCLUDE_NAME])
+                    if not strip_exclusion_formatting
+                    else set([strip_formatting(s) for s in config_dict[EXCLUDE_NAME]])
+                )
             else:
                 config_dict[EXCLUDE_NAME] = set()
 
diff --git a/datahub-classify/src/datahub_classify/infotype_utils.py b/datahub-classify/src/datahub_classify/infotype_utils.py
index a499625..d5e6173 100644
--- a/datahub-classify/src/datahub_classify/infotype_utils.py
+++ b/datahub-classify/src/datahub_classify/infotype_utils.py
@@ -12,6 +12,12 @@
 logger = logging.getLogger(__name__)
 
 
+def strip_formatting(s):
+    s = s.lower()
+    s = re.sub(r"[^a-z0-9\s]", "", s)
+    return s
+
+
 # TODO: Exception handling
 # Match regex for Name and Description
 def match_regex(text_to_match: str, regex_list: List[str]) -> float:
@@ -91,6 +97,11 @@ def perform_basic_checks(
     minimum_values_threshold: int,
 ) -> bool:
     basic_checks_status = True
+    metadata.name = (
+        metadata.name
+        if not config_dict.get("strip_formatting")
+        else strip_formatting(metadata.name)
+    )
     if (
         config_dict[PREDICTION_FACTORS_AND_WEIGHTS].get(VALUES, None)
         and len(values) < minimum_values_threshold
diff --git a/datahub-classify/src/datahub_classify/reference_input.py b/datahub-classify/src/datahub_classify/reference_input.py
index 7ab85cf..972ab63 100644
--- a/datahub-classify/src/datahub_classify/reference_input.py
+++ b/datahub-classify/src/datahub_classify/reference_input.py
@@ -3,10 +3,10 @@
 input1 = {
     "Email_Address": {
         "Prediction_Factors_and_Weights": {
-            "Name": 1,
+            "Name": 0.4,
             "Description": 0,
             "Datatype": 0,
-            "Values": 0,
+            "Values": 0.6,
         },
         "ExcludeName": ["email_sent", "email_recieved"],
         "Name": {