Skip to content

Commit

Permalink
add strip_exclusion_formatting flag
Browse files: browse the repository at this point in the history
  • Loading branch information
ethan-cartwright committed Dec 22, 2023
1 parent 9fd8fb6 commit 0e36df2
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 3 deletions.
8 changes: 7 additions & 1 deletion datahub-classify/src/datahub_classify/infotype_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from datahub_classify.constants import EXCLUDE_NAME
from datahub_classify.helper_classes import ColumnInfo, InfotypeProposal
from datahub_classify.infotype_utils import strip_formatting
from datahub_classify.infotype_utils import perform_basic_checks

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -46,6 +47,7 @@ def predict_infotypes(
logger.debug("===========================================================")
basic_checks_failed_columns = []
num_cols_with_infotype_assigned = 0
strip_exclusion_formatting = global_config.get("strip_exclusion_formatting")

for column_info in column_infos:
logger.debug(
Expand All @@ -59,7 +61,11 @@ def predict_infotypes(

# convert the exclude_name list into a set for O(1) membership checks
if EXCLUDE_NAME in config_dict and config_dict[EXCLUDE_NAME] is not None:
config_dict[EXCLUDE_NAME] = set(config_dict[EXCLUDE_NAME])
config_dict[EXCLUDE_NAME] = (
set(config_dict[EXCLUDE_NAME])
if not strip_exclusion_formatting
else set([strip_formatting(s) for s in config_dict[EXCLUDE_NAME]])
)
else:
config_dict[EXCLUDE_NAME] = set()

Expand Down
11 changes: 11 additions & 0 deletions datahub-classify/src/datahub_classify/infotype_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@
logger = logging.getLogger(__name__)


def strip_formatting(s):
    """Return *s* lowercased with all formatting characters removed.

    After lowercasing, every character that is not one of ``a-z``, ``0-9``
    or whitespace is dropped, e.g. ``"Email_Sent"`` becomes ``"emailsent"``.
    """
    lowered = s.lower()
    # Keep only lowercase letters, digits and whitespace; drop everything else.
    return re.sub(r"[^a-z0-9\s]", "", lowered)


# TODO: Exception handling
# Match regex for Name and Description
def match_regex(text_to_match: str, regex_list: List[str]) -> float:
Expand Down Expand Up @@ -91,6 +97,11 @@ def perform_basic_checks(
minimum_values_threshold: int,
) -> bool:
basic_checks_status = True
metadata.name = (
metadata.name
if not config_dict.get("strip_formatting")
else strip_formatting(metadata.name)
)
if (
config_dict[PREDICTION_FACTORS_AND_WEIGHTS].get(VALUES, None)
and len(values) < minimum_values_threshold
Expand Down
4 changes: 2 additions & 2 deletions datahub-classify/src/datahub_classify/reference_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
input1 = {
"Email_Address": {
"Prediction_Factors_and_Weights": {
"Name": 1,
"Name": 0.4,
"Description": 0,
"Datatype": 0,
"Values": 0,
"Values": 0.6,
},
"ExcludeName": ["email_sent", "email_recieved"],
"Name": {
Expand Down

0 comments on commit 0e36df2

Please sign in to comment.