From 7fa11e66a2f76088332f93ded26cae10096a4116 Mon Sep 17 00:00:00 2001 From: "tim.reichard" Date: Tue, 11 Jan 2022 14:36:49 -0600 Subject: [PATCH] Adding new EL3 field parsing logic --- .pre-commit-config.yaml | 4 +- HISTORY.rst | 15 ++ Makefile | 2 +- aioradio/file_ingestion.py | 292 +++++++++++++++++++++++++++++++++++-- aioradio/requirements.txt | 20 +-- setup.py | 16 +- 6 files changed, 315 insertions(+), 34 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3eb679b..0915436 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ default_language_version: python: python3.9 repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v3.4.0 + rev: v4.1.0 hooks: - id: check-added-large-files - id: check-ast @@ -15,7 +15,7 @@ repos: - id: requirements-txt-fixer - id: trailing-whitespace - repo: https://github.com/PyCQA/isort - rev: 5.8.0 + rev: 5.10.1 hooks: - id: isort - repo: https://github.com/myint/docformatter diff --git a/HISTORY.rst b/HISTORY.rst index 231627b..fce12e1 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -3,6 +3,21 @@ History ======= +v0.16.0 (2022-01-11) + +* Update ddtrace==0.57.0. +* Update moto==2.3.1. +* Update numpy==1.22.0. +* Update orjson==3.6.5 +* Update pre-commit==2.16.0. +* Update psycopg2-binary==2.9.3. +* Update pylint==2.12.2. +* Update pytest-asyncio==0.16.0. +* Update twine==3.7.1. +* Update wheel==0.37.1 +* Add support for EL3 field parsing. + + v0.15.6 (2021-11-30) * Hard-code redis==3.5.3. diff --git a/Makefile b/Makefile index 3ab07a6..9ba52b9 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ lint: test: . env/bin/activate; \ export AWS_PROFILE=sandbox; \ - pytest -vss --cov=aioradio --cov-config=.coveragerc --cov-report=html --cov-fail-under=60 + pytest -vss --cov=aioradio --cov-config=.coveragerc --cov-report=html --cov-fail-under=50 pre-commit: . env/bin/activate; \ diff --git a/aioradio/file_ingestion.py b/aioradio/file_ingestion.py index f08fe71..6092f07 100644 --- a/aioradio/file_ingestion.py +++ b/aioradio/file_ingestion.py @@ -75,8 +75,8 @@ def __post_init__(self): if not self.entry_year_filter: self.entry_year_filter = { - "start": "2021", - "end": "2025" + "start": "2022", + "end": "2026" } now = datetime.now() @@ -91,7 +91,14 @@ def __post_init__(self): "Enrolled": (now - timedelta(days=50 * 365), now + timedelta(days=365)), "Canceled": (now - timedelta(days=50 * 365), now + timedelta(days=365)), "Dropped": (now - timedelta(days=50 * 365), now + timedelta(days=365)), - "Graduated": (now - timedelta(days=50 * 365), now + timedelta(days=365)) + "Graduated": (now - timedelta(days=50 * 365), now + timedelta(days=365)), + "ProspectDate": (now - timedelta(days=50 * 365), now + timedelta(days=365)), + "FAFSASubmitted": (now - timedelta(days=50 * 365), now + timedelta(days=365)), + "CustomDate1": (now - timedelta(days=50 * 365), now + timedelta(days=365)), + "CustomDate2": (now - timedelta(days=50 * 365), now + timedelta(days=365)), + "CustomDate3": (now - timedelta(days=50 * 365), now + timedelta(days=365)), + "CustomDate4": (now - timedelta(days=50 * 365), now + timedelta(days=365)), + "CustomDate5": (now - timedelta(days=50 * 365), now + timedelta(days=365)) } self.filed_year_min_max = { @@ -128,7 +135,40 @@ def __post_init__(self): "AcademicProgram": 256, "StudentAthlete": 50, "CampusLocation": 50, - "Email": 75 + "Email": 75, + "CellPhoneNumber": 10, + "TextMessageOptIn": 5, + "HomePhone": 10, + "Ethnicity": 1, + "FirstGenFlag": 1, + "EFC": 20, + "HSCode": 6, + "ACTScore": 2, + "SATScore": 4, + "ProspectCode": 15, + "ProspectDate": 10, + "FAFSASubmitted": 10, + "ApplicationPlan": 30, + "AdmitCode": 20, + "College": 30, + "AdmittedProgram": 30, + "HonorsProgram": 5, + "StudentType": 20, + "International": 5, + "CountryOfOrigin": 30, + "StudentStatus": 20, + "Territory": 20, + "EngagementScore": 10, + "CustomFilter1": 20, + "CustomFilter2": 20, + "CustomFilter3": 20, + "CustomFilter4": 20, + "CustomFilter5": 20, + "CustomDate1": 10, + "CustomDate2": 10, + "CustomDate3": 10, + "CustomDate4": 10, + "CustomDate5": 10 } self.gender_map = { @@ -362,6 +402,19 @@ def __post_init__(self): 'prospects': 0 } + self.generic_bool_map = { + 'YES': 'Y', + 'NO': 'N', + 'Y': 'Y', + 'N': 'N', + 'TRUE': 'Y', + 'FALSE': 'N', + '1': 'Y', + '0': 'N' + } + + self.ethnicity_federal_categories = {'1', '2', '3', '4', '5', '6', '7', '8'} + def check_width(self, value: str, field: str, row_idx: int) -> str: """Check field value and truncate if it is longer than expected. @@ -411,8 +464,7 @@ def check_gender(self, value: str) -> str: """ if value != '': - value_upper = value.upper() - value = self.gender_map[value_upper] if value_upper in self.gender_map else '' + value = self.gender_map.get(value.upper(), '') return value @@ -432,8 +484,7 @@ def check_gpa(self, value: str, field: str, row_idx: int) -> str: try: value = '' if not (0 <= float(value) <= 200) else self.check_width(value, field, row_idx) except ValueError: - value_upper = value.upper() - value = self.grades_map[value_upper] if value_upper in self.grades_map else '' + value = self.grades_map.get(value.upper(), '') return value @@ -450,9 +501,7 @@ def check_statecode(self, value: str, field: str, row_idx: int) -> str: """ if value != '': - value_upper = value.upper() - if value_upper in self.state_to_statecode: - value = self.state_to_statecode[value_upper] + value = self.state_to_statecode.get(value.upper(), value) value = self.check_width(value, field, row_idx) return value @@ -576,8 +625,7 @@ def check_athlete(self, value: str) -> str: """ if value != '': - value_upper = value.upper() - value = self.student_athlete_map[value_upper] if value_upper in self.student_athlete_map else 'Y' + value = self.student_athlete_map.get(value.upper(), 'Y') return value @@ -799,10 +847,228 @@ def check_for_prospects(self, row: dict[str, Any]) -> bool: return skip_record + + ############################################################################################### + ############################### New EL3 field parsing functions ############################### + ############################################################################################### + # + # CELLPHONENUMBER + # TEXTMESSAGEOPTIN + # HOMEPHONE + # ETHNICITY + # FIRSTGENFLAG + # EFC + # HSCODE + # ACTSCORE + # SATSCORE + # PROSPECTCODE + # PROSPECTDATE + # FAFSASUBMITTED + # APPLICATIONPLAN + # ADMITCODE + # COLLEGE + # ADMITTEDPROGRAM + # HONORSPROGRAM + # STUDENTTYPE + # INTERNATIONAL + # COUNTRYOFORIGIN + # STUDENTSTATUS + # TERRITORY + # ENGAGEMENTSCORE + # CUSTOMFILTER1, ..., CUSTOMFILTER5 + # CUSTOMDATE1, ..., CUSTOMDATE5 + # + # Many of these fields are parsed using the functions check_generic or check_date + # else they use a function below. + + def check_generic_boolean(self, value: str) -> str: + """Check generic boolean value. + + Args: + value (str): Generic Boolean value + + Returns: + str: Generic Boolean value + """ + + if value != '': + value = self.generic_bool_map.get(value.upper(), '') + + return value + + def check_phone_number(self, value: str, field: str, row_idx: int) -> str: + """Check Cell/Home phone number logic. + + Args: + value (str): Cell/Home phone number value + field (str): Column header field value + row_idx (int): Row number in file + + Returns: + str: Cell/Home phone number value + """ + + if value != '': + value = ''.join(n for n in value if n.isdigit()) + value = self.check_width(value, field, row_idx) + + return value + + def check_ethnicity(self, value: str) -> str: + """Check Ethnicity is a federal category value. + + Args: + value (str): Ethnicity category + + Returns: + str: Ethnicity category + """ + + if value != '' and value not in self.ethnicity_federal_categories: + value = '' + + return value + + @staticmethod + def check_act_score(value: str) -> str: + """Check ACT Score logic. + + Args: + value (str): ACT score + field (str): Column header field value + + Returns: + str: ACT score + """ + + if value != '': + try: + integer = int(value) + value = str(integer) if (1 <= integer <= 36) else '' + except ValueError: + value = '' + + return value + + @staticmethod + def check_sat_score(value: str) -> str: + """Check SAT Score logic. + + Args: + value (str): SAT score + field (str): Column header field value + + Returns: + str: SAT score + """ + + if value != '': + try: + integer = int(value) + value = str(integer) if (400 <= integer <= 1600) else '' + except ValueError: + value = '' + + return value + + @staticmethod + def check_hscode(value: str) -> str: + """Check HSCODE logic. + + Args: + value (str): HSCODE value + field (str): Column header field value + + Returns: + str: HSCODE value + """ + + if value != '' and len(value) == 6: + try: + _ = int(value) + except ValueError: + value = '' + else: + value = '' + + return value + ############################################################################################### ################################### Used by EFI exclusively ################################### ############################################################################################### + def check_generic_boolean_efi(self, records: list[str]): + """Check generic boolean logic. + + Args: + records (list[str]): List of a specific columns values + field (str): Column header field value + row_idx (int): Row number in file + """ + + for idx in range(len(records)): + records[idx] = self.check_generic_boolean(records[idx]) + + def check_phone_number_efi(self, records: list[str], field: str, row_idx: int): + """Check phone number logic. + + Args: + records (list[str]): List of a specific columns values + field (str): Column header field value + row_idx (int): Row number in file + """ + + for idx in range(len(records)): + records[idx] = self.check_phone_number(records[idx], field, row_idx + idx) + + def check_ethnicity_efi(self, records: list[str]): + """Check ethnicity logic. + + Args: + records (list[str]): List of a specific columns values + field (str): Column header field value + row_idx (int): Row number in file + """ + + for idx in range(len(records)): + records[idx] = self.check_ethnicity(records[idx]) + + def check_act_score_efi(self, records: list[str]): + """Check ACT score logic. + + Args: + records (list[str]): List of a specific columns values + field (str): Column header field value + row_idx (int): Row number in file + """ + + for idx in range(len(records)): + records[idx] = self.check_act_score(records[idx]) + + def check_sat_score_efi(self, records: list[str]): + """Check SAT score logic. + + Args: + records (list[str]): List of a specific columns values + field (str): Column header field value + row_idx (int): Row number in file + """ + + for idx in range(len(records)): + records[idx] = self.check_sat_score(records[idx]) + + def check_hscode_efi(self, records: list[str]): + """Check HSCode logic. + + Args: + records (list[str]): List of a specific columns values + field (str): Column header field value + row_idx (int): Row number in file + """ + + for idx in range(len(records)): + records[idx] = self.check_hscode(records[idx]) + def check_year_efi(self, records: list[str], field: str, row_idx: int): """Check year conforms to expected year within time range. diff --git a/aioradio/requirements.txt b/aioradio/requirements.txt index 7f8763b..93b4279 100644 --- a/aioradio/requirements.txt +++ b/aioradio/requirements.txt @@ -2,24 +2,24 @@ aioboto3==9.2.2 aiobotocore==1.4.2 aiojobs==0.3.0 boto3==1.17.106 -ddtrace==0.56.0 +ddtrace==0.57.0 fakeredis==1.7.0 flask==2.0.2 flask-cors==3.0.10 httpx==0.20.0 mandrill==1.0.60 -moto==2.2.17 -numpy==1.21.4 -orjson==3.6.4 -pre-commit==2.15.0 -psycopg2-binary==2.9.2 -pylint==2.12.1 +moto==2.3.1 +numpy==1.22.0 +orjson==3.6.5 +pre-commit==2.16.0 +psycopg2-binary==2.9.3 +pylint==2.12.2 pyodbc==4.0.32 pysmb==1.2.7 pytest==6.2.5 -pytest-asyncio==0.15.1 +pytest-asyncio==0.16.0 pytest-cov==3.0.0 python-json-logger==2.0.2 redis==3.5.3 -twine==3.6.0 -wheel==0.37.0 +twine==3.7.1 +wheel==0.37.1 diff --git a/setup.py b/setup.py index f0ef0ab..2673622 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ long_description = fileobj.read() setup(name='aioradio', - version='0.15.6', + version='0.16.0', description='Generic asynchronous i/o python utilities for AWS services (SQS, S3, DynamoDB, Secrets Manager), Redis, MSSQL (pyodbc), JIRA and more', long_description=long_description, long_description_content_type="text/markdown", @@ -23,13 +23,13 @@ 'aiobotocore>=1.4.2', 'aiojobs>=0.3.0', 'boto3==1.17.106', - 'ddtrace>=0.56.0', + 'ddtrace>=0.57.0', 'fakeredis>=1.7.0', 'httpx>=0.19.0', 'mandrill>=1.0.60', 'numpy>=1.19', - 'orjson>=3.6.4', - 'psycopg2-binary==2.9.2', + 'orjson>=3.6.5', + 'psycopg2-binary==2.9.3', 'pysmb>=1.2.7', 'python-json-logger>=2.0.2', 'redis==3.5.3', @@ -39,11 +39,11 @@ tests_require=[ 'flask>=2.0.2', 'flask-cors>=3.0.10', - 'moto>=2.2.9', - 'pre-commit>=2.15.0', - 'pylint>=2.11.1', + 'moto>=2.3.1', + 'pre-commit>=2.16.0', + 'pylint>=2.11.2', 'pytest>=6.2.5', - 'pytest-asyncio>=0.15.1', + 'pytest-asyncio>=0.16.0', 'pytest-cov>=3.0.0' ], zip_safe=False,