From b83224fb0a886e67164e3c8f7034831de0a5d380 Mon Sep 17 00:00:00 2001 From: Quan Pham Date: Wed, 18 Sep 2024 11:10:41 -0400 Subject: [PATCH 1/9] Implemented `Processor` class and refactored some preliminary processing A `Processor` class has been added, subclassing from the `Invoice` class. This is the first step to refactor invoicing in order to seperate the processing and filtering/exporting functionalities of our current Invoice subclasses. Subclasses of `Processor` should only process invoices and manipulate its internal data, while subclasses of `Invoice` should only perform fitering and exporting, never changing any data itself. In addition to implementing `Processor`, two of its subclasses, `ValidatePIAliasProcessor` and `AddInstitutionProcessor` has been added to perform some preliminary processing steps. --- process_report/process_report.py | 81 +++++-------------- .../processors/add_institution_processor.py | 58 +++++++++++++ process_report/processors/processor.py | 8 ++ .../processors/validate_pi_alias_processor.py | 23 ++++++ process_report/tests/unit_tests.py | 27 ++++--- process_report/tests/util.py | 20 +++++ process_report/util.py | 18 ----- 7 files changed, 145 insertions(+), 90 deletions(-) create mode 100644 process_report/processors/add_institution_processor.py create mode 100644 process_report/processors/processor.py create mode 100644 process_report/processors/validate_pi_alias_processor.py diff --git a/process_report/process_report.py b/process_report/process_report.py index 3f309a2..75e0078 100644 --- a/process_report/process_report.py +++ b/process_report/process_report.py @@ -3,7 +3,6 @@ import sys import datetime -import json import pandas import pyarrow @@ -15,7 +14,10 @@ NERC_total_invoice, bu_internal_invoice, ) - +from process_report.processors import ( + validate_pi_alias_processor, + add_institution_processor, +) ### PI file field names PI_PI_FIELD = "PI" @@ -51,26 +53,6 @@ ALIAS_S3_FILEPATH = "PIs/alias.csv" -def get_institution_from_pi(institute_map, pi_uname): - institution_domain = pi_uname.split("@")[-1] - for i in range(institution_domain.count(".") + 1): - if institution_name := institute_map.get(institution_domain, ""): - break - institution_domain = institution_domain[institution_domain.find(".") + 1 :] - - if institution_name == "": - print(f"Warning: PI name {pi_uname} does not match any institution!") - - return institution_name - - -def load_institute_map() -> dict: - with open("process_report/institute_map.json", "r") as f: - institute_map = json.load(f) - - return institute_map - - def load_alias(alias_file): alias_dict = dict() @@ -220,15 +202,27 @@ def main(): projects = list(set(projects + timed_projects_list)) - merged_dataframe = validate_pi_aliases(merged_dataframe, alias_dict) - merged_dataframe = add_institution(merged_dataframe) + ### Do some preliminary processing + + validate_pi_alias_proc = validate_pi_alias_processor.ValidatePIAliasProcessor( + "", invoice_month, merged_dataframe, alias_dict + ) + validate_pi_alias_proc.process() + + add_institute_proc = add_institution_processor.AddInstitutionProcessor( + "", invoice_month, validate_pi_alias_proc.data + ) + add_institute_proc.process() + + ### Finish preliminary processing + lenovo_inv = lenovo_invoice.LenovoInvoice( - name=args.Lenovo_file, invoice_month=invoice_month, data=merged_dataframe.copy() + name=args.Lenovo_file, invoice_month=invoice_month, data=add_institute_proc.data ) nonbillable_inv = nonbillable_invoice.NonbillableInvoice( name=args.nonbillable_file, invoice_month=invoice_month, - data=merged_dataframe.copy(), + data=add_institute_proc.data, nonbillable_pis=pi, nonbillable_projects=projects, ) @@ -239,7 +233,7 @@ def main(): billable_inv = billable_invoice.BillableInvoice( name=args.output_file, invoice_month=invoice_month, - data=merged_dataframe.copy(), + data=add_institute_proc.data.copy(), nonbillable_pis=pi, nonbillable_projects=projects, old_pi_filepath=old_pi_file, @@ -334,13 +328,6 @@ def timed_projects(timed_projects_file, invoice_date): return dataframe[mask]["Project"].to_list() -def validate_pi_aliases(dataframe: pandas.DataFrame, alias_dict: dict): - for pi, pi_aliases in alias_dict.items(): - dataframe.loc[dataframe[PI_FIELD].isin(pi_aliases), PI_FIELD] = pi - - return dataframe - - def fetch_s3_alias_file(): local_name = "alias.csv" invoice_bucket = get_invoice_bucket() @@ -360,32 +347,6 @@ def backup_to_s3_old_pi_file(old_pi_file): invoice_bucket.upload_file(old_pi_file, f"PIs/Archive/PI {get_iso8601_time()}.csv") -def add_institution(dataframe: pandas.DataFrame): - """Determine every PI's institution name, logging any PI whose institution cannot be determined - This is performed by `get_institution_from_pi()`, which tries to match the PI's username to - a list of known institution email domains (i.e bu.edu), or to several edge cases (i.e rudolph) if - the username is not an email address. - - Exact matches are then mapped to the corresponding institution name. - - I.e "foo@bu.edu" would match with "bu.edu", which maps to the instition name "Boston University" - - The list of mappings are defined in `institute_map.json`. - """ - institute_map = load_institute_map() - dataframe = dataframe.astype({INSTITUTION_FIELD: "str"}) - for i, row in dataframe.iterrows(): - pi_name = row[PI_FIELD] - if pandas.isna(pi_name): - print(f"Project {row[PROJECT_FIELD]} has no PI") - else: - dataframe.at[i, INSTITUTION_FIELD] = get_institution_from_pi( - institute_map, pi_name - ) - - return dataframe - - def export_billables(dataframe, output_file): dataframe.to_csv(output_file, index=False) diff --git a/process_report/processors/add_institution_processor.py b/process_report/processors/add_institution_processor.py new file mode 100644 index 0000000..0d7e94b --- /dev/null +++ b/process_report/processors/add_institution_processor.py @@ -0,0 +1,58 @@ +from dataclasses import dataclass +import json + +import pandas + +from process_report.invoices import invoice +from process_report.processors import processor + + +@dataclass +class AddInstitutionProcessor(processor.Processor): + @staticmethod + def _load_institute_map() -> dict: + with open("process_report/institute_map.json", "r") as f: + institute_map = json.load(f) + + return institute_map + + @staticmethod + def _get_institution_from_pi(institute_map, pi_uname): + institution_domain = pi_uname.split("@")[-1] + for i in range(institution_domain.count(".") + 1): + if institution_name := institute_map.get(institution_domain, ""): + break + institution_domain = institution_domain[institution_domain.find(".") + 1 :] + + if institution_name == "": + print(f"Warning: PI name {pi_uname} does not match any institution!") + + return institution_name + + def _add_institution(self, dataframe: pandas.DataFrame): + """Determine every PI's institution name, logging any PI whose institution cannot be determined + This is performed by `get_institution_from_pi()`, which tries to match the PI's username to + a list of known institution email domains (i.e bu.edu), or to several edge cases (i.e rudolph) if + the username is not an email address. + + Exact matches are then mapped to the corresponding institution name. + + I.e "foo@bu.edu" would match with "bu.edu", which maps to the instition name "Boston University" + + The list of mappings are defined in `institute_map.json`. + """ + institute_map = self._load_institute_map() + dataframe = dataframe.astype({invoice.INSTITUTION_FIELD: "str"}) + for i, row in dataframe.iterrows(): + pi_name = row[invoice.PI_FIELD] + if pandas.isna(pi_name): + print(f"Project {row[invoice.PROJECT_FIELD]} has no PI") + else: + dataframe.at[ + i, invoice.INSTITUTION_FIELD + ] = self._get_institution_from_pi(institute_map, pi_name) + + return dataframe + + def _process(self): + self.data = self._add_institution(self.data) diff --git a/process_report/processors/processor.py b/process_report/processors/processor.py new file mode 100644 index 0000000..5a88fd6 --- /dev/null +++ b/process_report/processors/processor.py @@ -0,0 +1,8 @@ +from dataclasses import dataclass + +from process_report.invoices import invoice + + +@dataclass +class Processor(invoice.Invoice): + pass diff --git a/process_report/processors/validate_pi_alias_processor.py b/process_report/processors/validate_pi_alias_processor.py new file mode 100644 index 0000000..a702858 --- /dev/null +++ b/process_report/processors/validate_pi_alias_processor.py @@ -0,0 +1,23 @@ +from dataclasses import dataclass + +import pandas + +from process_report.invoices import invoice +from process_report.processors import processor + + +@dataclass +class ValidatePIAliasProcessor(processor.Processor): + alias_map: dict + + @staticmethod + def _validate_pi_aliases(dataframe: pandas.DataFrame, alias_dict: dict): + for pi, pi_aliases in alias_dict.items(): + dataframe.loc[ + dataframe[invoice.PI_FIELD].isin(pi_aliases), invoice.PI_FIELD + ] = pi + + return dataframe + + def _process(self): + self.data = self._validate_pi_aliases(self.data, self.alias_map) diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py index 411b55a..717b630 100644 --- a/process_report/tests/unit_tests.py +++ b/process_report/tests/unit_tests.py @@ -218,7 +218,7 @@ def test_export_pi(self): self.assertNotIn("ProjectC", pi_df["Project - Allocation"].tolist()) -class TestGetInstitute(TestCase): +class TestAddInstituteProcessor(TestCase): def test_get_pi_institution(self): institute_map = { "harvard.edu": "Harvard University", @@ -247,31 +247,34 @@ def test_get_pi_institution(self): "g@bidmc.harvard.edu": "Beth Israel Deaconess Medical Center", } + add_institute_proc = test_utils.new_add_institution_processor() + for pi_email, answer in answers.items(): self.assertEqual( - process_report.get_institution_from_pi(institute_map, pi_email), answer + add_institute_proc._get_institution_from_pi(institute_map, pi_email), + answer, ) -class TestAlias(TestCase): - def setUp(self): - self.alias_dict = {"PI1": ["PI1_1", "PI1_2"], "PI2": ["PI2_1"]} - - self.data = pandas.DataFrame( +class TestValidateAliasProcessor(TestCase): + def test_validate_alias(self): + alias_map = {"PI1": ["PI1_1", "PI1_2"], "PI2": ["PI2_1"]} + test_data = pandas.DataFrame( { "Manager (PI)": ["PI1", "PI1_1", "PI1_2", "PI2_1", "PI2_1"], } ) - - self.answer = pandas.DataFrame( + answer_data = pandas.DataFrame( { "Manager (PI)": ["PI1", "PI1", "PI1", "PI2", "PI2"], } ) - def test_validate_alias(self): - output = process_report.validate_pi_aliases(self.data, self.alias_dict) - self.assertTrue(self.answer.equals(output)) + validate_pi_alias_proc = test_utils.new_validate_pi_alias_processor( + data=test_data, alias_map=alias_map + ) + validate_pi_alias_proc.process() + self.assertTrue(answer_data.equals(validate_pi_alias_proc.data)) class TestMonthUtils(TestCase): diff --git a/process_report/tests/util.py b/process_report/tests/util.py index 0d3d3d6..e1f333f 100644 --- a/process_report/tests/util.py +++ b/process_report/tests/util.py @@ -1,5 +1,9 @@ import pandas +from process_report.processors import ( + add_institution_processor, + validate_pi_alias_processor, +) from process_report.invoices import billable_invoice, bu_internal_invoice @@ -27,3 +31,19 @@ def new_bu_internal_invoice( return bu_internal_invoice.BUInternalInvoice( name, invoice_month, data, subsidy_amount ) + + +def new_add_institution_processor( + name="", + invoice_month="0000-00", + data=pandas.DataFrame(), +): + return add_institution_processor.AddInstitutionProcessor(name, invoice_month, data) + + +def new_validate_pi_alias_processor( + name="", invoice_month="0000-00", data=pandas.DataFrame(), alias_map={} +): + return validate_pi_alias_processor.ValidatePIAliasProcessor( + name, invoice_month, data, alias_map + ) diff --git a/process_report/util.py b/process_report/util.py index 45b9ed4..5c4f190 100644 --- a/process_report/util.py +++ b/process_report/util.py @@ -1,6 +1,5 @@ import os import datetime -import json import logging import functools @@ -27,23 +26,6 @@ def get_invoice_bucket(): return s3_resource.Bucket(os.environ.get("S3_BUCKET_NAME", "nerc-invoicing")) -def get_institution_from_pi(institute_map, pi_uname): - institution_key = pi_uname.split("@")[-1] - institution_name = institute_map.get(institution_key, "") - - if institution_name == "": - logger.warn(f"PI name {pi_uname} does not match any institution!") - - return institution_name - - -def load_institute_map() -> dict: - with open("process_report/institute_map.json", "r") as f: - institute_map = json.load(f) - - return institute_map - - def get_iso8601_time(): return datetime.datetime.now().strftime("%Y%m%dT%H%M%SZ") From a1a5992bc6387ea4d00e5599541282f69f2dcd3d Mon Sep 17 00:00:00 2001 From: Quan Pham Date: Wed, 18 Sep 2024 15:28:46 -0400 Subject: [PATCH 2/9] Initialized processor for Lenovo processing --- process_report/processors/lenovo_processor.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 process_report/processors/lenovo_processor.py diff --git a/process_report/processors/lenovo_processor.py b/process_report/processors/lenovo_processor.py new file mode 100644 index 0000000..2a4b162 --- /dev/null +++ b/process_report/processors/lenovo_processor.py @@ -0,0 +1,30 @@ +from dataclasses import dataclass + + +from process_report.invoices import invoice +from process_report.processors import processor + + +@dataclass +class LenovoProcessor(processor.Processor): + LENOVO_SU_TYPES = ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"] + SU_CHARGE_MULTIPLIER = 1 + + def _prepare(self): + self.data = self.data[ + self.data[invoice.SU_TYPE_FIELD].isin(self.LENOVO_SU_TYPES) + ][ + [ + invoice.INVOICE_DATE_FIELD, + invoice.PROJECT_FIELD, + invoice.INSTITUTION_FIELD, + invoice.SU_HOURS_FIELD, + invoice.SU_TYPE_FIELD, + ] + ].copy() + + self.data.rename(columns={invoice.SU_HOURS_FIELD: "SU Hours"}, inplace=True) + self.data.insert(len(self.data.columns), "SU Charge", self.SU_CHARGE_MULTIPLIER) + + def _process(self): + self.data["Charge"] = self.data["SU Hours"] * self.data["SU Charge"] From 0b7d98037259481b2c08ff08fad7bffc23ddee14 Mon Sep 17 00:00:00 2001 From: Quan Pham Date: Wed, 18 Sep 2024 15:53:11 -0400 Subject: [PATCH 3/9] Implemented processor for Lenovo processing Note that, for now, only the Lenovo invoice will take the processed data from the `LenovoProcessor`. All other invoices will take the data from `AddInstituteProcessor`. This is due to the processors adding new columns. This odd code design will be removed once invoices gain the feature to filter out their exported columns. --- process_report/invoices/lenovo_invoice.py | 7 +- process_report/process_report.py | 8 ++- process_report/processors/lenovo_processor.py | 20 +----- process_report/tests/unit_tests.py | 69 ++++--------------- process_report/tests/util.py | 5 ++ 5 files changed, 29 insertions(+), 80 deletions(-) diff --git a/process_report/invoices/lenovo_invoice.py b/process_report/invoices/lenovo_invoice.py index fa3355f..ea47d75 100644 --- a/process_report/invoices/lenovo_invoice.py +++ b/process_report/invoices/lenovo_invoice.py @@ -6,9 +6,8 @@ @dataclass class LenovoInvoice(invoice.Invoice): LENOVO_SU_TYPES = ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"] - SU_CHARGE_MULTIPLIER = 1 - def _prepare(self): + def _prepare_export(self): self.data = self.data[ self.data[invoice.SU_TYPE_FIELD].isin(self.LENOVO_SU_TYPES) ][ @@ -22,7 +21,3 @@ def _prepare(self): ].copy() self.data.rename(columns={invoice.SU_HOURS_FIELD: "SU Hours"}, inplace=True) - self.data.insert(len(self.data.columns), "SU Charge", self.SU_CHARGE_MULTIPLIER) - - def _process(self): - self.data["Charge"] = self.data["SU Hours"] * self.data["SU Charge"] diff --git a/process_report/process_report.py b/process_report/process_report.py index 75e0078..1ce70b5 100644 --- a/process_report/process_report.py +++ b/process_report/process_report.py @@ -17,6 +17,7 @@ from process_report.processors import ( validate_pi_alias_processor, add_institution_processor, + lenovo_processor, ) ### PI file field names @@ -214,10 +215,15 @@ def main(): ) add_institute_proc.process() + lenovo_proc = lenovo_processor.LenovoProcessor( + "", invoice_month, add_institute_proc.data + ) + lenovo_proc.process() + ### Finish preliminary processing lenovo_inv = lenovo_invoice.LenovoInvoice( - name=args.Lenovo_file, invoice_month=invoice_month, data=add_institute_proc.data + name=args.Lenovo_file, invoice_month=invoice_month, data=lenovo_proc.data ) nonbillable_inv = nonbillable_invoice.NonbillableInvoice( name=args.nonbillable_file, diff --git a/process_report/processors/lenovo_processor.py b/process_report/processors/lenovo_processor.py index 2a4b162..808e820 100644 --- a/process_report/processors/lenovo_processor.py +++ b/process_report/processors/lenovo_processor.py @@ -7,24 +7,8 @@ @dataclass class LenovoProcessor(processor.Processor): - LENOVO_SU_TYPES = ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"] SU_CHARGE_MULTIPLIER = 1 - def _prepare(self): - self.data = self.data[ - self.data[invoice.SU_TYPE_FIELD].isin(self.LENOVO_SU_TYPES) - ][ - [ - invoice.INVOICE_DATE_FIELD, - invoice.PROJECT_FIELD, - invoice.INSTITUTION_FIELD, - invoice.SU_HOURS_FIELD, - invoice.SU_TYPE_FIELD, - ] - ].copy() - - self.data.rename(columns={invoice.SU_HOURS_FIELD: "SU Hours"}, inplace=True) - self.data.insert(len(self.data.columns), "SU Charge", self.SU_CHARGE_MULTIPLIER) - def _process(self): - self.data["Charge"] = self.data["SU Hours"] * self.data["SU Charge"] + self.data["SU Charge"] = self.SU_CHARGE_MULTIPLIER + self.data["Charge"] = self.data[invoice.SU_HOURS_FIELD] * self.data["SU Charge"] diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py index 717b630..1fc7002 100644 --- a/process_report/tests/unit_tests.py +++ b/process_report/tests/unit_tests.py @@ -8,7 +8,7 @@ from textwrap import dedent from process_report import process_report, util -from process_report.invoices import lenovo_invoice, nonbillable_invoice +from process_report.invoices import nonbillable_invoice from process_report.tests import util as test_utils @@ -731,63 +731,22 @@ def test_validate_billables(self): ) -class TestExportLenovo(TestCase): - def setUp(self): - data = { - "Invoice Month": [ - "2023-01", - "2023-01", - "2023-01", - "2023-01", - "2023-01", - "2023-01", - ], - "Project - Allocation": [ - "ProjectA", - "ProjectB", - "ProjectC", - "ProjectD", - "ProjectE", - "ProjectF", - ], - "Institution": ["A", "B", "C", "D", "E", "F"], - "SU Hours (GBhr or SUhr)": [1, 10, 100, 4, 432, 10], - "SU Type": [ - "OpenShift GPUA100SXM4", - "OpenShift GPUA100", - "OpenShift GPUA100SXM4", - "OpenStack GPUA100SXM4", - "OpenStack CPU", - "OpenStack GPUK80", - ], - } - self.lenovo_invoice = lenovo_invoice.LenovoInvoice( - "Lenovo", "2023-01", pandas.DataFrame(data) - ) - self.lenovo_invoice.process() - +class TestLenovoProcessor(TestCase): def test_process_lenovo(self): - output_df = self.lenovo_invoice.data - self.assertTrue( - set( - [ - process_report.INVOICE_DATE_FIELD, - process_report.PROJECT_FIELD, - process_report.INSTITUTION_FIELD, - process_report.SU_TYPE_FIELD, - "SU Hours", - "SU Charge", - "Charge", - ] - ).issubset(output_df) + test_invoice = pandas.DataFrame( + { + "SU Hours (GBhr or SUhr)": [1, 10, 100, 4, 432, 10], + } + ) + answer_invoice = test_invoice.copy() + answer_invoice["SU Charge"] = 1 + answer_invoice["Charge"] = ( + answer_invoice["SU Hours (GBhr or SUhr)"] * answer_invoice["SU Charge"] ) - for i, row in output_df.iterrows(): - self.assertIn( - row[process_report.SU_TYPE_FIELD], - ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"], - ) - self.assertEqual(row["Charge"], row["SU Charge"] * row["SU Hours"]) + lenovo_proc = test_utils.new_lenovo_processor(data=test_invoice) + lenovo_proc.process() + self.assertTrue(lenovo_proc.data.equals(answer_invoice)) class TestUploadToS3(TestCase): diff --git a/process_report/tests/util.py b/process_report/tests/util.py index e1f333f..5cf24a1 100644 --- a/process_report/tests/util.py +++ b/process_report/tests/util.py @@ -3,6 +3,7 @@ from process_report.processors import ( add_institution_processor, validate_pi_alias_processor, + lenovo_processor, ) from process_report.invoices import billable_invoice, bu_internal_invoice @@ -47,3 +48,7 @@ def new_validate_pi_alias_processor( return validate_pi_alias_processor.ValidatePIAliasProcessor( name, invoice_month, data, alias_map ) + + +def new_lenovo_processor(name="", invoice_month="0000-00", data=pandas.DataFrame()): + return lenovo_processor.LenovoProcessor(name, invoice_month, data) From a75e5325276e02cf49a2e2fc27abb13fbe65671b Mon Sep 17 00:00:00 2001 From: Quan Pham Date: Wed, 18 Sep 2024 11:51:47 -0400 Subject: [PATCH 4/9] Implemented processors for removing nonbillables and validating billable PIs --- process_report/invoices/billable_invoice.py | 26 ------ process_report/process_report.py | 22 ++++- .../remove_nonbillables_processor.py | 28 ++++++ .../validate_billable_pi_processor.py | 25 ++++++ process_report/tests/unit_tests.py | 85 +++++++++---------- process_report/tests/util.py | 26 +++++- 6 files changed, 134 insertions(+), 78 deletions(-) create mode 100644 process_report/processors/remove_nonbillables_processor.py create mode 100644 process_report/processors/validate_billable_pi_processor.py diff --git a/process_report/invoices/billable_invoice.py b/process_report/invoices/billable_invoice.py index 0cf7d3e..320c0e9 100644 --- a/process_report/invoices/billable_invoice.py +++ b/process_report/invoices/billable_invoice.py @@ -20,8 +20,6 @@ class BillableInvoice(discount_invoice.DiscountInvoice): EXCLUDE_SU_TYPES = ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"] PI_S3_FILEPATH = "PIs/PI.csv" - nonbillable_pis: list[str] - nonbillable_projects: list[str] old_pi_filepath: str @staticmethod @@ -42,26 +40,6 @@ def _load_old_pis(old_pi_filepath) -> pandas.DataFrame: return old_pi_df - @staticmethod - def _remove_nonbillables( - data: pandas.DataFrame, - nonbillable_pis: list[str], - nonbillable_projects: list[str], - ): - return data[ - ~data[invoice.PI_FIELD].isin(nonbillable_pis) - & ~data[invoice.PROJECT_FIELD].isin(nonbillable_projects) - ] - - @staticmethod - def _validate_pi_names(data: pandas.DataFrame): - invalid_pi_projects = data[pandas.isna(data[invoice.PI_FIELD])] - for i, row in invalid_pi_projects.iterrows(): - logger.warn( - f"Billable project {row[invoice.PROJECT_FIELD]} has empty PI field" - ) - return data[~pandas.isna(data[invoice.PI_FIELD])] - @staticmethod def _get_pi_age(old_pi_df: pandas.DataFrame, pi, invoice_month): """Returns time difference between current invoice month and PI's first invoice month @@ -82,10 +60,6 @@ def _get_pi_age(old_pi_df: pandas.DataFrame, pi, invoice_month): return month_diff def _prepare(self): - self.data = self._remove_nonbillables( - self.data, self.nonbillable_pis, self.nonbillable_projects - ) - self.data = self._validate_pi_names(self.data) self.data[invoice.CREDIT_FIELD] = None self.data[invoice.CREDIT_CODE_FIELD] = None self.data[invoice.BALANCE_FIELD] = self.data[invoice.COST_FIELD] diff --git a/process_report/process_report.py b/process_report/process_report.py index 1ce70b5..bea7983 100644 --- a/process_report/process_report.py +++ b/process_report/process_report.py @@ -18,6 +18,8 @@ validate_pi_alias_processor, add_institution_processor, lenovo_processor, + remove_nonbillables_processor, + validate_billable_pi_processor, ) ### PI file field names @@ -233,15 +235,29 @@ def main(): nonbillable_projects=projects, ) + ### Remove nonbillables + + remove_nonbillables_proc = remove_nonbillables_processor.RemoveNonbillables( + "", invoice_month, add_institute_proc.data, pi, projects + ) + remove_nonbillables_proc.process() + + validate_billable_pi_proc = ( + validate_billable_pi_processor.ValidateBillablePIsProcessor( + "", invoice_month, remove_nonbillables_proc.data + ) + ) + validate_billable_pi_proc.process() + + ### Initialize invoices + if args.upload_to_s3: backup_to_s3_old_pi_file(old_pi_file) billable_inv = billable_invoice.BillableInvoice( name=args.output_file, invoice_month=invoice_month, - data=add_institute_proc.data.copy(), - nonbillable_pis=pi, - nonbillable_projects=projects, + data=validate_billable_pi_proc.data.copy(), old_pi_filepath=old_pi_file, ) diff --git a/process_report/processors/remove_nonbillables_processor.py b/process_report/processors/remove_nonbillables_processor.py new file mode 100644 index 0000000..326c98b --- /dev/null +++ b/process_report/processors/remove_nonbillables_processor.py @@ -0,0 +1,28 @@ +from dataclasses import dataclass + +import pandas + +from process_report.invoices import invoice +from process_report.processors import processor + + +@dataclass +class RemoveNonbillables(processor.Processor): + nonbillable_pis: list[str] + nonbillable_projects: list[str] + + @staticmethod + def _remove_nonbillables( + data: pandas.DataFrame, + nonbillable_pis: list[str], + nonbillable_projects: list[str], + ): + return data[ + ~data[invoice.PI_FIELD].isin(nonbillable_pis) + & ~data[invoice.PROJECT_FIELD].isin(nonbillable_projects) + ] + + def _process(self): + self.data = self._remove_nonbillables( + self.data, self.nonbillable_pis, self.nonbillable_projects + ) diff --git a/process_report/processors/validate_billable_pi_processor.py b/process_report/processors/validate_billable_pi_processor.py new file mode 100644 index 0000000..72ab60a --- /dev/null +++ b/process_report/processors/validate_billable_pi_processor.py @@ -0,0 +1,25 @@ +from dataclasses import dataclass +import logging + +import pandas + +from process_report.invoices import invoice +from process_report.processors import processor + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +@dataclass +class ValidateBillablePIsProcessor(processor.Processor): + @staticmethod + def _validate_pi_names(data: pandas.DataFrame): + invalid_pi_projects = data[pandas.isna(data[invoice.PI_FIELD])] + for i, row in invalid_pi_projects.iterrows(): + logger.warn( + f"Billable project {row[invoice.PROJECT_FIELD]} has empty PI field" + ) + return data[~pandas.isna(data[invoice.PI_FIELD])] + + def _process(self): + self.data = self._validate_pi_names(self.data) diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py index 1fc7002..02f3909 100644 --- a/process_report/tests/unit_tests.py +++ b/process_report/tests/unit_tests.py @@ -104,26 +104,6 @@ def test_remove_billables(self): self.assertNotIn("ProjectE", result_df["Project - Allocation"].tolist()) -class TestBillableInvoice(TestCase): - def test_remove_nonbillables(self): - pis = [uuid.uuid4().hex for x in range(10)] - projects = [uuid.uuid4().hex for x in range(10)] - nonbillable_pis = pis[:3] - nonbillable_projects = projects[7:] - billable_pis = pis[3:7] - data = pandas.DataFrame({"Manager (PI)": pis, "Project - Allocation": projects}) - - test_invoice = test_utils.new_billable_invoice() - data = test_invoice._remove_nonbillables( - data, nonbillable_pis, nonbillable_projects - ) - self.assertTrue(data[data["Manager (PI)"].isin(nonbillable_pis)].empty) - self.assertTrue( - data[data["Project - Allocation"].isin(nonbillable_projects)].empty - ) - self.assertTrue(data.equals(data[data["Manager (PI)"].isin(billable_pis)])) - - class TestMergeCSV(TestCase): def setUp(self): self.header = ["ID", "Name", "Age"] @@ -277,6 +257,46 @@ def test_validate_alias(self): self.assertTrue(answer_data.equals(validate_pi_alias_proc.data)) +class TestRemoveNonbillablesProcessor(TestCase): + def test_remove_nonbillables(self): + pis = [uuid.uuid4().hex for x in range(10)] + projects = [uuid.uuid4().hex for x in range(10)] + nonbillable_pis = pis[:3] + nonbillable_projects = projects[7:] + billable_pis = pis[3:7] + data = pandas.DataFrame({"Manager (PI)": pis, "Project - Allocation": projects}) + + remove_nonbillables_proc = test_utils.new_remove_nonbillables_processor() + data = remove_nonbillables_proc._remove_nonbillables( + data, nonbillable_pis, nonbillable_projects + ) + self.assertTrue(data[data["Manager (PI)"].isin(nonbillable_pis)].empty) + self.assertTrue( + data[data["Project - Allocation"].isin(nonbillable_projects)].empty + ) + self.assertTrue(data.equals(data[data["Manager (PI)"].isin(billable_pis)])) + + +class TestValidateBillablePIProcessor(TestCase): + def test_validate_billables(self): + test_data = pandas.DataFrame( + { + "Manager (PI)": ["PI1", math.nan, "PI1", "PI2", "PI2"], + "Project - Allocation": [ + "ProjectA", + "ProjectB", + "ProjectC", + "ProjectD", + "ProjectE", + ], + } + ) + self.assertEqual(1, len(test_data[pandas.isna(test_data["Manager (PI)"])])) + validate_billable_pi_proc = test_utils.new_validate_billable_pi_processor() + output_data = validate_billable_pi_proc._validate_pi_names(test_data) + self.assertEqual(0, len(output_data[pandas.isna(output_data["Manager (PI)"])])) + + class TestMonthUtils(TestCase): def test_get_month_diff(self): testcases = [ @@ -706,31 +726,6 @@ def test_apply_BU_subsidy(self): self.assertEqual(50, output_df.loc[3, "Balance"]) -class TestValidateBillables(TestCase): - def setUp(self): - data = { - "Manager (PI)": ["PI1", math.nan, "PI1", "PI2", "PI2"], - "Project - Allocation": [ - "ProjectA", - "ProjectB", - "ProjectC", - "ProjectD", - "ProjectE", - ], - } - self.dataframe = pandas.DataFrame(data) - - def test_validate_billables(self): - self.assertEqual( - 1, len(self.dataframe[pandas.isna(self.dataframe["Manager (PI)"])]) - ) - test_invoice = test_utils.new_billable_invoice() - validated_df = test_invoice._validate_pi_names(self.dataframe) - self.assertEqual( - 0, len(validated_df[pandas.isna(validated_df["Manager (PI)"])]) - ) - - class TestLenovoProcessor(TestCase): def test_process_lenovo(self): test_invoice = pandas.DataFrame( diff --git a/process_report/tests/util.py b/process_report/tests/util.py index 5cf24a1..100a779 100644 --- a/process_report/tests/util.py +++ b/process_report/tests/util.py @@ -4,6 +4,8 @@ add_institution_processor, validate_pi_alias_processor, lenovo_processor, + remove_nonbillables_processor, + validate_billable_pi_processor, ) from process_report.invoices import billable_invoice, bu_internal_invoice @@ -12,16 +14,12 @@ def new_billable_invoice( name="", invoice_month="0000-00", data=pandas.DataFrame(), - nonbillable_pis=[], - nonbillable_projects=[], old_pi_filepath="", ): return billable_invoice.BillableInvoice( name, invoice_month, data, - nonbillable_pis, - nonbillable_projects, old_pi_filepath, ) @@ -52,3 +50,23 @@ def new_validate_pi_alias_processor( def new_lenovo_processor(name="", invoice_month="0000-00", data=pandas.DataFrame()): return lenovo_processor.LenovoProcessor(name, invoice_month, data) + + +def new_remove_nonbillables_processor( + name="", + invoice_month="0000-00", + data=pandas.DataFrame(), + nonbillable_pis=[], + nonbillable_projects=[], +): + return remove_nonbillables_processor.RemoveNonbillables( + name, invoice_month, data, nonbillable_pis, nonbillable_projects + ) + + +def new_validate_billable_pi_processor( + name="", invoice_month="0000-00", data=pandas.DataFrame() +): + return validate_billable_pi_processor.ValidateBillablePIsProcessor( + name, invoice_month, data + ) From e5d27e9b8dbf24cc49bbc00f2cc381a6f86ce738 Mon Sep 17 00:00:00 2001 From: Quan Pham Date: Wed, 18 Sep 2024 17:47:03 -0400 Subject: [PATCH 5/9] Initialized processor for applying the New-PI Credit and discounts --- .../processors/discount_processor.py | 73 +++++++++ .../processors/new_pi_credit_processor.py | 151 ++++++++++++++++++ 2 files changed, 224 insertions(+) create mode 100644 process_report/processors/discount_processor.py create mode 100644 process_report/processors/new_pi_credit_processor.py diff --git a/process_report/processors/discount_processor.py b/process_report/processors/discount_processor.py new file mode 100644 index 0000000..25666bb --- /dev/null +++ b/process_report/processors/discount_processor.py @@ -0,0 +1,73 @@ +from dataclasses import dataclass +import pandas + +from process_report.processors import processor + + +@dataclass +class DiscountProcessor(processor.Processor): + @staticmethod + def apply_flat_discount( + invoice: pandas.DataFrame, + pi_projects: pandas.DataFrame, + discount_amount: int, + discount_field: str, + balance_field: str, + code_field: str = None, + discount_code: str = None, + ): + """ + Takes in an invoice and a list of PI projects that are a subset of it, + and applies a flat discount to those PI projects. Note that this function + will change the provided `invoice` Dataframe directly. Therefore, it does + not return the changed invoice. + + This function assumes that the balance field shows the remaining cost of the project, + or what the PI would pay before the flat discount is applied. + + If the optional parameters `code_field` and `discount_code` are passed in, + `discount_code` will be comma-APPENDED to the `code_field` of projects where + the discount is applied + + Returns the amount of discount used. + + :param invoice: Dataframe containing all projects + :param pi_projects: A subset of `invoice`, containing all projects for a PI you want to apply the discount + :param discount_amount: The discount given to the PI + :param discount_field: Name of the field to put the discount amount applied to each project + :param balance_field: Name of the balance field + :param code_field: Name of the discount code field + :param discount_code: Code of the discount + """ + + def apply_discount_on_project(remaining_discount_amount, project_i, project): + remaining_project_balance = project[balance_field] + applied_discount = min(remaining_project_balance, remaining_discount_amount) + invoice.at[project_i, discount_field] = applied_discount + invoice.at[project_i, balance_field] = ( + project[balance_field] - applied_discount + ) + remaining_discount_amount -= applied_discount + return remaining_discount_amount + + def apply_credit_code_on_project(project_i): + if code_field and discount_code: + if pandas.isna(invoice.at[project_i, code_field]): + invoice.at[project_i, code_field] = discount_code + else: + invoice.at[project_i, code_field] = ( + invoice.at[project_i, code_field] + "," + discount_code + ) + + remaining_discount_amount = discount_amount + for i, row in pi_projects.iterrows(): + if remaining_discount_amount == 0: + break + else: + remaining_discount_amount = apply_discount_on_project( + remaining_discount_amount, i, row + ) + apply_credit_code_on_project(i) + + discount_used = discount_amount - remaining_discount_amount + return discount_used diff --git a/process_report/processors/new_pi_credit_processor.py b/process_report/processors/new_pi_credit_processor.py new file mode 100644 index 0000000..7e45ebf --- /dev/null +++ b/process_report/processors/new_pi_credit_processor.py @@ -0,0 +1,151 @@ +import sys + +from dataclasses import dataclass +import pandas +import pyarrow + +from process_report import util +from process_report.invoices import invoice +from process_report.processors import discount_processor + + +@dataclass +class NewPICreditProcessor(discount_processor.DiscountProcessor): + NEW_PI_CREDIT_CODE = "0002" + INITIAL_CREDIT_AMOUNT = 1000 + EXCLUDE_SU_TYPES = ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"] + + old_pi_filepath: str + + @staticmethod + def _load_old_pis(old_pi_filepath) -> pandas.DataFrame: + try: + old_pi_df = pandas.read_csv( + old_pi_filepath, + dtype={ + invoice.PI_INITIAL_CREDITS: pandas.ArrowDtype( + pyarrow.decimal128(21, 2) + ), + invoice.PI_1ST_USED: pandas.ArrowDtype(pyarrow.decimal128(21, 2)), + invoice.PI_2ND_USED: pandas.ArrowDtype(pyarrow.decimal128(21, 2)), + }, + ) + except FileNotFoundError: + sys.exit("Applying credit 0002 failed. Old PI file does not exist") + + return old_pi_df + + @staticmethod + def _get_pi_age(old_pi_df: pandas.DataFrame, pi, invoice_month): + """Returns time difference between current invoice month and PI's first invoice month + I.e 0 for new PIs + Will raise an error if the PI'a age is negative, which suggests a faulty invoice, or a program bug""" + first_invoice_month = old_pi_df.loc[ + old_pi_df[invoice.PI_PI_FIELD] == pi, invoice.PI_FIRST_MONTH + ] + if first_invoice_month.empty: + return 0 + + month_diff = util.get_month_diff(invoice_month, first_invoice_month.iat[0]) + if month_diff < 0: + sys.exit( + f"PI {pi} from {first_invoice_month} found in {invoice_month} invoice!" + ) + else: + return month_diff + + def _apply_credits_new_pi( + self, data: pandas.DataFrame, old_pi_df: pandas.DataFrame + ): + def get_initial_credit_amount( + old_pi_df, invoice_month, default_initial_credit_amount + ): + first_month_processed_pis = old_pi_df[ + old_pi_df[invoice.PI_FIRST_MONTH] == invoice_month + ] + if first_month_processed_pis[ + invoice.PI_INITIAL_CREDITS + ].empty or pandas.isna( + new_pi_credit_amount := first_month_processed_pis[ + invoice.PI_INITIAL_CREDITS + ].iat[0] + ): + new_pi_credit_amount = default_initial_credit_amount + + return new_pi_credit_amount + + new_pi_credit_amount = get_initial_credit_amount( + old_pi_df, self.invoice_month, self.INITIAL_CREDIT_AMOUNT + ) + print(f"New PI Credit set at {new_pi_credit_amount} for {self.invoice_month}") + + current_pi_set = set(data[invoice.PI_FIELD]) + for pi in current_pi_set: + credit_eligible_projects = data[ + (data[invoice.PI_FIELD] == pi) + & ~(data[invoice.SU_TYPE_FIELD].isin(self.EXCLUDE_SU_TYPES)) + ] + pi_age = self._get_pi_age(old_pi_df, pi, self.invoice_month) + pi_old_pi_entry = old_pi_df.loc[ + old_pi_df[invoice.PI_PI_FIELD] == pi + ].squeeze() + + if pi_age > 1: + for i, row in credit_eligible_projects.iterrows(): + data.at[i, invoice.BALANCE_FIELD] = row[invoice.COST_FIELD] + else: + if pi_age == 0: + if len(pi_old_pi_entry) == 0: + pi_entry = [pi, self.invoice_month, new_pi_credit_amount, 0, 0] + old_pi_df = pandas.concat( + [ + pandas.DataFrame([pi_entry], columns=old_pi_df.columns), + old_pi_df, + ], + ignore_index=True, + ) + pi_old_pi_entry = old_pi_df.loc[ + old_pi_df[invoice.PI_PI_FIELD] == pi + ].squeeze() + + remaining_credit = new_pi_credit_amount + credit_used_field = invoice.PI_1ST_USED + elif pi_age == 1: + remaining_credit = ( + pi_old_pi_entry[invoice.PI_INITIAL_CREDITS] + - pi_old_pi_entry[invoice.PI_1ST_USED] + ) + credit_used_field = invoice.PI_2ND_USED + + credits_used = self.apply_flat_discount( + data, + credit_eligible_projects, + remaining_credit, + invoice.CREDIT_FIELD, + invoice.BALANCE_FIELD, + invoice.CREDIT_CODE_FIELD, + self.NEW_PI_CREDIT_CODE, + ) + + if (pi_old_pi_entry[credit_used_field] != 0) and ( + credits_used != pi_old_pi_entry[credit_used_field] + ): + print( + f"Warning: PI file overwritten. PI {pi} previously used ${pi_old_pi_entry[credit_used_field]} of New PI credits, now uses ${credits_used}" + ) + old_pi_df.loc[ + old_pi_df[invoice.PI_PI_FIELD] == pi, credit_used_field + ] = credits_used + + return (data, old_pi_df) + + def _prepare(self): + self.data[invoice.CREDIT_FIELD] = None + self.data[invoice.CREDIT_CODE_FIELD] = None + self.data[invoice.BALANCE_FIELD] = self.data[invoice.COST_FIELD] + self.old_pi_df = self._load_old_pis(self.old_pi_filepath) + + def _process(self): + self.data, self.updated_old_pi_df = self._apply_credits_new_pi( + self.data, self.old_pi_df + ) From 83fa4e1a2131cfdd93f2cba7b9868271c5d4f3d1 Mon Sep 17 00:00:00 2001 From: Quan Pham Date: Wed, 18 Sep 2024 12:19:53 -0400 Subject: [PATCH 6/9] Implemented processor for applying the New-PI Credit The `DiscountProcessor` class has been created, to be subclassed by processors which applies discounts, such as the New-PI Credit. This processor class introduces an important class constant, `IS_DISCOUNT_BY_NERC`, which detemines whether the final balance, as opposed to the PI balance (more info below) reflects this discount. During discussions about billing, it was made noted that some discounts are not provided by the MGHPCC, but instead rom other sources, such as the BU subsidy. This provided motivation for the `PI Balance` field, which reflects how much money the PI owes, as opposed to the `Balance` field, which reflects how much money the MGHPCC should receive. `apply_discount_on_project` has been slightly modified, where the PI balance and MGHPCC balance is now calculated seperately. For now, the `PI Balance` field will appear in all invoices after the Billable invoice. --- process_report/invoices/billable_invoice.py | 144 +--- process_report/invoices/invoice.py | 4 + process_report/process_report.py | 14 +- .../processors/discount_processor.py | 17 +- .../processors/new_pi_credit_processor.py | 5 +- process_report/tests/unit_tests.py | 738 +++++++++++------- process_report/tests/util.py | 28 +- 7 files changed, 516 insertions(+), 434 deletions(-) diff --git a/process_report/invoices/billable_invoice.py b/process_report/invoices/billable_invoice.py index 320c0e9..da49ce8 100644 --- a/process_report/invoices/billable_invoice.py +++ b/process_report/invoices/billable_invoice.py @@ -1,74 +1,21 @@ from dataclasses import dataclass import logging -import sys import pandas import pyarrow -from process_report.invoices import invoice, discount_invoice -from process_report import util - +from process_report.invoices import invoice logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @dataclass -class BillableInvoice(discount_invoice.DiscountInvoice): - NEW_PI_CREDIT_CODE = "0002" - INITIAL_CREDIT_AMOUNT = 1000 - EXCLUDE_SU_TYPES = ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"] +class BillableInvoice(invoice.Invoice): PI_S3_FILEPATH = "PIs/PI.csv" old_pi_filepath: str - - @staticmethod - def _load_old_pis(old_pi_filepath) -> pandas.DataFrame: - try: - old_pi_df = pandas.read_csv( - old_pi_filepath, - dtype={ - invoice.PI_INITIAL_CREDITS: pandas.ArrowDtype( - pyarrow.decimal128(21, 2) - ), - invoice.PI_1ST_USED: pandas.ArrowDtype(pyarrow.decimal128(21, 2)), - invoice.PI_2ND_USED: pandas.ArrowDtype(pyarrow.decimal128(21, 2)), - }, - ) - except FileNotFoundError: - sys.exit("Applying credit 0002 failed. Old PI file does not exist") - - return old_pi_df - - @staticmethod - def _get_pi_age(old_pi_df: pandas.DataFrame, pi, invoice_month): - """Returns time difference between current invoice month and PI's first invoice month - I.e 0 for new PIs - Will raise an error if the PI'a age is negative, which suggests a faulty invoice, or a program bug""" - first_invoice_month = old_pi_df.loc[ - old_pi_df[invoice.PI_PI_FIELD] == pi, invoice.PI_FIRST_MONTH - ] - if first_invoice_month.empty: - return 0 - - month_diff = util.get_month_diff(invoice_month, first_invoice_month.iat[0]) - if month_diff < 0: - sys.exit( - f"PI {pi} from {first_invoice_month} found in {invoice_month} invoice!" - ) - else: - return month_diff - - def _prepare(self): - self.data[invoice.CREDIT_FIELD] = None - self.data[invoice.CREDIT_CODE_FIELD] = None - self.data[invoice.BALANCE_FIELD] = self.data[invoice.COST_FIELD] - self.old_pi_df = self._load_old_pis(self.old_pi_filepath) - - def _process(self): - self.data, self.updated_old_pi_df = self._apply_credits_new_pi( - self.data, self.old_pi_df - ) + updated_old_pi_df: pandas.DataFrame def _prepare_export(self): self.updated_old_pi_df = self.updated_old_pi_df.astype( @@ -88,88 +35,3 @@ def export(self): def export_s3(self, s3_bucket): super().export_s3(s3_bucket) s3_bucket.upload_file(self.old_pi_filepath, self.PI_S3_FILEPATH) - - def _apply_credits_new_pi( - self, data: pandas.DataFrame, old_pi_df: pandas.DataFrame - ): - def get_initial_credit_amount( - old_pi_df, invoice_month, default_initial_credit_amount - ): - first_month_processed_pis = old_pi_df[ - old_pi_df[invoice.PI_FIRST_MONTH] == invoice_month - ] - if first_month_processed_pis[ - invoice.PI_INITIAL_CREDITS - ].empty or pandas.isna( - new_pi_credit_amount := first_month_processed_pis[ - invoice.PI_INITIAL_CREDITS - ].iat[0] - ): - new_pi_credit_amount = default_initial_credit_amount - - return new_pi_credit_amount - - new_pi_credit_amount = get_initial_credit_amount( - old_pi_df, self.invoice_month, self.INITIAL_CREDIT_AMOUNT - ) - print(f"New PI Credit set at {new_pi_credit_amount} for {self.invoice_month}") - - current_pi_set = set(data[invoice.PI_FIELD]) - for pi in current_pi_set: - credit_eligible_projects = data[ - (data[invoice.PI_FIELD] == pi) - & ~(data[invoice.SU_TYPE_FIELD].isin(self.EXCLUDE_SU_TYPES)) - ] - pi_age = self._get_pi_age(old_pi_df, pi, self.invoice_month) - pi_old_pi_entry = old_pi_df.loc[ - old_pi_df[invoice.PI_PI_FIELD] == pi - ].squeeze() - - if pi_age > 1: - for i, row in credit_eligible_projects.iterrows(): - data.at[i, invoice.BALANCE_FIELD] = row[invoice.COST_FIELD] - else: - if pi_age == 0: - if len(pi_old_pi_entry) == 0: - pi_entry = [pi, self.invoice_month, new_pi_credit_amount, 0, 0] - old_pi_df = pandas.concat( - [ - pandas.DataFrame([pi_entry], columns=old_pi_df.columns), - old_pi_df, - ], - ignore_index=True, - ) - pi_old_pi_entry = old_pi_df.loc[ - old_pi_df[invoice.PI_PI_FIELD] == pi - ].squeeze() - - remaining_credit = new_pi_credit_amount - credit_used_field = invoice.PI_1ST_USED - elif pi_age == 1: - remaining_credit = ( - pi_old_pi_entry[invoice.PI_INITIAL_CREDITS] - - pi_old_pi_entry[invoice.PI_1ST_USED] - ) - credit_used_field = invoice.PI_2ND_USED - - credits_used = self.apply_flat_discount( - data, - credit_eligible_projects, - remaining_credit, - invoice.CREDIT_FIELD, - invoice.BALANCE_FIELD, - invoice.CREDIT_CODE_FIELD, - self.NEW_PI_CREDIT_CODE, - ) - - if (pi_old_pi_entry[credit_used_field] != 0) and ( - credits_used != pi_old_pi_entry[credit_used_field] - ): - print( - f"Warning: PI file overwritten. PI {pi} previously used ${pi_old_pi_entry[credit_used_field]} of New PI credits, now uses ${credits_used}" - ) - old_pi_df.loc[ - old_pi_df[invoice.PI_PI_FIELD] == pi, credit_used_field - ] = credits_used - - return (data, old_pi_df) diff --git a/process_report/invoices/invoice.py b/process_report/invoices/invoice.py index 8599ec8..6e9e5e2 100644 --- a/process_report/invoices/invoice.py +++ b/process_report/invoices/invoice.py @@ -30,6 +30,10 @@ BALANCE_FIELD = "Balance" ### +### Invoice additional fields (not used in exporting) +PI_BALANCE_FIELD = "PI Balance" +### + @dataclass class Invoice: diff --git a/process_report/process_report.py b/process_report/process_report.py index bea7983..d872336 100644 --- a/process_report/process_report.py +++ b/process_report/process_report.py @@ -20,6 +20,7 @@ lenovo_processor, remove_nonbillables_processor, validate_billable_pi_processor, + new_pi_credit_processor, ) ### PI file field names @@ -235,7 +236,7 @@ def main(): nonbillable_projects=projects, ) - ### Remove nonbillables + ### Perform main processing remove_nonbillables_proc = remove_nonbillables_processor.RemoveNonbillables( "", invoice_month, add_institute_proc.data, pi, projects @@ -249,6 +250,14 @@ def main(): ) validate_billable_pi_proc.process() + new_pi_credit_proc = new_pi_credit_processor.NewPICreditProcessor( + "", + invoice_month, + data=validate_billable_pi_proc.data, + old_pi_filepath=old_pi_file, + ) + new_pi_credit_proc.process() + ### Initialize invoices if args.upload_to_s3: @@ -257,8 +266,9 @@ def main(): billable_inv = billable_invoice.BillableInvoice( name=args.output_file, invoice_month=invoice_month, - data=validate_billable_pi_proc.data.copy(), + data=new_pi_credit_proc.data, old_pi_filepath=old_pi_file, + updated_old_pi_df=new_pi_credit_proc.updated_old_pi_df, ) process_and_export_invoices( diff --git a/process_report/processors/discount_processor.py b/process_report/processors/discount_processor.py index 25666bb..d9a8a79 100644 --- a/process_report/processors/discount_processor.py +++ b/process_report/processors/discount_processor.py @@ -6,11 +6,14 @@ @dataclass class DiscountProcessor(processor.Processor): - @staticmethod + IS_DISCOUNT_BY_NERC = True + def apply_flat_discount( + self, invoice: pandas.DataFrame, pi_projects: pandas.DataFrame, - discount_amount: int, + pi_balance_field: str, + discount_amount: float, discount_field: str, balance_field: str, code_field: str = None, @@ -33,9 +36,10 @@ def apply_flat_discount( :param invoice: Dataframe containing all projects :param pi_projects: A subset of `invoice`, containing all projects for a PI you want to apply the discount + :param pi_balance_field: Name of the field of the PI balance :param discount_amount: The discount given to the PI :param discount_field: Name of the field to put the discount amount applied to each project - :param balance_field: Name of the balance field + :param balance_field: Name of the NERC balance field :param code_field: Name of the discount code field :param discount_code: Code of the discount """ @@ -44,9 +48,10 @@ def apply_discount_on_project(remaining_discount_amount, project_i, project): remaining_project_balance = project[balance_field] applied_discount = min(remaining_project_balance, remaining_discount_amount) invoice.at[project_i, discount_field] = applied_discount - invoice.at[project_i, balance_field] = ( - project[balance_field] - applied_discount - ) + balance_after_discount = project[balance_field] - applied_discount + invoice.at[project_i, pi_balance_field] = balance_after_discount + if self.IS_DISCOUNT_BY_NERC: + invoice.at[project_i, balance_field] = balance_after_discount remaining_discount_amount -= applied_discount return remaining_discount_amount diff --git a/process_report/processors/new_pi_credit_processor.py b/process_report/processors/new_pi_credit_processor.py index 7e45ebf..245524b 100644 --- a/process_report/processors/new_pi_credit_processor.py +++ b/process_report/processors/new_pi_credit_processor.py @@ -14,6 +14,7 @@ class NewPICreditProcessor(discount_processor.DiscountProcessor): NEW_PI_CREDIT_CODE = "0002" INITIAL_CREDIT_AMOUNT = 1000 EXCLUDE_SU_TYPES = ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"] + IS_DISCOUNT_BY_NERC = True old_pi_filepath: str @@ -99,8 +100,8 @@ def get_initial_credit_amount( pi_entry = [pi, self.invoice_month, new_pi_credit_amount, 0, 0] old_pi_df = pandas.concat( [ - pandas.DataFrame([pi_entry], columns=old_pi_df.columns), old_pi_df, + pandas.DataFrame([pi_entry], columns=old_pi_df.columns), ], ignore_index=True, ) @@ -120,6 +121,7 @@ def get_initial_credit_amount( credits_used = self.apply_flat_discount( data, credit_eligible_projects, + invoice.PI_BALANCE_FIELD, remaining_credit, invoice.CREDIT_FIELD, invoice.BALANCE_FIELD, @@ -142,6 +144,7 @@ def get_initial_credit_amount( def _prepare(self): self.data[invoice.CREDIT_FIELD] = None self.data[invoice.CREDIT_CODE_FIELD] = None + self.data[invoice.PI_BALANCE_FIELD] = self.data[invoice.COST_FIELD] self.data[invoice.BALANCE_FIELD] = self.data[invoice.COST_FIELD] self.old_pi_df = self._load_old_pis(self.old_pi_filepath) diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py index 02f3909..721d5b7 100644 --- a/process_report/tests/unit_tests.py +++ b/process_report/tests/unit_tests.py @@ -1,7 +1,6 @@ from unittest import TestCase, mock import tempfile import pandas -import pyarrow import os import uuid import math @@ -311,304 +310,505 @@ def test_get_month_diff(self): util.get_month_diff("2024-16", "2025-03") -class TestCredit0002(TestCase): - def setUp(self): - data = { - "Invoice Month": [ - "2024-03", - "2024-03", - "2024-03", - "2024-03", - "2024-03", - "2024-03", - "2024-03", - "2024-03", - "2024-03", - "2024-03", - "2024-03", - ], - "Manager (PI)": [ - "PI1", - "PI2", - "PI3", - "PI4", - "PI4", - "PI5", - "PI7", - "NewPI1", - "NewPI1", - "NewPI2", - "NewPI2", +class TestNewPICreditProcessor(TestCase): + def _assert_result_invoice_and_old_pi_file( + self, + invoice_month, + test_invoice, + test_old_pi_filepath, + answer_invoice, + answer_old_pi_df, + ): + new_pi_credit_proc = test_utils.new_new_pi_credit_processor( + invoice_month=invoice_month, + data=test_invoice, + old_pi_filepath=test_old_pi_filepath, + ) + new_pi_credit_proc.process() + + answer_invoice = answer_invoice.astype(new_pi_credit_proc.data.dtypes) + answer_old_pi_df = answer_old_pi_df.astype( + new_pi_credit_proc.updated_old_pi_df.dtypes + ) + + self.assertTrue(new_pi_credit_proc.data.equals(answer_invoice)) + self.assertTrue(new_pi_credit_proc.updated_old_pi_df.equals(answer_old_pi_df)) + + def setUp(self) -> None: + self.test_old_pi_file = tempfile.NamedTemporaryFile( + delete=False, mode="w+", suffix=".csv" + ) + + def tearDown(self) -> None: + os.remove(self.test_old_pi_file.name) + + def test_no_new_pi(self): + test_invoice = pandas.DataFrame( + { + "Manager (PI)": ["PI" for _ in range(3)], + "Cost": [100 for _ in range(3)], + "SU Type": ["CPU" for _ in range(3)], + } + ) + + # Other fields of old PI file not accessed if PI is no longer + # eligible for new-PI credit + test_old_pi_df = pandas.DataFrame( + { + "PI": ["PI"], + "First Invoice Month": ["2024-01"], + "Initial Credits": [1000], + } + ) + test_old_pi_df.to_csv(self.test_old_pi_file.name, index=False) + + answer_invoice = pandas.concat( + [ + test_invoice, + pandas.DataFrame( + { + "Credit": [None for _ in range(3)], + "Credit Code": [None for _ in range(3)], + "PI Balance": [100 for _ in range(3)], + "Balance": [100 for _ in range(3)], + } + ), ], - "SU Type": [ - "CPU", - "CPU", - "CPU", - "CPU", - "CPU", - "CPU", - "CPU", - "CPU", - "CPU", - "CPU", - "CPU", + axis=1, + ) + + answer_old_pi_df = test_old_pi_df.copy() + + self._assert_result_invoice_and_old_pi_file( + "2024-06", + test_invoice, + self.test_old_pi_file.name, + answer_invoice, + answer_old_pi_df, + ) + + def test_one_new_pi(self): + """Invoice with one completely new PI""" + + # One allocation + invoice_month = "2024-06" + test_invoice = pandas.DataFrame( + { + "Manager (PI)": ["PI"], + "Cost": [100], + "SU Type": ["CPU"], + } + ) + + test_old_pi_df = pandas.DataFrame( + columns=[ + "PI", + "First Invoice Month", + "Initial Credits", + "1st Month Used", + "2nd Month Used", + ] + ) + test_old_pi_df.to_csv(self.test_old_pi_file.name, index=False) + + answer_invoice = pandas.concat( + [ + test_invoice, + pandas.DataFrame( + { + "Credit": [100], + "Credit Code": ["0002"], + "PI Balance": [0], + "Balance": [0], + } + ), ], - "Project - Allocation": [ - "ProjectA", - "ProjectB", - "ProjectC", - "ProjectD", - "ProjectE", - "ProjectF", - "ProjectG", - "ProjectH", - "ProjectI", - "ProjectJ", - "ProjectK", + axis=1, + ) + + answer_old_pi_df = pandas.DataFrame( + { + "PI": ["PI"], + "First Invoice Month": ["2024-06"], + "Initial Credits": [1000], + "1st Month Used": [100], + "2nd Month Used": [0], + } + ) + + self._assert_result_invoice_and_old_pi_file( + invoice_month, + test_invoice, + self.test_old_pi_file.name, + answer_invoice, + answer_old_pi_df, + ) + + # Two allocations, costs partially covered + test_invoice = pandas.DataFrame( + { + "Manager (PI)": ["PI", "PI"], + "Cost": [500, 1000], + "SU Type": ["CPU", "CPU"], + } + ) + + answer_invoice = pandas.concat( + [ + test_invoice, + pandas.DataFrame( + { + "Credit": [500, 500], + "Credit Code": ["0002", "0002"], + "PI Balance": [0, 500], + "Balance": [0, 500], + } + ), ], - "Cost": [10, 100, 10000, 500, 100, 400, 200, 250, 250, 700, 700], - } - answer_df_dict = { - "Invoice Month": [ - "2024-03", - "2024-03", - "2024-03", - "2024-03", - "2024-03", - "2024-03", - "2024-03", - "2024-03", - "2024-03", - "2024-03", - "2024-03", + axis=1, + ) + + answer_old_pi_df = pandas.DataFrame( + { + "PI": ["PI"], + "First Invoice Month": ["2024-06"], + "Initial Credits": [1000], + "1st Month Used": [1000], + "2nd Month Used": [0], + } + ) + + self._assert_result_invoice_and_old_pi_file( + invoice_month, + test_invoice, + self.test_old_pi_file.name, + answer_invoice, + answer_old_pi_df, + ) + + # Two allocations, costs completely covered + test_invoice = pandas.DataFrame( + { + "Manager (PI)": ["PI", "PI"], + "Cost": [500, 400], + "SU Type": ["CPU", "CPU"], + } + ) + + answer_invoice = pandas.concat( + [ + test_invoice, + pandas.DataFrame( + { + "Credit": [500, 400], + "Credit Code": ["0002", "0002"], + "PI Balance": [0, 0], + "Balance": [0, 0], + } + ), ], - "Manager (PI)": [ - "PI1", - "PI2", - "PI3", - "PI4", - "PI4", - "PI5", - "PI7", - "NewPI1", - "NewPI1", - "NewPI2", - "NewPI2", + axis=1, + ) + + answer_old_pi_df = pandas.DataFrame( + { + "PI": ["PI"], + "First Invoice Month": ["2024-06"], + "Initial Credits": [1000], + "1st Month Used": [900], + "2nd Month Used": [0], + } + ) + + self._assert_result_invoice_and_old_pi_file( + invoice_month, + test_invoice, + self.test_old_pi_file.name, + answer_invoice, + answer_old_pi_df, + ) + + def test_one_month_pi(self): + """PI has appeared in invoices for one month""" + + # Remaining credits completely covers costs + invoice_month = "2024-07" + test_invoice = pandas.DataFrame( + { + "Manager (PI)": ["PI"], + "Cost": [200], + "SU Type": ["CPU"], + } + ) + + test_old_pi_df = pandas.DataFrame( + { + "PI": ["PI"], + "First Invoice Month": ["2024-06"], + "Initial Credits": [1000], + "1st Month Used": [500], + "2nd Month Used": [0], + } + ) + test_old_pi_df.to_csv(self.test_old_pi_file.name, index=False) + + answer_invoice = pandas.concat( + [ + test_invoice, + pandas.DataFrame( + { + "Credit": [200], + "Credit Code": ["0002"], + "PI Balance": [0], + "Balance": [0], + } + ), ], - "SU Type": [ - "CPU", - "CPU", - "CPU", - "CPU", - "CPU", - "CPU", - "CPU", - "CPU", - "CPU", - "CPU", - "CPU", + axis=1, + ) + + answer_old_pi_df = pandas.DataFrame( + { + "PI": ["PI"], + "First Invoice Month": ["2024-06"], + "Initial Credits": [1000], + "1st Month Used": [500], + "2nd Month Used": [200], + } + ) + + self._assert_result_invoice_and_old_pi_file( + invoice_month, + test_invoice, + self.test_old_pi_file.name, + answer_invoice, + answer_old_pi_df, + ) + + # Remaining credits partially covers costs + test_invoice = pandas.DataFrame( + { + "Manager (PI)": ["PI"], + "Cost": [600], + "SU Type": ["CPU"], + } + ) + + answer_invoice = pandas.concat( + [ + test_invoice, + pandas.DataFrame( + { + "Credit": [500], + "Credit Code": ["0002"], + "PI Balance": [100], + "Balance": [100], + } + ), ], - "Project - Allocation": [ - "ProjectA", - "ProjectB", - "ProjectC", - "ProjectD", - "ProjectE", - "ProjectF", - "ProjectG", - "ProjectH", - "ProjectI", - "ProjectJ", - "ProjectK", + axis=1, + ) + + answer_old_pi_df = pandas.DataFrame( + { + "PI": ["PI"], + "First Invoice Month": ["2024-06"], + "Initial Credits": [1000], + "1st Month Used": [500], + "2nd Month Used": [500], + } + ) + + self._assert_result_invoice_and_old_pi_file( + invoice_month, + test_invoice, + self.test_old_pi_file.name, + answer_invoice, + answer_old_pi_df, + ) + + def test_two_new_pi(self): + """Two PIs of different age""" + + # Costs partially and completely covered + invoice_month = "2024-07" + test_invoice = pandas.DataFrame( + { + "Manager (PI)": ["PI1", "PI1", "PI2"], + "Cost": [800, 500, 500], + "SU Type": ["CPU", "CPU", "CPU"], + } + ) + + test_old_pi_df = pandas.DataFrame( + { + "PI": ["PI1"], + "First Invoice Month": ["2024-06"], + "Initial Credits": [1000], + "1st Month Used": [500], + "2nd Month Used": [0], + } + ) + test_old_pi_df.to_csv(self.test_old_pi_file.name, index=False) + + answer_invoice = pandas.concat( + [ + test_invoice, + pandas.DataFrame( + { + "Credit": [500, None, 500], + "Credit Code": ["0002", None, "0002"], + "PI Balance": [300, 500, 0], + "Balance": [300, 500, 0], + } + ), ], - "Cost": [10, 100, 10000, 500, 100, 400, 200, 250, 250, 700, 700], - "Credit": [None, None, None, 100, None, 400, 200, 250, 250, 500, None], - "Credit Code": [ - None, - None, - None, - "0002", - None, - "0002", - "0002", - "0002", - "0002", - "0002", - None, + axis=1, + ) + + answer_old_pi_df = pandas.DataFrame( + { + "PI": ["PI1", "PI2"], + "First Invoice Month": ["2024-06", "2024-07"], + "Initial Credits": [1000, 1000], + "1st Month Used": [500, 500], + "2nd Month Used": [500, 0], + } + ) + + self._assert_result_invoice_and_old_pi_file( + invoice_month, + test_invoice, + self.test_old_pi_file.name, + answer_invoice, + answer_old_pi_df, + ) + + def test_old_pi_file_overwritten(self): + """If PI already has entry in Old PI file, + their initial credits and PI entry could be overwritten""" + + invoice_month = "2024-06" + test_invoice = pandas.DataFrame( + { + "Manager (PI)": ["PI", "PI"], + "Cost": [500, 500], + "SU Type": ["CPU", "CPU"], + } + ) + + test_old_pi_df = pandas.DataFrame( + { + "PI": ["PI"], + "First Invoice Month": ["2024-06"], + "Initial Credits": [500], + "1st Month Used": [200], + "2nd Month Used": [0], + } + ) + test_old_pi_df.to_csv(self.test_old_pi_file.name, index=False) + + answer_invoice = pandas.concat( + [ + test_invoice, + pandas.DataFrame( + { + "Credit": [500, None], + "Credit Code": ["0002", None], + "PI Balance": [0, 500], + "Balance": [0, 500], + } + ), ], - "Balance": [10, 100, 10000, 400, 100, 0, 0, 0, 0, 200, 700], - } - self.dataframe = pandas.DataFrame(data) - self.dataframe["Credit"] = None - self.dataframe["Credit Code"] = None - self.dataframe["Balance"] = self.dataframe["Cost"] - self.answer_dataframe = pandas.DataFrame(answer_df_dict) - old_pi = [ - "PI,First Invoice Month,Initial Credits,1st Month Used,2nd Month Used", - "PI1,2023-09,500,200,0", - "PI2,2024-01,2000,0,0", - "PI3,2024-01,2000,1000,500", - "PI4,2024-02,1000,900,0", - "PI5,2024-02,1000,300,500", - "PI6,2024-02,1000,700,0", - "PI7,2024-03,500,300,0", # This as current month we're testing, new PIs should get $500 - "PI8,2024-04,1000,500,0", - ] - self.old_pi_df_answer = ( - pandas.DataFrame( - { - "PI": [ - "PI1", - "PI2", - "PI3", - "PI4", - "PI5", - "PI6", - "PI7", - "NewPI1", - "NewPI2", - "PI8", - ], - "First Invoice Month": [ - "2023-09", - "2024-01", - "2024-01", - "2024-02", - "2024-02", - "2024-02", - "2024-03", - "2024-03", - "2024-03", - "2024-04", - ], - "Initial Credits": [ - 500, - 2000, - 2000, - 1000, - 1000, - 1000, - 500, - 500, - 500, - 1000, - ], - "1st Month Used": [200, 0, 1000, 900, 300, 700, 200, 500, 500, 500], - "2nd Month Used": [0, 0, 500, 100, 400, 0, 0, 0, 0, 0], - } - ) - .astype( - { - "Initial Credits": pandas.ArrowDtype(pyarrow.decimal128(21, 2)), - "1st Month Used": pandas.ArrowDtype(pyarrow.decimal128(21, 2)), - "2nd Month Used": pandas.ArrowDtype(pyarrow.decimal128(21, 2)), - }, - ) - .sort_values(by="PI", ignore_index=True) + axis=1, ) - # Contains cases with new, one month old, two month old, older PI, and future PI that hasn't appeared in invoices yet - # For each invoice month, test case where pi has 1 project, >1, and has spare credit - old_pi_file = tempfile.NamedTemporaryFile( - delete=False, mode="w+", suffix=".csv" + answer_old_pi_df = pandas.DataFrame( + { + "PI": ["PI"], + "First Invoice Month": ["2024-06"], + "Initial Credits": [500], + "1st Month Used": [500], + "2nd Month Used": [0], + } + ) + + self._assert_result_invoice_and_old_pi_file( + invoice_month, + test_invoice, + self.test_old_pi_file.name, + answer_invoice, + answer_old_pi_df, ) - for pi in old_pi: - old_pi_file.write(pi + "\n") - self.old_pi_file = old_pi_file.name - self.dataframe_no_gpu = pandas.DataFrame( + def test_excluded_su_types(self): + """Certain SU types can be excluded from the credit""" + + invoice_month = "2024-06" + test_invoice = pandas.DataFrame( { - "Invoice Month": [ - "2024-03", - "2024-03", - "2024-03", - "2024-03", - "2024-03", - ], - "Manager (PI)": ["PI1", "PI1", "PI1", "PI2", "PI2"], + "Manager (PI)": ["PI", "PI", "PI", "PI"], + "Cost": [600, 600, 600, 600], "SU Type": [ - "GPU", - "OpenShift GPUA100SXM4", - "OpenStack GPUA100SXM4", + "CPU", "OpenShift GPUA100SXM4", + "GPU", "OpenStack GPUA100SXM4", ], - "Cost": [500, 100, 100, 500, 500], } ) - self.dataframe_no_gpu["Credit"] = None - self.dataframe_no_gpu["Credit Code"] = None - self.dataframe_no_gpu["Balance"] = self.dataframe_no_gpu["Cost"] - old_pi_no_gpu = [ - "PI,First Invoice Month,Initial Credits,1st Month Used,2nd Month Used", - "OldPI,2024-03,500,200,0", - ] - old_pi_no_gpu_file = tempfile.NamedTemporaryFile( - delete=False, mode="w", suffix=".csv" + + test_old_pi_df = pandas.DataFrame( + columns=[ + "PI", + "First Invoice Month", + "Initial Credits", + "1st Month Used", + "2nd Month Used", + ] + ) + test_old_pi_df.to_csv(self.test_old_pi_file.name, index=False) + + answer_invoice = pandas.concat( + [ + test_invoice, + pandas.DataFrame( + { + "Credit": [600, None, 400, None], + "Credit Code": ["0002", None, "0002", None], + "PI Balance": [0, 600, 200, 600], + "Balance": [0, 600, 200, 600], + } + ), + ], + axis=1, ) - for pi in old_pi_no_gpu: - old_pi_no_gpu_file.write(pi + "\n") - self.old_pi_no_gpu_file = old_pi_no_gpu_file.name - self.no_gpu_df_answer = pandas.DataFrame( + + answer_old_pi_df = pandas.DataFrame( { - "Invoice Month": [ - "2024-03", - "2024-03", - "2024-03", - "2024-03", - "2024-03", - ], - "Manager (PI)": ["PI1", "PI1", "PI1", "PI2", "PI2"], - "SU Type": [ - "GPU", - "OpenShift GPUA100SXM4", - "OpenStack GPUA100SXM4", - "OpenShift GPUA100SXM4", - "OpenStack GPUA100SXM4", - ], - "Cost": [500, 100, 100, 500, 500], - "Credit": [500, None, None, None, None], - "Credit Code": ["0002", None, None, None, None], - "Balance": [0.0, 100.0, 100.0, 500.0, 500.0], + "PI": ["PI"], + "First Invoice Month": ["2024-06"], + "Initial Credits": [1000], + "1st Month Used": [1000], + "2nd Month Used": [0], } ) - def tearDown(self): - os.remove(self.old_pi_file) - os.remove(self.old_pi_no_gpu_file) - - def test_apply_credit_0002(self): - test_invoice = test_utils.new_billable_invoice(invoice_month="2024-03") - old_pi_df = test_invoice._load_old_pis(self.old_pi_file) - dataframe, updated_old_pi_df = test_invoice._apply_credits_new_pi( - self.dataframe, old_pi_df - ) - dataframe = dataframe.astype({"Credit": "float64", "Balance": "int64"}) - updated_old_pi_df = updated_old_pi_df.astype( - dtype={ - "Initial Credits": pandas.ArrowDtype(pyarrow.decimal128(21, 2)), - "1st Month Used": pandas.ArrowDtype(pyarrow.decimal128(21, 2)), - "2nd Month Used": pandas.ArrowDtype(pyarrow.decimal128(21, 2)), - }, - ).sort_values(by=["PI"], ignore_index=True) - self.assertTrue(self.answer_dataframe.equals(dataframe)) - self.assertTrue(self.old_pi_df_answer.equals(updated_old_pi_df)) - - def test_no_gpu(self): - test_invoice = test_utils.new_billable_invoice(invoice_month="2024-03") - old_pi_df = test_invoice._load_old_pis(self.old_pi_no_gpu_file) - dataframe, _ = test_invoice._apply_credits_new_pi( - self.dataframe_no_gpu, old_pi_df - ) - dataframe = dataframe.astype({"Credit": "float64", "Balance": "float64"}) - self.assertTrue(self.no_gpu_df_answer.equals(dataframe)) + self._assert_result_invoice_and_old_pi_file( + invoice_month, + test_invoice, + self.test_old_pi_file.name, + answer_invoice, + answer_old_pi_df, + ) def test_apply_credit_error(self): + """Test faulty data""" old_pi_df = pandas.DataFrame( {"PI": ["PI1"], "First Invoice Month": ["2024-04"]} ) invoice_month = "2024-03" - test_invoice = test_utils.new_billable_invoice() + test_invoice = test_utils.new_new_pi_credit_processor() with self.assertRaises(SystemExit): test_invoice._get_pi_age(old_pi_df, "PI1", invoice_month) diff --git a/process_report/tests/util.py b/process_report/tests/util.py index 100a779..b50c9c9 100644 --- a/process_report/tests/util.py +++ b/process_report/tests/util.py @@ -6,22 +6,9 @@ lenovo_processor, remove_nonbillables_processor, validate_billable_pi_processor, + new_pi_credit_processor, ) -from process_report.invoices import billable_invoice, bu_internal_invoice - - -def new_billable_invoice( - name="", - invoice_month="0000-00", - data=pandas.DataFrame(), - old_pi_filepath="", -): - return billable_invoice.BillableInvoice( - name, - invoice_month, - data, - old_pi_filepath, - ) +from process_report.invoices import bu_internal_invoice def new_bu_internal_invoice( @@ -70,3 +57,14 @@ def new_validate_billable_pi_processor( return validate_billable_pi_processor.ValidateBillablePIsProcessor( name, invoice_month, data ) + + +def new_new_pi_credit_processor( + name="", + invoice_month="0000-00", + data=pandas.DataFrame(), + old_pi_filepath="", +): + return new_pi_credit_processor.NewPICreditProcessor( + name, invoice_month, data, old_pi_filepath + ) From c7fe4c11f9cdceca1e116f9e30b6b708381fbf10 Mon Sep 17 00:00:00 2001 From: Quan Pham Date: Wed, 18 Sep 2024 13:07:18 -0400 Subject: [PATCH 7/9] Initialized processor for BU subsidy and copied over processing logic --- .../processors/bu_subsidy_processor.py | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 process_report/processors/bu_subsidy_processor.py diff --git a/process_report/processors/bu_subsidy_processor.py b/process_report/processors/bu_subsidy_processor.py new file mode 100644 index 0000000..7ef524f --- /dev/null +++ b/process_report/processors/bu_subsidy_processor.py @@ -0,0 +1,55 @@ +from dataclasses import dataclass +from decimal import Decimal + +from process_report.invoices import invoice +from process_report.processors import discount_processor + + +@dataclass +class BUSubsidyProcessor(discount_processor.DiscountProcessor): + IS_DISCOUNT_BY_NERC = False + + subsidy_amount: int + + def _apply_subsidy(self, dataframe, subsidy_amount): + pi_list = dataframe[invoice.PI_FIELD].unique() + + for pi in pi_list: + pi_projects = dataframe[dataframe[invoice.PI_FIELD] == pi] + self.apply_flat_discount( + dataframe, + pi_projects, + subsidy_amount, + invoice.SUBSIDY_FIELD, + invoice.BALANCE_FIELD, + ) + + return dataframe + + def _prepare(self): + def get_project(row): + project_alloc = row[invoice.PROJECT_FIELD] + if project_alloc.rfind("-") == -1: + return project_alloc + else: + return project_alloc[: project_alloc.rfind("-")] + + self.data = self.data[ + self.data[invoice.INSTITUTION_FIELD] == "Boston University" + ].copy() + self.data["Project"] = self.data.apply(get_project, axis=1) + self.data[invoice.SUBSIDY_FIELD] = Decimal(0) + self.data = self.data[ + [ + invoice.INVOICE_DATE_FIELD, + invoice.PI_FIELD, + "Project", + invoice.COST_FIELD, + invoice.CREDIT_FIELD, + invoice.SUBSIDY_FIELD, + invoice.BALANCE_FIELD, + ] + ] + + def _process(self): + self.data = self._apply_bu_subsidy(self.data, self.subsidy_amount) From 87b6b87a453587153b96977025db39fa6800fa15 Mon Sep 17 00:00:00 2001 From: Quan Pham Date: Wed, 18 Sep 2024 13:31:58 -0400 Subject: [PATCH 8/9] Implemented processor for BU Subsidy Since all current discounts have been implemented as processors, the `DiscountInvoice` class is now removed. No invoice class will now class `_prepare()` or `_process()`. `BUSubsidyProcessor`, which handles processing for the BU subsidy, requires introduction of a new field, `BU Balance`, because the subsidy is not provided by NERC. Because of this, `BU Balance` indicates the money which BU (not the PI they are subsidizing) owes to the MGHPCC. --- .../invoices/bu_internal_invoice.py | 41 +---- process_report/invoices/discount_invoice.py | 79 ---------- process_report/invoices/invoice.py | 1 + process_report/process_report.py | 10 +- .../processors/bu_subsidy_processor.py | 25 +-- .../processors/discount_processor.py | 12 +- process_report/tests/unit_tests.py | 142 +++++------------- process_report/tests/util.py | 21 +-- 8 files changed, 78 insertions(+), 253 deletions(-) delete mode 100644 process_report/invoices/discount_invoice.py diff --git a/process_report/invoices/bu_internal_invoice.py b/process_report/invoices/bu_internal_invoice.py index 8226b97..525d02e 100644 --- a/process_report/invoices/bu_internal_invoice.py +++ b/process_report/invoices/bu_internal_invoice.py @@ -1,27 +1,13 @@ from dataclasses import dataclass -from decimal import Decimal -import process_report.invoices.invoice as invoice -import process_report.invoices.discount_invoice as discount_invoice +from process_report.invoices import invoice @dataclass -class BUInternalInvoice(discount_invoice.DiscountInvoice): +class BUInternalInvoice(invoice.Invoice): subsidy_amount: int - def _prepare(self): - def get_project(row): - project_alloc = row[invoice.PROJECT_FIELD] - if project_alloc.rfind("-") == -1: - return project_alloc - else: - return project_alloc[: project_alloc.rfind("-")] - - self.data = self.data[ - self.data[invoice.INSTITUTION_FIELD] == "Boston University" - ].copy() - self.data["Project"] = self.data.apply(get_project, axis=1) - self.data[invoice.SUBSIDY_FIELD] = Decimal(0) + def _prepare_export(self): self.data = self.data[ [ invoice.INVOICE_DATE_FIELD, @@ -29,14 +15,12 @@ def get_project(row): "Project", invoice.COST_FIELD, invoice.CREDIT_FIELD, - invoice.SUBSIDY_FIELD, + invoice.BU_BALANCE_FIELD, invoice.BALANCE_FIELD, ] ] - def _process(self): - data_summed_projects = self._sum_project_allocations(self.data) - self.data = self._apply_subsidy(data_summed_projects, self.subsidy_amount) + self.data = self._sum_project_allocations(self.data) def _sum_project_allocations(self, dataframe): """A project may have multiple allocations, and therefore multiple rows @@ -53,18 +37,3 @@ def _sum_project_allocations(self, dataframe): data_no_dup.loc[no_dup_project_mask, sum_fields] = sum_fields_sums return data_no_dup - - def _apply_subsidy(self, dataframe, subsidy_amount): - pi_list = dataframe[invoice.PI_FIELD].unique() - - for pi in pi_list: - pi_projects = dataframe[dataframe[invoice.PI_FIELD] == pi] - self.apply_flat_discount( - dataframe, - pi_projects, - subsidy_amount, - invoice.SUBSIDY_FIELD, - invoice.BALANCE_FIELD, - ) - - return dataframe diff --git a/process_report/invoices/discount_invoice.py b/process_report/invoices/discount_invoice.py deleted file mode 100644 index 333b998..0000000 --- a/process_report/invoices/discount_invoice.py +++ /dev/null @@ -1,79 +0,0 @@ -from dataclasses import dataclass - -import pandas - -import process_report.invoices.invoice as invoice - - -@dataclass -class DiscountInvoice(invoice.Invoice): - """ - Invoice class containing functions useful for applying discounts - on dataframes - """ - - @staticmethod - def apply_flat_discount( - invoice: pandas.DataFrame, - pi_projects: pandas.DataFrame, - discount_amount: int, - discount_field: str, - balance_field: str, - code_field: str = None, - discount_code: str = None, - ): - """ - Takes in an invoice and a list of PI projects that are a subset of it, - and applies a flat discount to those PI projects. Note that this function - will change the provided `invoice` Dataframe directly. Therefore, it does - not return the changed invoice. - - This function assumes that the balance field shows the remaining cost of the project, - or what the PI would pay before the flat discount is applied. - - If the optional parameters `code_field` and `discount_code` are passed in, - `discount_code` will be comma-APPENDED to the `code_field` of projects where - the discount is applied - - Returns the amount of discount used. - - :param invoice: Dataframe containing all projects - :param pi_projects: A subset of `invoice`, containing all projects for a PI you want to apply the discount - :param discount_amount: The discount given to the PI - :param discount_field: Name of the field to put the discount amount applied to each project - :param balance_field: Name of the balance field - :param code_field: Name of the discount code field - :param discount_code: Code of the discount - """ - - def apply_discount_on_project(remaining_discount_amount, project_i, project): - remaining_project_balance = project[balance_field] - applied_discount = min(remaining_project_balance, remaining_discount_amount) - invoice.at[project_i, discount_field] = applied_discount - invoice.at[project_i, balance_field] = ( - project[balance_field] - applied_discount - ) - remaining_discount_amount -= applied_discount - return remaining_discount_amount - - def apply_credit_code_on_project(project_i): - if code_field and discount_code: - if pandas.isna(invoice.at[project_i, code_field]): - invoice.at[project_i, code_field] = discount_code - else: - invoice.at[project_i, code_field] = ( - invoice.at[project_i, code_field] + "," + discount_code - ) - - remaining_discount_amount = discount_amount - for i, row in pi_projects.iterrows(): - if remaining_discount_amount == 0: - break - else: - remaining_discount_amount = apply_discount_on_project( - remaining_discount_amount, i, row - ) - apply_credit_code_on_project(i) - - discount_used = discount_amount - remaining_discount_amount - return discount_used diff --git a/process_report/invoices/invoice.py b/process_report/invoices/invoice.py index 6e9e5e2..8cf200e 100644 --- a/process_report/invoices/invoice.py +++ b/process_report/invoices/invoice.py @@ -32,6 +32,7 @@ ### Invoice additional fields (not used in exporting) PI_BALANCE_FIELD = "PI Balance" +BU_BALANCE_FIELD = "BU Balance" ### diff --git a/process_report/process_report.py b/process_report/process_report.py index d872336..bba3a22 100644 --- a/process_report/process_report.py +++ b/process_report/process_report.py @@ -21,6 +21,7 @@ remove_nonbillables_processor, validate_billable_pi_processor, new_pi_credit_processor, + bu_subsidy_processor, ) ### PI file field names @@ -258,6 +259,11 @@ def main(): ) new_pi_credit_proc.process() + bu_subsidy_proc = bu_subsidy_processor.BUSubsidyProcessor( + "", invoice_month, new_pi_credit_proc.data.copy(), args.BU_subsidy_amount + ) + bu_subsidy_proc.process() + ### Initialize invoices if args.upload_to_s3: @@ -278,13 +284,13 @@ def main(): nerc_total_inv = NERC_total_invoice.NERCTotalInvoice( name=args.NERC_total_invoice_file, invoice_month=invoice_month, - data=billable_inv.data.copy(), + data=new_pi_credit_proc.data.copy(), ) bu_internal_inv = bu_internal_invoice.BUInternalInvoice( name=args.BU_invoice_file, invoice_month=invoice_month, - data=billable_inv.data.copy(), + data=bu_subsidy_proc.data.copy(), subsidy_amount=args.BU_subsidy_amount, ) diff --git a/process_report/processors/bu_subsidy_processor.py b/process_report/processors/bu_subsidy_processor.py index 7ef524f..b93299d 100644 --- a/process_report/processors/bu_subsidy_processor.py +++ b/process_report/processors/bu_subsidy_processor.py @@ -11,16 +11,19 @@ class BUSubsidyProcessor(discount_processor.DiscountProcessor): subsidy_amount: int - def _apply_subsidy(self, dataframe, subsidy_amount): - pi_list = dataframe[invoice.PI_FIELD].unique() + def _apply_bu_subsidy(self, dataframe, subsidy_amount): + pi_list = dataframe[ + dataframe[invoice.INSTITUTION_FIELD] == "Boston University" + ][invoice.PI_FIELD].unique() for pi in pi_list: pi_projects = dataframe[dataframe[invoice.PI_FIELD] == pi] self.apply_flat_discount( dataframe, pi_projects, + invoice.PI_BALANCE_FIELD, subsidy_amount, - invoice.SUBSIDY_FIELD, + invoice.BU_BALANCE_FIELD, invoice.BALANCE_FIELD, ) @@ -34,22 +37,8 @@ def get_project(row): else: return project_alloc[: project_alloc.rfind("-")] - self.data = self.data[ - self.data[invoice.INSTITUTION_FIELD] == "Boston University" - ].copy() self.data["Project"] = self.data.apply(get_project, axis=1) - self.data[invoice.SUBSIDY_FIELD] = Decimal(0) - self.data = self.data[ - [ - invoice.INVOICE_DATE_FIELD, - invoice.PI_FIELD, - "Project", - invoice.COST_FIELD, - invoice.CREDIT_FIELD, - invoice.SUBSIDY_FIELD, - invoice.BALANCE_FIELD, - ] - ] + self.data[invoice.BU_BALANCE_FIELD] = Decimal(0) def _process(self): self.data = self._apply_bu_subsidy(self.data, self.subsidy_amount) diff --git a/process_report/processors/discount_processor.py b/process_report/processors/discount_processor.py index d9a8a79..353111e 100644 --- a/process_report/processors/discount_processor.py +++ b/process_report/processors/discount_processor.py @@ -25,13 +25,16 @@ def apply_flat_discount( will change the provided `invoice` Dataframe directly. Therefore, it does not return the changed invoice. - This function assumes that the balance field shows the remaining cost of the project, + This function assumes that the PI balance field shows the remaining cost of the project, or what the PI would pay before the flat discount is applied. If the optional parameters `code_field` and `discount_code` are passed in, `discount_code` will be comma-APPENDED to the `code_field` of projects where the discount is applied + If `IS_DISCOUNT_BY_NERC` is true, the discount will be reflected in the + MGHPCC Balance field. + Returns the amount of discount used. :param invoice: Dataframe containing all projects @@ -39,17 +42,18 @@ def apply_flat_discount( :param pi_balance_field: Name of the field of the PI balance :param discount_amount: The discount given to the PI :param discount_field: Name of the field to put the discount amount applied to each project - :param balance_field: Name of the NERC balance field + :param balance_field: Name of the MGHPCC balance field :param code_field: Name of the discount code field :param discount_code: Code of the discount """ def apply_discount_on_project(remaining_discount_amount, project_i, project): - remaining_project_balance = project[balance_field] + remaining_project_balance = project[pi_balance_field] applied_discount = min(remaining_project_balance, remaining_discount_amount) invoice.at[project_i, discount_field] = applied_discount + pi_balance_after_discount = project[pi_balance_field] - applied_discount balance_after_discount = project[balance_field] - applied_discount - invoice.at[project_i, pi_balance_field] = balance_after_discount + invoice.at[project_i, pi_balance_field] = pi_balance_after_discount if self.IS_DISCOUNT_BY_NERC: invoice.at[project_i, balance_field] = balance_after_discount remaining_discount_amount -= applied_discount diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py index 721d5b7..229cc98 100644 --- a/process_report/tests/unit_tests.py +++ b/process_report/tests/unit_tests.py @@ -813,117 +813,49 @@ def test_apply_credit_error(self): test_invoice._get_pi_age(old_pi_df, "PI1", invoice_month) -class TestBUSubsidy(TestCase): - def setUp(self): - data = { - "Invoice Month": [ - "2024-03", - "2024-03", - "2024-03", - "2024-03", - "2024-03", - "2024-03", - "2024-03", - "2024-03", - ], - "Manager (PI)": ["PI1", "PI1", "PI2", "PI2", "PI3", "PI3", "PI4", "PI4"], - "Institution": [ - "Boston University", - "Boston University", - "Boston University", - "Boston University", - "Harvard University", # Test case for non-BU PIs - "Harvard University", - "Boston University", - "Boston University", - ], - "Project - Allocation": [ - "ProjectA-e6413", - "ProjectA-t575e6", # Test case for project with >1 allocation - "ProjectB-fddgfygg", - "ProjectB-5t143t", - "ProjectC-t14334", - "ProjectD", # Test case for correctly extracting project name - "ProjectE-test-r25135", # Test case for BU PI with >1 project - "ProjectF", - ], - "Cost": [1050, 500, 100, 925, 10000, 1000, 1050, 100], - "Credit": [ - 1000, - 0, - 100, - 900, - 0, - 0, - 1000, - 0, - ], # Test cases where PI does/dones't have credits alreadys - "Balance": [ - 50, - 500, - 0, - 25, - 10000, - 1000, - 50, - 100, - ], # Test case where subsidy does/doesn't cover fully balance - } - self.dataframe = pandas.DataFrame(data) - self.subsidy = 100 - - def test_apply_BU_subsidy(self): - test_invoice = test_utils.new_bu_internal_invoice( - data=self.dataframe, subsidy_amount=self.subsidy - ) - test_invoice.process() - output_df = test_invoice.data.reset_index() - - self.assertTrue( - set( - [ - process_report.INVOICE_DATE_FIELD, - "Project", - process_report.PI_FIELD, - process_report.COST_FIELD, - process_report.CREDIT_FIELD, - process_report.SUBSIDY_FIELD, - process_report.BALANCE_FIELD, - ] - ).issubset(output_df) +class TestBUSubsidyProcessor(TestCase): + def test_apply_subsidy(self): + subsidy_amount = 500 + invoice_month = "2024-06" + test_invoice = pandas.DataFrame( + { + "Manager (PI)": ["PI1", "PI1", "PI2", "PI2", "PI3", "PI3"], + "Institution": [ + "Boston University", + "Boston University", + "Harvard University", # Test case for non-BU PIs + "Harvard University", + "Boston University", + "Boston University", + ], + "Project - Allocation": [ + "P1-A1", + "P2-A1", + "P3-A1", + "P3-A2", + "P4", # 2 Test cases for correctly extracting project name + "P4-P4-A1", + ], + "PI Balance": [400, 600, 1000, 2000, 500, 500], + "Balance": [400, 600, 1000, 2000, 500, 500], + } ) - self.assertTrue( - set(["PI1", "PI2", "PI4"]).issubset(output_df["Manager (PI)"].unique()) - ) - self.assertFalse("PI3" in output_df["Project"].unique()) + answer_invoice = test_invoice.copy() + answer_invoice["Project"] = ["P1", "P2", "P3", "P3", "P4", "P4-P4"] + answer_invoice["BU Balance"] = [400, 100, 0, 0, 500, 0] + answer_invoice["PI Balance"] = [0, 500, 1000, 2000, 0, 500] - self.assertTrue( - set(["ProjectA", "ProjectB", "ProjectE-test", "ProjectF"]).issubset( - output_df["Project"].unique() - ) - ) - self.assertFalse( - set(["ProjectC-t14334", "ProjectC", "ProjectD"]).intersection( - output_df["Project"].unique() - ) + bu_subsidy_proc = test_utils.new_bu_subsidy_processor( + invoice_month=invoice_month, + data=test_invoice, + subsidy_amount=subsidy_amount, ) + bu_subsidy_proc.process() - self.assertEqual(4, len(output_df.index)) - self.assertEqual(1550, output_df.loc[0, "Cost"]) - self.assertEqual(1025, output_df.loc[1, "Cost"]) - self.assertEqual(1050, output_df.loc[2, "Cost"]) - self.assertEqual(100, output_df.loc[3, "Cost"]) - - self.assertEqual(100, output_df.loc[0, "Subsidy"]) - self.assertEqual(25, output_df.loc[1, "Subsidy"]) - self.assertEqual(50, output_df.loc[2, "Subsidy"]) - self.assertEqual(50, output_df.loc[3, "Subsidy"]) + answer_invoice = answer_invoice.astype(bu_subsidy_proc.data.dtypes) - self.assertEqual(450, output_df.loc[0, "Balance"]) - self.assertEqual(0, output_df.loc[1, "Balance"]) - self.assertEqual(0, output_df.loc[2, "Balance"]) - self.assertEqual(50, output_df.loc[3, "Balance"]) + self.assertTrue(bu_subsidy_proc.data.equals(answer_invoice)) class TestLenovoProcessor(TestCase): diff --git a/process_report/tests/util.py b/process_report/tests/util.py index b50c9c9..6652ef4 100644 --- a/process_report/tests/util.py +++ b/process_report/tests/util.py @@ -7,16 +7,8 @@ remove_nonbillables_processor, validate_billable_pi_processor, new_pi_credit_processor, + bu_subsidy_processor, ) -from process_report.invoices import bu_internal_invoice - - -def new_bu_internal_invoice( - name="", invoice_month="0000-00", data=pandas.DataFrame(), subsidy_amount=0 -): - return bu_internal_invoice.BUInternalInvoice( - name, invoice_month, data, subsidy_amount - ) def new_add_institution_processor( @@ -68,3 +60,14 @@ def new_new_pi_credit_processor( return new_pi_credit_processor.NewPICreditProcessor( name, invoice_month, data, old_pi_filepath ) + + +def new_bu_subsidy_processor( + name="", + invoice_month="0000-00", + data=pandas.DataFrame(), + subsidy_amount="", +): + return bu_subsidy_processor.BUSubsidyProcessor( + name, invoice_month, data, subsidy_amount + ) From 8ba638c29c43151b3aa74f1ec78133351a49f1a4 Mon Sep 17 00:00:00 2001 From: Quan Pham Date: Wed, 18 Sep 2024 15:00:15 -0400 Subject: [PATCH 9/9] Allow invoice subclasses to explicitly declare their exported columns and column names Two class attributes, `export_columns_list` and `exported_columns_map`, has been added to `Invoice`, along with a class function `_filter_columns()`. Subclasses of `Invoice` must now define `export_columns_list`, containing the ordered list of columns that must be exported in their respective invoices. Subclasses can optional define `exported_columns_map`, containing mappings between "internal" column names and what their name should be when exported. The field name `RATE_FIELD` has been added to `invoice.py`. It was previously forgotten. --- process_report/invoices/NERC_total_invoice.py | 18 ++++++++++ process_report/invoices/billable_invoice.py | 18 ++++++++++ .../invoices/bu_internal_invoice.py | 33 +++++++++++-------- process_report/invoices/invoice.py | 11 +++++++ process_report/invoices/lenovo_invoice.py | 22 ++++++------- .../invoices/nonbillable_invoice.py | 15 +++++++++ process_report/process_report.py | 17 +++++----- process_report/tests/unit_tests.py | 12 +++++++ process_report/tests/util.py | 9 +++++ 9 files changed, 122 insertions(+), 33 deletions(-) diff --git a/process_report/invoices/NERC_total_invoice.py b/process_report/invoices/NERC_total_invoice.py index 9133333..335c52a 100644 --- a/process_report/invoices/NERC_total_invoice.py +++ b/process_report/invoices/NERC_total_invoice.py @@ -12,6 +12,24 @@ class NERCTotalInvoice(invoice.Invoice): "University of Rhode Island", ] + export_columns_list = [ + invoice.INVOICE_DATE_FIELD, + invoice.PROJECT_FIELD, + invoice.PROJECT_ID_FIELD, + invoice.PI_FIELD, + invoice.INVOICE_EMAIL_FIELD, + invoice.INVOICE_ADDRESS_FIELD, + invoice.INSTITUTION_FIELD, + invoice.INSTITUTION_ID_FIELD, + invoice.SU_HOURS_FIELD, + invoice.SU_TYPE_FIELD, + invoice.RATE_FIELD, + invoice.COST_FIELD, + invoice.CREDIT_FIELD, + invoice.CREDIT_CODE_FIELD, + invoice.BALANCE_FIELD, + ] + @property def output_path(self) -> str: return f"NERC-{self.invoice_month}-Total-Invoice.csv" diff --git a/process_report/invoices/billable_invoice.py b/process_report/invoices/billable_invoice.py index da49ce8..a1d0363 100644 --- a/process_report/invoices/billable_invoice.py +++ b/process_report/invoices/billable_invoice.py @@ -14,6 +14,24 @@ class BillableInvoice(invoice.Invoice): PI_S3_FILEPATH = "PIs/PI.csv" + export_columns_list = [ + invoice.INVOICE_DATE_FIELD, + invoice.PROJECT_FIELD, + invoice.PROJECT_ID_FIELD, + invoice.PI_FIELD, + invoice.INVOICE_EMAIL_FIELD, + invoice.INVOICE_ADDRESS_FIELD, + invoice.INSTITUTION_FIELD, + invoice.INSTITUTION_ID_FIELD, + invoice.SU_HOURS_FIELD, + invoice.SU_TYPE_FIELD, + invoice.RATE_FIELD, + invoice.COST_FIELD, + invoice.CREDIT_FIELD, + invoice.CREDIT_CODE_FIELD, + invoice.BALANCE_FIELD, + ] + old_pi_filepath: str updated_old_pi_df: pandas.DataFrame diff --git a/process_report/invoices/bu_internal_invoice.py b/process_report/invoices/bu_internal_invoice.py index 525d02e..df11b8b 100644 --- a/process_report/invoices/bu_internal_invoice.py +++ b/process_report/invoices/bu_internal_invoice.py @@ -5,21 +5,23 @@ @dataclass class BUInternalInvoice(invoice.Invoice): + export_columns_list = [ + invoice.INVOICE_DATE_FIELD, + invoice.PI_FIELD, + "Project", + invoice.COST_FIELD, + invoice.CREDIT_FIELD, + invoice.BU_BALANCE_FIELD, + invoice.PI_BALANCE_FIELD, + ] + exported_columns_map = { + invoice.BU_BALANCE_FIELD: "Subsidy", + invoice.PI_BALANCE_FIELD: "Balance", + } + subsidy_amount: int def _prepare_export(self): - self.data = self.data[ - [ - invoice.INVOICE_DATE_FIELD, - invoice.PI_FIELD, - "Project", - invoice.COST_FIELD, - invoice.CREDIT_FIELD, - invoice.BU_BALANCE_FIELD, - invoice.BALANCE_FIELD, - ] - ] - self.data = self._sum_project_allocations(self.data) def _sum_project_allocations(self, dataframe): @@ -28,7 +30,12 @@ def _sum_project_allocations(self, dataframe): each unique project, summing up its allocations' costs""" project_list = dataframe["Project"].unique() data_no_dup = dataframe.drop_duplicates("Project", inplace=False) - sum_fields = [invoice.COST_FIELD, invoice.CREDIT_FIELD, invoice.BALANCE_FIELD] + sum_fields = [ + invoice.COST_FIELD, + invoice.CREDIT_FIELD, + invoice.BU_BALANCE_FIELD, + invoice.PI_BALANCE_FIELD, + ] for project in project_list: project_mask = dataframe["Project"] == project no_dup_project_mask = data_no_dup["Project"] == project diff --git a/process_report/invoices/invoice.py b/process_report/invoices/invoice.py index 8cf200e..85c0c07 100644 --- a/process_report/invoices/invoice.py +++ b/process_report/invoices/invoice.py @@ -23,6 +23,7 @@ INSTITUTION_ID_FIELD = "Institution - Specific Code" SU_HOURS_FIELD = "SU Hours (GBhr or SUhr)" SU_TYPE_FIELD = "SU Type" +RATE_FIELD = "Rate" COST_FIELD = "Cost" CREDIT_FIELD = "Credit" CREDIT_CODE_FIELD = "Credit Code" @@ -38,6 +39,9 @@ @dataclass class Invoice: + export_columns_list = list() + exported_columns_map = dict() + name: str invoice_month: str data: pandas.DataFrame @@ -83,7 +87,14 @@ def _prepare_export(self): that should or should not be exported after processing.""" pass + def _filter_columns(self): + """Filters and renames columns before exporting""" + self.data = self.data[self.export_columns_list].rename( + columns=self.exported_columns_map + ) + def export(self): + self._filter_columns() self.data.to_csv(self.output_path, index=False) def export_s3(self, s3_bucket): diff --git a/process_report/invoices/lenovo_invoice.py b/process_report/invoices/lenovo_invoice.py index ea47d75..c4a6e42 100644 --- a/process_report/invoices/lenovo_invoice.py +++ b/process_report/invoices/lenovo_invoice.py @@ -7,17 +7,17 @@ class LenovoInvoice(invoice.Invoice): LENOVO_SU_TYPES = ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"] + export_columns_list = [ + invoice.INVOICE_DATE_FIELD, + invoice.PROJECT_FIELD, + invoice.INSTITUTION_FIELD, + invoice.SU_HOURS_FIELD, + invoice.SU_TYPE_FIELD, + "Charge", + ] + exported_columns_map = {invoice.SU_HOURS_FIELD: "SU Hours"} + def _prepare_export(self): self.data = self.data[ self.data[invoice.SU_TYPE_FIELD].isin(self.LENOVO_SU_TYPES) - ][ - [ - invoice.INVOICE_DATE_FIELD, - invoice.PROJECT_FIELD, - invoice.INSTITUTION_FIELD, - invoice.SU_HOURS_FIELD, - invoice.SU_TYPE_FIELD, - ] - ].copy() - - self.data.rename(columns={invoice.SU_HOURS_FIELD: "SU Hours"}, inplace=True) + ] diff --git a/process_report/invoices/nonbillable_invoice.py b/process_report/invoices/nonbillable_invoice.py index 701d308..508745a 100644 --- a/process_report/invoices/nonbillable_invoice.py +++ b/process_report/invoices/nonbillable_invoice.py @@ -8,6 +8,21 @@ class NonbillableInvoice(invoice.Invoice): nonbillable_pis: list[str] nonbillable_projects: list[str] + export_columns_list = [ + invoice.INVOICE_DATE_FIELD, + invoice.PROJECT_FIELD, + invoice.PROJECT_ID_FIELD, + invoice.PI_FIELD, + invoice.INVOICE_EMAIL_FIELD, + invoice.INVOICE_ADDRESS_FIELD, + invoice.INSTITUTION_FIELD, + invoice.INSTITUTION_ID_FIELD, + invoice.SU_HOURS_FIELD, + invoice.SU_TYPE_FIELD, + invoice.RATE_FIELD, + invoice.COST_FIELD, + ] + def _prepare_export(self): self.data = self.data[ self.data[invoice.PI_FIELD].isin(self.nonbillable_pis) diff --git a/process_report/process_report.py b/process_report/process_report.py index bba3a22..be3b1bd 100644 --- a/process_report/process_report.py +++ b/process_report/process_report.py @@ -232,7 +232,7 @@ def main(): nonbillable_inv = nonbillable_invoice.NonbillableInvoice( name=args.nonbillable_file, invoice_month=invoice_month, - data=add_institute_proc.data, + data=lenovo_proc.data, nonbillable_pis=pi, nonbillable_projects=projects, ) @@ -240,7 +240,7 @@ def main(): ### Perform main processing remove_nonbillables_proc = remove_nonbillables_processor.RemoveNonbillables( - "", invoice_month, add_institute_proc.data, pi, projects + "", invoice_month, lenovo_proc.data, pi, projects ) remove_nonbillables_proc.process() @@ -272,19 +272,15 @@ def main(): billable_inv = billable_invoice.BillableInvoice( name=args.output_file, invoice_month=invoice_month, - data=new_pi_credit_proc.data, + data=bu_subsidy_proc.data, old_pi_filepath=old_pi_file, updated_old_pi_df=new_pi_credit_proc.updated_old_pi_df, ) - process_and_export_invoices( - [lenovo_inv, nonbillable_inv, billable_inv], args.upload_to_s3 - ) - nerc_total_inv = NERC_total_invoice.NERCTotalInvoice( name=args.NERC_total_invoice_file, invoice_month=invoice_month, - data=new_pi_credit_proc.data.copy(), + data=bu_subsidy_proc.data.copy(), ) bu_internal_inv = bu_internal_invoice.BUInternalInvoice( @@ -294,7 +290,10 @@ def main(): subsidy_amount=args.BU_subsidy_amount, ) - process_and_export_invoices([nerc_total_inv, bu_internal_inv], args.upload_to_s3) + process_and_export_invoices( + [lenovo_inv, nonbillable_inv, billable_inv, nerc_total_inv, bu_internal_inv], + args.upload_to_s3, + ) export_pi_billables(billable_inv.data.copy(), args.output_folder, invoice_month) diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py index 229cc98..d5b879f 100644 --- a/process_report/tests/unit_tests.py +++ b/process_report/tests/unit_tests.py @@ -907,3 +907,15 @@ def test_remove_prefix(self, mock_get_time, mock_get_bucket): process_report.upload_to_s3(filenames, invoice_month) for i, call_args in enumerate(mock_bucket.upload_file.call_args_list): self.assertTrue(answers[i] in call_args) + + +class TestBaseInvoice(TestCase): + def test_filter_exported_columns(self): + test_invoice = pandas.DataFrame(columns=["C1", "C2", "C3", "C4", "C5"]) + answer_invoice = pandas.DataFrame(columns=["C1", "C3R", "C5R"]) + inv = test_utils.new_base_invoice(data=test_invoice) + inv.export_columns_list = ["C1", "C3", "C5"] + inv.exported_columns_map = {"C3": "C3R", "C5": "C5R"} + inv._filter_columns() + + self.assertTrue(inv.data.equals(answer_invoice)) diff --git a/process_report/tests/util.py b/process_report/tests/util.py index 6652ef4..8d0a935 100644 --- a/process_report/tests/util.py +++ b/process_report/tests/util.py @@ -9,6 +9,7 @@ new_pi_credit_processor, bu_subsidy_processor, ) +from process_report.invoices import invoice def new_add_institution_processor( @@ -71,3 +72,11 @@ def new_bu_subsidy_processor( return bu_subsidy_processor.BUSubsidyProcessor( name, invoice_month, data, subsidy_amount ) + + +def new_base_invoice( + name="", + invoice_month="0000-00", + data=pandas.DataFrame(), +): + return invoice.Invoice(name, invoice_month, data)