diff --git a/process_report/invoices/billable_invoice.py b/process_report/invoices/billable_invoice.py
new file mode 100644
index 0000000..58e65ac
--- /dev/null
+++ b/process_report/invoices/billable_invoice.py
@@ -0,0 +1,211 @@
+from dataclasses import dataclass
+from decimal import Decimal
+import logging
+import sys
+
+import pandas
+import pyarrow
+
+import process_report.invoices.invoice as invoice
+import process_report.util as util
+
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+
+@dataclass
+class BillableInvoice(invoice.Invoice):
+    """Invoice of billable projects with the New PI credit (code 0002) applied."""
+
+    nonbillable_pis: list[str]
+    nonbillable_projects: list[str]
+    old_pi_filepath: str
+
+    def _prepare(self):
+        """Drop nonbillable rows, then rows whose PI field is empty."""
+        self.data = self._remove_nonbillables(
+            self.data, self.nonbillable_pis, self.nonbillable_projects
+        )
+        self.data = self._validate_pi_names(self.data)
+
+    def _process(self):
+        """Apply the New PI credit and write the updated PI records back."""
+        old_pi_df = self._load_old_pis(self.old_pi_filepath)
+        self.data, updated_old_pi_df = self._apply_credits_new_pi(self.data, old_pi_df)
+        self._dump_old_pis(self.old_pi_filepath, updated_old_pi_df)
+
+    def _remove_nonbillables(
+        self,
+        data: pandas.DataFrame,
+        nonbillable_pis: list[str],
+        nonbillable_projects: list[str],
+    ):
+        """Return the rows whose PI and project are both billable."""
+        return data[
+            ~data[invoice.PI_FIELD].isin(nonbillable_pis)
+            & ~data[invoice.PROJECT_FIELD].isin(nonbillable_projects)
+        ]
+
+    def _validate_pi_names(self, data: pandas.DataFrame):
+        """Warn about each project with an empty PI field and drop those rows."""
+        invalid_pi_projects = data[pandas.isna(data[invoice.PI_FIELD])]
+        for _, row in invalid_pi_projects.iterrows():
+            # Logger.warn() is a deprecated alias of Logger.warning()
+            logger.warning(
+                f"Billable project {row[invoice.PROJECT_FIELD]} has empty PI field"
+            )
+        return data[~pandas.isna(data[invoice.PI_FIELD])]
+
+    def _load_old_pis(self, old_pi_filepath) -> pandas.DataFrame:
+        """Read the old PI CSV file, parsing the credit columns as decimals.
+
+        Exits the program if the file does not exist.
+        """
+        try:
+            old_pi_df = pandas.read_csv(
+                old_pi_filepath,
+                dtype={
+                    invoice.PI_INITIAL_CREDITS: pandas.ArrowDtype(
+                        pyarrow.decimal128(21, 2)
+                    ),
+                    invoice.PI_1ST_USED: pandas.ArrowDtype(pyarrow.decimal128(21, 2)),
+                    invoice.PI_2ND_USED: pandas.ArrowDtype(pyarrow.decimal128(21, 2)),
+                },
+            )
+        except FileNotFoundError:
+            sys.exit("Applying credit 0002 failed. Old PI file does not exist")
+
+        return old_pi_df
+
+    def _apply_credits_new_pi(
+        self, data: pandas.DataFrame, old_pi_df: pandas.DataFrame
+    ):
+        """Apply the New PI credit to `data` and record its usage in `old_pi_df`.
+
+        PIs in their first invoice month draw on the full credit, PIs in their
+        second month on what is left of it, and older PIs get none. Rows with
+        an excluded SU type never receive credit. The credit, credit code and
+        balance columns are set on `data` in place.
+
+        Returns the tuple `(data, updated old_pi_df)`.
+        """
+        new_pi_credit_code = "0002"
+        INITIAL_CREDIT_AMOUNT = 1000
+        EXCLUDE_SU_TYPES = ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"]
+
+        data[invoice.CREDIT_FIELD] = None
+        data[invoice.CREDIT_CODE_FIELD] = None
+        data[invoice.BALANCE_FIELD] = Decimal(0)
+
+        current_pi_set = set(data[invoice.PI_FIELD])
+        invoice_month = data[invoice.INVOICE_DATE_FIELD].iat[0]
+        invoice_pis = old_pi_df[old_pi_df[invoice.PI_FIRST_MONTH] == invoice_month]
+        # Reuse the amount already recorded for this month, else the default
+        if invoice_pis[invoice.PI_INITIAL_CREDITS].empty or pandas.isna(
+            new_pi_credit_amount := invoice_pis[invoice.PI_INITIAL_CREDITS].iat[0]
+        ):
+            new_pi_credit_amount = INITIAL_CREDIT_AMOUNT
+
+        print(f"New PI Credit set at {new_pi_credit_amount} for {invoice_month}")
+
+        for pi in current_pi_set:
+            pi_projects = data[data[invoice.PI_FIELD] == pi]
+            pi_age = self._get_pi_age(old_pi_df, pi, invoice_month)
+            pi_old_pi_entry = old_pi_df.loc[
+                old_pi_df[invoice.PI_PI_FIELD] == pi
+            ].squeeze()
+
+            if pi_age > 1:
+                for i, row in pi_projects.iterrows():
+                    data.at[i, invoice.BALANCE_FIELD] = row[invoice.COST_FIELD]
+            else:
+                if pi_age == 0:
+                    if len(pi_old_pi_entry) == 0:
+                        pi_entry = [pi, invoice_month, new_pi_credit_amount, 0, 0]
+                        old_pi_df = pandas.concat(
+                            [
+                                pandas.DataFrame([pi_entry], columns=old_pi_df.columns),
+                                old_pi_df,
+                            ],
+                            ignore_index=True,
+                        )
+                        pi_old_pi_entry = old_pi_df.loc[
+                            old_pi_df[invoice.PI_PI_FIELD] == pi
+                        ].squeeze()
+
+                    remaining_credit = new_pi_credit_amount
+                    credit_used_field = invoice.PI_1ST_USED
+                elif pi_age == 1:
+                    remaining_credit = (
+                        pi_old_pi_entry[invoice.PI_INITIAL_CREDITS]
+                        - pi_old_pi_entry[invoice.PI_1ST_USED]
+                    )
+                    credit_used_field = invoice.PI_2ND_USED
+
+                initial_credit = remaining_credit
+                for i, row in pi_projects.iterrows():
+                    if (
+                        remaining_credit == 0
+                        or row[invoice.SU_TYPE_FIELD] in EXCLUDE_SU_TYPES
+                    ):
+                        data.at[i, invoice.BALANCE_FIELD] = row[invoice.COST_FIELD]
+                    else:
+                        project_cost = row[invoice.COST_FIELD]
+                        applied_credit = min(project_cost, remaining_credit)
+
+                        data.at[i, invoice.CREDIT_FIELD] = applied_credit
+                        data.at[i, invoice.CREDIT_CODE_FIELD] = new_pi_credit_code
+                        data.at[i, invoice.BALANCE_FIELD] = (
+                            row[invoice.COST_FIELD] - applied_credit
+                        )
+                        remaining_credit -= applied_credit
+
+                credits_used = initial_credit - remaining_credit
+                if (pi_old_pi_entry[credit_used_field] != 0) and (
+                    credits_used != pi_old_pi_entry[credit_used_field]
+                ):
+                    print(
+                        f"Warning: PI file overwritten. PI {pi} previously used ${pi_old_pi_entry[credit_used_field]} of New PI credits, now uses ${credits_used}"
+                    )
+                old_pi_df.loc[
+                    old_pi_df[invoice.PI_PI_FIELD] == pi, credit_used_field
+                ] = credits_used
+
+        old_pi_df = old_pi_df.astype(
+            {
+                invoice.PI_INITIAL_CREDITS: pandas.ArrowDtype(
+                    pyarrow.decimal128(21, 2)
+                ),
+                invoice.PI_1ST_USED: pandas.ArrowDtype(pyarrow.decimal128(21, 2)),
+                invoice.PI_2ND_USED: pandas.ArrowDtype(pyarrow.decimal128(21, 2)),
+            },
+        )
+
+        return (data, old_pi_df)
+
+    def _dump_old_pis(self, old_pi_filepath, old_pi_df: pandas.DataFrame):
+        """Write the PI records to the old PI CSV file without the index."""
+        old_pi_df.to_csv(old_pi_filepath, index=False)
+
+    def _get_pi_age(self, old_pi_df: pandas.DataFrame, pi, invoice_month):
+        """Return the number of months since the PI's first invoice month.
+
+        I.e. 0 for new PIs, including PIs that have no entry in `old_pi_df`.
+
+        Exits the program if the PI's age is negative, which suggests a faulty
+        invoice or a program bug.
+        """
+        first_invoice_month = old_pi_df.loc[
+            old_pi_df[invoice.PI_PI_FIELD] == pi, invoice.PI_FIRST_MONTH
+        ]
+        if first_invoice_month.empty:
+            return 0
+
+        # Use the scalar month, not the Series, so the exit message stays readable
+        first_month = first_invoice_month.iat[0]
+        month_diff = util.get_month_diff(invoice_month, first_month)
+        if month_diff < 0:
+            sys.exit(f"PI {pi} from {first_month} found in {invoice_month} invoice!")
+
+        return month_diff
diff --git a/process_report/invoices/invoice.py b/process_report/invoices/invoice.py index 446a6f6..23c2371 100644 --- a/process_report/invoices/invoice.py +++ b/process_report/invoices/invoice.py @@ -4,6 +4,14 @@ import process_report.util as util +### PI file field names +PI_PI_FIELD = "PI" +PI_FIRST_MONTH = "First Invoice Month" +PI_INITIAL_CREDITS = "Initial Credits" +PI_1ST_USED = "1st Month Used" +PI_2ND_USED = "2nd Month Used" +### + ### Invoice field names INVOICE_DATE_FIELD = "Invoice Month" PROJECT_FIELD = "Project - Allocation" diff --git a/process_report/process_report.py b/process_report/process_report.py index 58ef2a1..ca41632 100644 --- a/process_report/process_report.py +++ b/process_report/process_report.py @@ -9,7 +9,11 @@ import boto3 import pyarrow -from process_report.invoices import lenovo_invoice, nonbillable_invoice +from process_report.invoices import ( + lenovo_invoice, + nonbillable_invoice, + billable_invoice, +) ### PI file field names @@ -62,33 +66,6 @@ def load_institute_map() -> dict: return institute_map -def load_old_pis(old_pi_file) -> pandas.DataFrame: - try: - old_pi_df = pandas.read_csv( - old_pi_file, - dtype={ - PI_INITIAL_CREDITS: pandas.ArrowDtype(pyarrow.decimal128(21, 2)), - PI_1ST_USED: pandas.ArrowDtype(pyarrow.decimal128(21, 2)), - PI_2ND_USED: pandas.ArrowDtype(pyarrow.decimal128(21, 2)), - }, - ) - except FileNotFoundError: - sys.exit("Applying credit 0002 failed. 
Old PI file does not exist") - - return old_pi_df - - -def dump_old_pis(old_pi_file, old_pi_df: pandas.DataFrame): - old_pi_df = old_pi_df.astype( - { - PI_INITIAL_CREDITS: pandas.ArrowDtype(pyarrow.decimal128(21, 2)), - PI_1ST_USED: pandas.ArrowDtype(pyarrow.decimal128(21, 2)), - PI_2ND_USED: pandas.ArrowDtype(pyarrow.decimal128(21, 2)), - }, - ) - old_pi_df.to_csv(old_pi_file, index=False) - - def load_alias(alias_file): alias_dict = dict() @@ -104,31 +81,6 @@ def load_alias(alias_file): return alias_dict -def get_pi_age(old_pi_df: pandas.DataFrame, pi, invoice_month): - """Returns time difference between current invoice month and PI's first invoice month - I.e 0 for new PIs - - Will raise an error if the PI'a age is negative, which suggests a faulty invoice, or a program bug""" - first_invoice_month = old_pi_df.loc[old_pi_df[PI_PI_FIELD] == pi, PI_FIRST_MONTH] - if first_invoice_month.empty: - return 0 - - month_diff = get_month_diff(invoice_month, first_invoice_month.iat[0]) - if month_diff < 0: - sys.exit( - f"PI {pi} from {first_invoice_month} found in {invoice_month} invoice!" 
- ) - else: - return month_diff - - -def get_month_diff(month_1, month_2): - """Returns a positive integer if month_1 is ahead in time of month_2""" - dt1 = datetime.datetime.strptime(month_1, "%Y-%m") - dt2 = datetime.datetime.strptime(month_2, "%Y-%m") - return (dt1.year - dt2.year) * 12 + (dt1.month - dt2.month) - - def get_invoice_bucket(): try: s3_resource = boto3.resource( @@ -297,22 +249,29 @@ def main(): bucket = get_invoice_bucket() invoice.export_s3(bucket) - billable_projects = remove_non_billables(merged_dataframe, pi, projects) - billable_projects = validate_pi_names(billable_projects) - if args.upload_to_s3: backup_to_s3_old_pi_file(old_pi_file) - credited_projects = apply_credits_new_pi(billable_projects, old_pi_file) - export_billables(credited_projects, args.output_file) - export_pi_billables(credited_projects, args.output_folder, invoice_month) - export_BU_only(billable_projects, args.BU_invoice_file, args.BU_subsidy_amount) - export_HU_BU(credited_projects, args.HU_BU_invoice_file) + billable_inv = billable_invoice.BillableInvoice( + name=args.nonbillable_file, + invoice_month=invoice_month, + data=merged_dataframe.copy(), + nonbillable_pis=pi, + nonbillable_projects=projects, + old_pi_filepath=old_pi_file, + ) + billable_inv.process() + billable_inv.export() + if args.upload_to_s3: + bucket = get_invoice_bucket() + billable_inv.export_s3(bucket) + + export_pi_billables(billable_inv.data, args.output_folder, invoice_month) + export_BU_only(billable_inv.data, args.BU_invoice_file, args.BU_subsidy_amount) + export_HU_BU(billable_inv.data, args.HU_BU_invoice_file) if args.upload_to_s3: - invoice_list = [ - args.output_file, - ] + invoice_list = list() for pi_invoice in os.listdir(args.output_folder): invoice_list.append(os.path.join(args.output_folder, pi_invoice)) @@ -377,23 +336,6 @@ def timed_projects(timed_projects_file, invoice_date): return dataframe[mask]["Project"].to_list() -def remove_non_billables(dataframe, pi, projects): - """Removes 
projects and PIs that should not be billed from the dataframe""" - filtered_dataframe = dataframe[ - ~dataframe[PI_FIELD].isin(pi) & ~dataframe[PROJECT_FIELD].isin(projects) - ] - return filtered_dataframe - - -def validate_pi_names(dataframe): - invalid_pi_projects = dataframe[pandas.isna(dataframe[PI_FIELD])] - for i, row in invalid_pi_projects.iterrows(): - print(f"Warning: Billable project {row[PROJECT_FIELD]} has empty PI field") - dataframe = dataframe[~pandas.isna(dataframe[PI_FIELD])] - - return dataframe - - def validate_pi_aliases(dataframe: pandas.DataFrame, alias_dict: dict): for pi, pi_aliases in alias_dict.items(): dataframe.loc[dataframe[PI_FIELD].isin(pi_aliases), PI_FIELD] = pi @@ -408,87 +350,6 @@ def fetch_s3_alias_file(): return local_name -def apply_credits_new_pi(dataframe, old_pi_file): - new_pi_credit_code = "0002" - INITIAL_CREDIT_AMOUNT = 1000 - EXCLUDE_SU_TYPES = ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"] - - dataframe[CREDIT_FIELD] = None - dataframe[CREDIT_CODE_FIELD] = None - dataframe[BALANCE_FIELD] = Decimal(0) - - old_pi_df = load_old_pis(old_pi_file) - - current_pi_set = set(dataframe[PI_FIELD]) - invoice_month = dataframe[INVOICE_DATE_FIELD].iat[0] - invoice_pis = old_pi_df[old_pi_df[PI_FIRST_MONTH] == invoice_month] - if invoice_pis[PI_INITIAL_CREDITS].empty or pandas.isna( - new_pi_credit_amount := invoice_pis[PI_INITIAL_CREDITS].iat[0] - ): - new_pi_credit_amount = INITIAL_CREDIT_AMOUNT - - print(f"New PI Credit set at {new_pi_credit_amount} for {invoice_month}") - - for pi in current_pi_set: - pi_projects = dataframe[dataframe[PI_FIELD] == pi] - pi_age = get_pi_age(old_pi_df, pi, invoice_month) - pi_old_pi_entry = old_pi_df.loc[old_pi_df[PI_PI_FIELD] == pi].squeeze() - - if pi_age > 1: - for i, row in pi_projects.iterrows(): - dataframe.at[i, BALANCE_FIELD] = row[COST_FIELD] - else: - if pi_age == 0: - if len(pi_old_pi_entry) == 0: - pi_entry = [pi, invoice_month, new_pi_credit_amount, 0, 0] - old_pi_df = 
pandas.concat( - [ - pandas.DataFrame([pi_entry], columns=old_pi_df.columns), - old_pi_df, - ], - ignore_index=True, - ) - pi_old_pi_entry = old_pi_df.loc[ - old_pi_df[PI_PI_FIELD] == pi - ].squeeze() - - remaining_credit = new_pi_credit_amount - credit_used_field = PI_1ST_USED - elif pi_age == 1: - remaining_credit = ( - pi_old_pi_entry[PI_INITIAL_CREDITS] - pi_old_pi_entry[PI_1ST_USED] - ) - credit_used_field = PI_2ND_USED - - initial_credit = remaining_credit - for i, row in pi_projects.iterrows(): - if remaining_credit == 0 or row[SU_TYPE_FIELD] in EXCLUDE_SU_TYPES: - dataframe.at[i, BALANCE_FIELD] = row[COST_FIELD] - else: - project_cost = row[COST_FIELD] - applied_credit = min(project_cost, remaining_credit) - - dataframe.at[i, CREDIT_FIELD] = applied_credit - dataframe.at[i, CREDIT_CODE_FIELD] = new_pi_credit_code - dataframe.at[i, BALANCE_FIELD] = row[COST_FIELD] - applied_credit - remaining_credit -= applied_credit - - credits_used = initial_credit - remaining_credit - if (pi_old_pi_entry[credit_used_field] != 0) and ( - credits_used != pi_old_pi_entry[credit_used_field] - ): - print( - f"Warning: PI file overwritten. 
PI {pi} previously used ${pi_old_pi_entry[credit_used_field]} of New PI credits, now uses ${credits_used}" - ) - old_pi_df.loc[ - old_pi_df[PI_PI_FIELD] == pi, credit_used_field - ] = credits_used - - dump_old_pis(old_pi_file, old_pi_df) - - return dataframe - - def fetch_s3_old_pi_file(): local_name = "PI.csv" invoice_bucket = get_invoice_bucket() diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py index cae5730..2cfc60c 100644 --- a/process_report/tests/unit_tests.py +++ b/process_report/tests/unit_tests.py @@ -3,11 +3,13 @@ import pandas import pyarrow import os +import uuid import math from textwrap import dedent -from process_report import process_report +from process_report import process_report, util from process_report.invoices import lenovo_invoice, nonbillable_invoice +from process_report.tests import util as test_utils class TestGetInvoiceDate(TestCase): @@ -85,30 +87,6 @@ def tearDown(self): os.remove(self.output_file.name) os.remove(self.output_file2.name) - def test_remove_non_billables(self): - billables_df = process_report.remove_non_billables( - self.dataframe, self.pi_to_exclude, self.projects_to_exclude - ) - process_report.export_billables(billables_df, self.output_file.name) - - result_df = pandas.read_csv(self.output_file.name) - - self.assertNotIn("PI2", result_df["Manager (PI)"].tolist()) - self.assertNotIn("PI3", result_df["Manager (PI)"].tolist()) - self.assertNotIn( - "PI4", result_df["Manager (PI)"].tolist() - ) # indirect because ProjectD was removed - self.assertNotIn("ProjectB", result_df["Project - Allocation"].tolist()) - self.assertNotIn( - "ProjectC", result_df["Project - Allocation"].tolist() - ) # indirect because PI3 was removed - self.assertNotIn("ProjectD", result_df["Project - Allocation"].tolist()) - - self.assertIn("PI1", result_df["Manager (PI)"].tolist()) - self.assertIn("PI5", result_df["Manager (PI)"].tolist()) - self.assertIn("ProjectA", result_df["Project - Allocation"].tolist()) - 
self.assertIn("ProjectE", result_df["Project - Allocation"].tolist()) - def test_remove_billables(self): self.nonbillable_invoice.process() result_df = self.nonbillable_invoice.data @@ -126,6 +104,26 @@ def test_remove_billables(self): self.assertNotIn("ProjectE", result_df["Project - Allocation"].tolist()) +class TestBillableInvoice(TestCase): + def test_remove_nonbillables(self): + pis = [uuid.uuid4().hex for x in range(10)] + projects = [uuid.uuid4().hex for x in range(10)] + nonbillable_pis = pis[:3] + nonbillable_projects = projects[7:] + billable_pis = pis[3:7] + data = pandas.DataFrame({"Manager (PI)": pis, "Project - Allocation": projects}) + + test_invoice = test_utils.new_billable_invoice() + data = test_invoice._remove_nonbillables( + data, nonbillable_pis, nonbillable_projects + ) + self.assertTrue(data[data["Manager (PI)"].isin(nonbillable_pis)].empty) + self.assertTrue( + data[data["Project - Allocation"].isin(nonbillable_projects)].empty + ) + self.assertTrue(data.equals(data[data["Manager (PI)"].isin(billable_pis)])) + + class TestMergeCSV(TestCase): def setUp(self): self.header = ["ID", "Name", "Age"] @@ -287,9 +285,9 @@ def test_get_month_diff(self): (("2024-12", "2025-03"), -3), ] for arglist, answer in testcases: - self.assertEqual(process_report.get_month_diff(*arglist), answer) + self.assertEqual(util.get_month_diff(*arglist), answer) with self.assertRaises(ValueError): - process_report.get_month_diff("2024-16", "2025-03") + util.get_month_diff("2024-16", "2025-03") class TestCredit0002(TestCase): @@ -553,26 +551,21 @@ def tearDown(self): os.remove(self.old_pi_no_gpu_file) def test_apply_credit_0002(self): - dataframe = process_report.apply_credits_new_pi( - self.dataframe, self.old_pi_file + test_invoice = test_utils.new_billable_invoice() + old_pi_df = test_invoice._load_old_pis(self.old_pi_file) + dataframe, updated_old_pi_df = test_invoice._apply_credits_new_pi( + self.dataframe, old_pi_df ) dataframe = dataframe.astype({"Credit": 
"float64", "Balance": "int64"}) + updated_old_pi_df = updated_old_pi_df.sort_values(by="PI", ignore_index=True) self.assertTrue(self.answer_dataframe.equals(dataframe)) - - old_pi_df_output = pandas.read_csv( - self.old_pi_file, - dtype={ - "Initial Credits": pandas.ArrowDtype(pyarrow.decimal128(21, 2)), - "1st Month Used": pandas.ArrowDtype(pyarrow.decimal128(21, 2)), - "2nd Month Used": pandas.ArrowDtype(pyarrow.decimal128(21, 2)), - }, - ).sort_values(by=["PI"], ignore_index=True) - - self.assertTrue(old_pi_df_output.equals(self.old_pi_df_answer)) + self.assertTrue(self.old_pi_df_answer.equals(updated_old_pi_df)) def test_no_gpu(self): - dataframe = process_report.apply_credits_new_pi( - self.dataframe_no_gpu, self.old_pi_no_gpu_file + test_invoice = test_utils.new_billable_invoice() + old_pi_df = test_invoice._load_old_pis(self.old_pi_no_gpu_file) + dataframe, _ = test_invoice._apply_credits_new_pi( + self.dataframe_no_gpu, old_pi_df ) dataframe = dataframe.astype({"Credit": "float64", "Balance": "float64"}) self.assertTrue(self.no_gpu_df_answer.equals(dataframe)) @@ -582,8 +575,9 @@ def test_apply_credit_error(self): {"PI": ["PI1"], "First Invoice Month": ["2024-04"]} ) invoice_month = "2024-03" + test_invoice = test_utils.new_billable_invoice() with self.assertRaises(SystemExit): - process_report.get_pi_age(old_pi_df, "PI1", invoice_month) + test_invoice._get_pi_age(old_pi_df, "PI1", invoice_month) class TestBUSubsidy(TestCase): @@ -716,7 +710,8 @@ def test_validate_billables(self): self.assertEqual( 1, len(self.dataframe[pandas.isna(self.dataframe["Manager (PI)"])]) ) - validated_df = process_report.validate_pi_names(self.dataframe) + test_invoice = test_utils.new_billable_invoice() + validated_df = test_invoice._validate_pi_names(self.dataframe) self.assertEqual( 0, len(validated_df[pandas.isna(validated_df["Manager (PI)"])]) ) diff --git a/process_report/tests/util.py b/process_report/tests/util.py new file mode 100644 index 0000000..2a04251 --- /dev/null 
+++ b/process_report/tests/util.py
@@ -0,0 +1,29 @@
+import pandas
+
+from process_report.invoices import billable_invoice
+
+
+def new_billable_invoice(
+    name="",
+    invoice_month="0000-00",
+    data=None,
+    nonbillable_pis=None,
+    nonbillable_projects=None,
+    old_pi_filepath="",
+):
+    """Build a BillableInvoice for tests, with fresh empty defaults per call."""
+    # Mutable defaults would be shared between every invoice built here
+    if data is None:
+        data = pandas.DataFrame()
+    if nonbillable_pis is None:
+        nonbillable_pis = []
+    if nonbillable_projects is None:
+        nonbillable_projects = []
+    return billable_invoice.BillableInvoice(
+        name,
+        invoice_month,
+        data,
+        nonbillable_pis,
+        nonbillable_projects,
+        old_pi_filepath,
+    )
diff --git a/process_report/util.py b/process_report/util.py
index 0853ed0..e6d2f21 100644
--- a/process_report/util.py
+++ b/process_report/util.py
@@ -33,3 +33,15 @@ def compare_invoice_month(month_1, month_2):
     dt1 = datetime.datetime.strptime(month_1, "%Y-%m")
     dt2 = datetime.datetime.strptime(month_2, "%Y-%m")
     return dt1 > dt2
+
+
+def get_month_diff(month_1, month_2):
+    """Return the number of months from month_2 to month_1.
+
+    Both arguments are "YYYY-MM" strings. The result is positive if month_1
+    is ahead in time of month_2, zero if they are equal, negative otherwise.
+    Raises ValueError if either string does not parse as "%Y-%m".
+    """
+    dt1 = datetime.datetime.strptime(month_1, "%Y-%m")
+    dt2 = datetime.datetime.strptime(month_2, "%Y-%m")
+    return (dt1.year - dt2.year) * 12 + (dt1.month - dt2.month)