From c09fd989e993174f0fe9339b1a4df025f9a6c915 Mon Sep 17 00:00:00 2001 From: Quan Pham Date: Tue, 30 Jul 2024 09:59:40 -0400 Subject: [PATCH 1/2] Implemented BU-Internal invoice as an Invoice class and copied over code into `bu_internal_invoice.py` This is the prerequisite step before more detailed refactoring of the BU-Internal invoice --- .../invoices/bu_internal_invoice.py | 68 +++++++++++++++++ process_report/process_report.py | 74 ++++--------------- process_report/tests/unit_tests.py | 9 ++- process_report/tests/util.py | 10 ++- 4 files changed, 96 insertions(+), 65 deletions(-) create mode 100644 process_report/invoices/bu_internal_invoice.py diff --git a/process_report/invoices/bu_internal_invoice.py b/process_report/invoices/bu_internal_invoice.py new file mode 100644 index 0000000..c4b772e --- /dev/null +++ b/process_report/invoices/bu_internal_invoice.py @@ -0,0 +1,68 @@ +from dataclasses import dataclass +from decimal import Decimal + +import process_report.invoices.invoice as invoice + + +@dataclass +class BUInternalInvoice(invoice.Invoice): + subsidy_amount: int + + def _prepare(self): + def get_project(row): + project_alloc = row[invoice.PROJECT_FIELD] + if project_alloc.rfind("-") == -1: + return project_alloc + else: + return project_alloc[: project_alloc.rfind("-")] + + self.data = self.data[ + self.data[invoice.INSTITUTION_FIELD] == "Boston University" + ].copy() + self.data["Project"] = self.data.apply(get_project, axis=1) + self.data[invoice.SUBSIDY_FIELD] = Decimal(0) + self.data = self.data[ + [ + invoice.INVOICE_DATE_FIELD, + invoice.PI_FIELD, + "Project", + invoice.COST_FIELD, + invoice.CREDIT_FIELD, + invoice.SUBSIDY_FIELD, + invoice.BALANCE_FIELD, + ] + ] + + def _process(self): + project_list = self.data["Project"].unique() + data_no_dup = self.data.drop_duplicates("Project", inplace=False) + sum_fields = [invoice.COST_FIELD, invoice.CREDIT_FIELD, invoice.BALANCE_FIELD] + for project in project_list: + project_mask = self.data["Project"] == project + no_dup_project_mask = data_no_dup["Project"] == project + + sum_fields_sums = self.data[project_mask][sum_fields].sum().values + data_no_dup.loc[no_dup_project_mask, sum_fields] = sum_fields_sums + + self.data = self._apply_subsidy(data_no_dup, self.subsidy_amount) + + def _apply_subsidy(self, dataframe, subsidy_amount): + pi_list = dataframe[invoice.PI_FIELD].unique() + + for pi in pi_list: + pi_projects = dataframe[dataframe[invoice.PI_FIELD] == pi] + remaining_subsidy = subsidy_amount + for i, row in pi_projects.iterrows(): + project_remaining_cost = row[invoice.BALANCE_FIELD] + applied_subsidy = min(project_remaining_cost, remaining_subsidy) + + dataframe.at[i, invoice.SUBSIDY_FIELD] = applied_subsidy + dataframe.at[i, invoice.BALANCE_FIELD] = ( + row[invoice.BALANCE_FIELD] - applied_subsidy + ) + remaining_subsidy -= applied_subsidy + + if remaining_subsidy == 0: + break + + return dataframe diff --git a/process_report/process_report.py b/process_report/process_report.py index 8797b53..7d62083 100644 --- a/process_report/process_report.py +++ b/process_report/process_report.py @@ -3,7 +3,6 @@ import sys import datetime import functools -from decimal import Decimal import json import pandas @@ -15,6 +14,7 @@ nonbillable_invoice, billable_invoice, NERC_total_invoice, + bu_internal_invoice, ) @@ -169,7 +169,7 @@ def main(): parser.add_argument( "--BU-invoice-file", required=False, - default="BU_Internal.csv", + default="BU_Internal", help="Name of output csv for BU invoices", ) parser.add_argument( @@ -283,8 +283,19 @@ def main(): bucket = get_invoice_bucket() invoice.export_s3(bucket) + bu_internal_inv = bu_internal_invoice.BUInternalInvoice( + name=args.BU_invoice_file, + invoice_month=invoice_month, + data=billable_inv.data, + subsidy_amount=args.BU_subsidy_amount, + ) + bu_internal_inv.process() + bu_internal_inv.export() + if args.upload_to_s3: + bucket = get_invoice_bucket() + bu_internal_inv.export_s3(bucket) + export_pi_billables(billable_inv.data, args.output_folder, invoice_month) - export_BU_only(billable_inv.data, args.BU_invoice_file, args.BU_subsidy_amount) if args.upload_to_s3: invoice_list = list() @@ -426,63 +437,6 @@ def export_pi_billables(dataframe: pandas.DataFrame, output_folder, invoice_mont ) -def export_BU_only(dataframe: pandas.DataFrame, output_file, subsidy_amount): - def get_project(row): - project_alloc = row[PROJECT_FIELD] - if project_alloc.rfind("-") == -1: - return project_alloc - else: - return project_alloc[: project_alloc.rfind("-")] - - BU_projects = dataframe[dataframe[INSTITUTION_FIELD] == "Boston University"].copy() - BU_projects["Project"] = BU_projects.apply(get_project, axis=1) - BU_projects[SUBSIDY_FIELD] = Decimal(0) - BU_projects = BU_projects[ - [ - INVOICE_DATE_FIELD, - PI_FIELD, - "Project", - COST_FIELD, - CREDIT_FIELD, - SUBSIDY_FIELD, - BALANCE_FIELD, - ] - ] - - project_list = BU_projects["Project"].unique() - BU_projects_no_dup = BU_projects.drop_duplicates("Project", inplace=False) - sum_fields = [COST_FIELD, CREDIT_FIELD, BALANCE_FIELD] - for project in project_list: - project_mask = BU_projects["Project"] == project - no_dup_project_mask = BU_projects_no_dup["Project"] == project - - sum_fields_sums = BU_projects[project_mask][sum_fields].sum().values - BU_projects_no_dup.loc[no_dup_project_mask, sum_fields] = sum_fields_sums - - BU_projects_no_dup = _apply_subsidy(BU_projects_no_dup, subsidy_amount) - BU_projects_no_dup.to_csv(output_file, index=False) - - -def _apply_subsidy(dataframe, subsidy_amount): - pi_list = dataframe[PI_FIELD].unique() - - for pi in pi_list: - pi_projects = dataframe[dataframe[PI_FIELD] == pi] - remaining_subsidy = subsidy_amount - for i, row in pi_projects.iterrows(): - project_remaining_cost = row[BALANCE_FIELD] - applied_subsidy = min(project_remaining_cost, remaining_subsidy) - - dataframe.at[i, SUBSIDY_FIELD] = applied_subsidy - dataframe.at[i, BALANCE_FIELD] = row[BALANCE_FIELD] - applied_subsidy - remaining_subsidy -= applied_subsidy - - if remaining_subsidy == 0: - break - - return dataframe - - def upload_to_s3(invoice_list: list, invoice_month): invoice_bucket = get_invoice_bucket() for invoice_filename in invoice_list: diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py index 117412d..3713c84 100644 --- a/process_report/tests/unit_tests.py +++ b/process_report/tests/unit_tests.py @@ -648,13 +648,14 @@ def setUp(self): ], # Test case where subsidy does/doesn't cover fully balance } self.dataframe = pandas.DataFrame(data) - output_file = tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=".csv") - self.output_file = output_file.name self.subsidy = 100 def test_apply_BU_subsidy(self): - process_report.export_BU_only(self.dataframe, self.output_file, self.subsidy) - output_df = pandas.read_csv(self.output_file) + test_invoice = test_utils.new_bu_internal_invoice( + data=self.dataframe, subsidy_amount=self.subsidy + ) + test_invoice.process() + output_df = test_invoice.data.reset_index() self.assertTrue( set( diff --git a/process_report/tests/util.py b/process_report/tests/util.py index 2a04251..0d3d3d6 100644 --- a/process_report/tests/util.py +++ b/process_report/tests/util.py @@ -1,6 +1,6 @@ import pandas -from process_report.invoices import billable_invoice +from process_report.invoices import billable_invoice, bu_internal_invoice def new_billable_invoice( @@ -19,3 +19,11 @@ def new_billable_invoice( nonbillable_projects, old_pi_filepath, ) + + +def new_bu_internal_invoice( + name="", invoice_month="0000-00", data=pandas.DataFrame(), subsidy_amount=0 +): + return bu_internal_invoice.BUInternalInvoice( + name, invoice_month, data, subsidy_amount + ) From eb2d471a6e03ac9c1e57e26974ad22181aee0d52 Mon Sep 17 00:00:00 2001 From: Quan Pham Date: Tue, 30 Jul 2024 10:23:18 -0400 Subject: [PATCH 2/2] Implemented discount_invoice to refactor BU-Internal and billable invoice The BU-Internal and billable invoice now subclasses from `discount_invoice`, an invoice class which implements a function to apply a flat discount on a PI's projects. This reduces some code redundancy since the New-PI credit and the BU subsidy share some similar logic. Additional smaller changes is done to improve code readability. --- process_report/invoices/billable_invoice.py | 43 +++++----- .../invoices/bu_internal_invoice.py | 40 +++++----- process_report/invoices/discount_invoice.py | 79 +++++++++++++++++++ process_report/process_report.py | 51 +++--------- process_report/tests/unit_tests.py | 5 +- process_report/util.py | 29 +++++++ 6 files changed, 159 insertions(+), 88 deletions(-) create mode 100644 process_report/invoices/discount_invoice.py diff --git a/process_report/invoices/billable_invoice.py b/process_report/invoices/billable_invoice.py index 7cf319b..0cf7d3e 100644 --- a/process_report/invoices/billable_invoice.py +++ b/process_report/invoices/billable_invoice.py @@ -1,13 +1,12 @@ from dataclasses import dataclass -from decimal import Decimal import logging import sys import pandas import pyarrow -import process_report.invoices.invoice as invoice -import process_report.util as util +from process_report.invoices import invoice, discount_invoice +from process_report import util logger = logging.getLogger(__name__) @@ -15,7 +14,7 @@ @dataclass -class BillableInvoice(invoice.Invoice): +class BillableInvoice(discount_invoice.DiscountInvoice): NEW_PI_CREDIT_CODE = "0002" INITIAL_CREDIT_AMOUNT = 1000 EXCLUDE_SU_TYPES = ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"] @@ -89,7 +88,7 @@ def _prepare(self): self.data = self._validate_pi_names(self.data) self.data[invoice.CREDIT_FIELD] = None self.data[invoice.CREDIT_CODE_FIELD] = None - self.data[invoice.BALANCE_FIELD] = Decimal(0) + self.data[invoice.BALANCE_FIELD] = self.data[invoice.COST_FIELD] self.old_pi_df = self._load_old_pis(self.old_pi_filepath) def _process(self): @@ -143,14 +142,17 @@ def get_initial_credit_amount( current_pi_set = set(data[invoice.PI_FIELD]) for pi in current_pi_set: - pi_projects = data[data[invoice.PI_FIELD] == pi] + credit_eligible_projects = data[ + (data[invoice.PI_FIELD] == pi) + & ~(data[invoice.SU_TYPE_FIELD].isin(self.EXCLUDE_SU_TYPES)) + ] pi_age = self._get_pi_age(old_pi_df, pi, self.invoice_month) pi_old_pi_entry = old_pi_df.loc[ old_pi_df[invoice.PI_PI_FIELD] == pi ].squeeze() if pi_age > 1: - for i, row in pi_projects.iterrows(): + for i, row in credit_eligible_projects.iterrows(): data.at[i, invoice.BALANCE_FIELD] = row[invoice.COST_FIELD] else: if pi_age == 0: @@ -176,25 +178,16 @@ def get_initial_credit_amount( ) credit_used_field = invoice.PI_2ND_USED - initial_credit = remaining_credit - for i, row in pi_projects.iterrows(): - if ( - remaining_credit == 0 - or row[invoice.SU_TYPE_FIELD] in self.EXCLUDE_SU_TYPES - ): - data.at[i, invoice.BALANCE_FIELD] = row[invoice.COST_FIELD] - else: - project_cost = row[invoice.COST_FIELD] - applied_credit = min(project_cost, remaining_credit) - - data.at[i, invoice.CREDIT_FIELD] = applied_credit - data.at[i, invoice.CREDIT_CODE_FIELD] = self.NEW_PI_CREDIT_CODE - data.at[i, invoice.BALANCE_FIELD] = ( - row[invoice.COST_FIELD] - applied_credit - ) - remaining_credit -= applied_credit + credits_used = self.apply_flat_discount( + data, + credit_eligible_projects, + remaining_credit, + invoice.CREDIT_FIELD, + invoice.BALANCE_FIELD, + invoice.CREDIT_CODE_FIELD, + self.NEW_PI_CREDIT_CODE, + ) - credits_used = initial_credit - remaining_credit if (pi_old_pi_entry[credit_used_field] != 0) and ( credits_used != pi_old_pi_entry[credit_used_field] ): diff --git a/process_report/invoices/bu_internal_invoice.py b/process_report/invoices/bu_internal_invoice.py index c4b772e..8226b97 100644 --- a/process_report/invoices/bu_internal_invoice.py +++ b/process_report/invoices/bu_internal_invoice.py @@ -2,10 +2,11 @@ from decimal import Decimal import process_report.invoices.invoice as invoice +import process_report.invoices.discount_invoice as discount_invoice @dataclass -class BUInternalInvoice(invoice.Invoice): +class BUInternalInvoice(discount_invoice.DiscountInvoice): subsidy_amount: int def _prepare(self): @@ -34,35 +35,36 @@ def get_project(row): ] def _process(self): - project_list = self.data["Project"].unique() - data_no_dup = self.data.drop_duplicates("Project", inplace=False) + data_summed_projects = self._sum_project_allocations(self.data) + self.data = self._apply_subsidy(data_summed_projects, self.subsidy_amount) + + def _sum_project_allocations(self, dataframe): + """A project may have multiple allocations, and therefore multiple rows + in the raw invoices. For BU-Internal invoice, we only want 1 row for + each unique project, summing up its allocations' costs""" + project_list = dataframe["Project"].unique() + data_no_dup = dataframe.drop_duplicates("Project", inplace=False) sum_fields = [invoice.COST_FIELD, invoice.CREDIT_FIELD, invoice.BALANCE_FIELD] for project in project_list: - project_mask = self.data["Project"] == project + project_mask = dataframe["Project"] == project no_dup_project_mask = data_no_dup["Project"] == project - sum_fields_sums = self.data[project_mask][sum_fields].sum().values + sum_fields_sums = dataframe[project_mask][sum_fields].sum().values data_no_dup.loc[no_dup_project_mask, sum_fields] = sum_fields_sums - self.data = self._apply_subsidy(data_no_dup, self.subsidy_amount) + return data_no_dup def _apply_subsidy(self, dataframe, subsidy_amount): pi_list = dataframe[invoice.PI_FIELD].unique() for pi in pi_list: pi_projects = dataframe[dataframe[invoice.PI_FIELD] == pi] - remaining_subsidy = subsidy_amount - for i, row in pi_projects.iterrows(): - project_remaining_cost = row[invoice.BALANCE_FIELD] - applied_subsidy = min(project_remaining_cost, remaining_subsidy) - - dataframe.at[i, invoice.SUBSIDY_FIELD] = applied_subsidy - dataframe.at[i, invoice.BALANCE_FIELD] = ( - row[invoice.BALANCE_FIELD] - applied_subsidy - ) - remaining_subsidy -= applied_subsidy - - if remaining_subsidy == 0: - break + self.apply_flat_discount( + dataframe, + pi_projects, + subsidy_amount, + invoice.SUBSIDY_FIELD, + invoice.BALANCE_FIELD, + ) return dataframe diff --git a/process_report/invoices/discount_invoice.py b/process_report/invoices/discount_invoice.py new file mode 100644 index 0000000..333b998 --- /dev/null +++ b/process_report/invoices/discount_invoice.py @@ -0,0 +1,79 @@ +from dataclasses import dataclass + +import pandas + +import process_report.invoices.invoice as invoice + + +@dataclass +class DiscountInvoice(invoice.Invoice): + """ + Invoice class containing functions useful for applying discounts + on dataframes + """ + + @staticmethod + def apply_flat_discount( + invoice: pandas.DataFrame, + pi_projects: pandas.DataFrame, + discount_amount: int, + discount_field: str, + balance_field: str, + code_field: str = None, + discount_code: str = None, + ): + """ + Takes in an invoice and a list of PI projects that are a subset of it, + and applies a flat discount to those PI projects. Note that this function + will change the provided `invoice` Dataframe directly. Therefore, it does + not return the changed invoice. + + This function assumes that the balance field shows the remaining cost of the project, + or what the PI would pay before the flat discount is applied. + + If the optional parameters `code_field` and `discount_code` are passed in, + `discount_code` will be comma-APPENDED to the `code_field` of projects where + the discount is applied + + Returns the amount of discount used. + + :param invoice: Dataframe containing all projects + :param pi_projects: A subset of `invoice`, containing all projects for a PI you want to apply the discount + :param discount_amount: The discount given to the PI + :param discount_field: Name of the field to put the discount amount applied to each project + :param balance_field: Name of the balance field + :param code_field: Name of the discount code field + :param discount_code: Code of the discount + """ + + def apply_discount_on_project(remaining_discount_amount, project_i, project): + remaining_project_balance = project[balance_field] + applied_discount = min(remaining_project_balance, remaining_discount_amount) + invoice.at[project_i, discount_field] = applied_discount + invoice.at[project_i, balance_field] = ( + project[balance_field] - applied_discount + ) + remaining_discount_amount -= applied_discount + return remaining_discount_amount + + def apply_credit_code_on_project(project_i): + if code_field and discount_code: + if pandas.isna(invoice.at[project_i, code_field]): + invoice.at[project_i, code_field] = discount_code + else: + invoice.at[project_i, code_field] = ( + invoice.at[project_i, code_field] + "," + discount_code + ) + + remaining_discount_amount = discount_amount + for i, row in pi_projects.iterrows(): + if remaining_discount_amount == 0: + break + else: + remaining_discount_amount = apply_discount_on_project( + remaining_discount_amount, i, row + ) + apply_credit_code_on_project(i) + + discount_used = discount_amount - remaining_discount_amount + return discount_used diff --git a/process_report/process_report.py b/process_report/process_report.py index 7d62083..3f309a2 100644 --- a/process_report/process_report.py +++ b/process_report/process_report.py @@ -2,13 +2,12 @@ import os import sys import datetime -import functools import json import pandas -import boto3 import pyarrow +from process_report.util import get_invoice_bucket, process_and_export_invoices from process_report.invoices import ( lenovo_invoice, nonbillable_invoice, @@ -87,22 +86,6 @@ def load_alias(alias_file): return alias_dict -@functools.lru_cache -def get_invoice_bucket(): - try: - s3_resource = boto3.resource( - service_name="s3", - endpoint_url=os.environ.get( - "S3_ENDPOINT", "https://s3.us-east-005.backblazeb2.com" - ), - aws_access_key_id=os.environ["S3_KEY_ID"], - aws_secret_access_key=os.environ["S3_APP_KEY"], - ) - except KeyError: - print("Error: Please set the environment variables S3_KEY_ID and S3_APP_KEY") - return s3_resource.Bucket(os.environ.get("S3_BUCKET_NAME", "nerc-invoicing")) - - def get_iso8601_time(): return datetime.datetime.now().strftime("%Y%m%dT%H%M%SZ") @@ -249,12 +232,6 @@ def main(): nonbillable_pis=pi, nonbillable_projects=projects, ) - for invoice in [lenovo_inv, nonbillable_inv]: - invoice.process() - invoice.export() - if args.upload_to_s3: - bucket = get_invoice_bucket() - invoice.export_s3(bucket) if args.upload_to_s3: backup_to_s3_old_pi_file(old_pi_file) @@ -267,35 +244,27 @@ def main(): nonbillable_projects=projects, old_pi_filepath=old_pi_file, ) - billable_inv.process() - billable_inv.export() + + process_and_export_invoices( + [lenovo_inv, nonbillable_inv, billable_inv], args.upload_to_s3 + ) nerc_total_inv = NERC_total_invoice.NERCTotalInvoice( name=args.NERC_total_invoice_file, invoice_month=invoice_month, - data=billable_inv.data, + data=billable_inv.data.copy(), ) - nerc_total_inv.process() - nerc_total_inv.export() - - if args.upload_to_s3: - for invoice in [billable_inv, nerc_total_inv]: - bucket = get_invoice_bucket() - invoice.export_s3(bucket) bu_internal_inv = bu_internal_invoice.BUInternalInvoice( name=args.BU_invoice_file, invoice_month=invoice_month, - data=billable_inv.data, + data=billable_inv.data.copy(), subsidy_amount=args.BU_subsidy_amount, ) - bu_internal_inv.process() - bu_internal_inv.export() - if args.upload_to_s3: - bucket = get_invoice_bucket() - bu_internal_inv.export_s3(bucket) - export_pi_billables(billable_inv.data, args.output_folder, invoice_month) + process_and_export_invoices([nerc_total_inv, bu_internal_inv], args.upload_to_s3) + + export_pi_billables(billable_inv.data.copy(), args.output_folder, invoice_month) if args.upload_to_s3: invoice_list = list() diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py index 3713c84..411b55a 100644 --- a/process_report/tests/unit_tests.py +++ b/process_report/tests/unit_tests.py @@ -5,7 +5,6 @@ import os import uuid import math -from decimal import Decimal from textwrap import dedent from process_report import process_report, util @@ -419,7 +418,7 @@ def setUp(self): self.dataframe = pandas.DataFrame(data) self.dataframe["Credit"] = None self.dataframe["Credit Code"] = None - self.dataframe["Balance"] = Decimal(0) + self.dataframe["Balance"] = self.dataframe["Cost"] self.answer_dataframe = pandas.DataFrame(answer_df_dict) old_pi = [ "PI,First Invoice Month,Initial Credits,1st Month Used,2nd Month Used", @@ -516,7 +515,7 @@ def setUp(self): ) self.dataframe_no_gpu["Credit"] = None self.dataframe_no_gpu["Credit Code"] = None - self.dataframe_no_gpu["Balance"] = Decimal(0) + self.dataframe_no_gpu["Balance"] = self.dataframe_no_gpu["Cost"] old_pi_no_gpu = [ "PI,First Invoice Month,Initial Credits,1st Month Used,2nd Month Used", "OldPI,2024-03,500,200,0", diff --git a/process_report/util.py b/process_report/util.py index e6d2f21..45b9ed4 100644 --- a/process_report/util.py +++ b/process_report/util.py @@ -1,12 +1,32 @@ +import os import datetime import json import logging +import functools + +import boto3 logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) +@functools.lru_cache +def get_invoice_bucket(): + try: + s3_resource = boto3.resource( + service_name="s3", + endpoint_url=os.environ.get( + "S3_ENDPOINT", "https://s3.us-east-005.backblazeb2.com" + ), + aws_access_key_id=os.environ["S3_KEY_ID"], + aws_secret_access_key=os.environ["S3_APP_KEY"], + ) + except KeyError: + print("Error: Please set the environment variables S3_KEY_ID and S3_APP_KEY") + return s3_resource.Bucket(os.environ.get("S3_BUCKET_NAME", "nerc-invoicing")) + + def get_institution_from_pi(institute_map, pi_uname): institution_key = pi_uname.split("@")[-1] institution_name = institute_map.get(institution_key, "") @@ -40,3 +60,12 @@ def get_month_diff(month_1, month_2): dt1 = datetime.datetime.strptime(month_1, "%Y-%m") dt2 = datetime.datetime.strptime(month_2, "%Y-%m") return (dt1.year - dt2.year) * 12 + (dt1.month - dt2.month) + + +def process_and_export_invoices(invoice_list, upload_to_s3): + for invoice in invoice_list: + invoice.process() + invoice.export() + if upload_to_s3: + bucket = get_invoice_bucket() + invoice.export_s3(bucket)