From c1ba0d3153c6c3cec2d8b004a27638a8eb4e3d14 Mon Sep 17 00:00:00 2001
From: Quan Pham <qmpham2019@gmail.com>
Date: Mon, 11 Nov 2024 13:20:42 -0500
Subject: [PATCH] Since all current discounts have been implemented as
 processors, the `DiscountInvoice` class is now removed. No invoice class will
 now class `_prepare()` or `_process()`.

`BUSubsidyProcessor`, which handles processing for the BU subsidy,
sets `IS_DISCOUNT_BY_NERC` to `False` because the
subsidy is not provided by NERC. Because of this, `BU Balance` indicates
the money which BU (not the PI they are subsidizing) owes to the
MGHPCC.

The test cases for the BU Subsidy has been refactored to be
more robust and readable.
The `Project` field has been added to `invoice.py`
---
 .../invoices/bu_internal_invoice.py           |  48 +---
 process_report/invoices/discount_invoice.py   |  79 ------
 process_report/invoices/invoice.py            |   1 +
 process_report/process_report.py              |   9 +-
 .../processors/bu_subsidy_processor.py        |  44 ++-
 process_report/tests/unit_tests.py            | 256 +++++++++++-------
 process_report/tests/util.py                  |  21 +-
 7 files changed, 209 insertions(+), 249 deletions(-)
 delete mode 100644 process_report/invoices/discount_invoice.py

diff --git a/process_report/invoices/bu_internal_invoice.py b/process_report/invoices/bu_internal_invoice.py
index 402bfbf..043cadc 100644
--- a/process_report/invoices/bu_internal_invoice.py
+++ b/process_report/invoices/bu_internal_invoice.py
@@ -1,12 +1,10 @@
 from dataclasses import dataclass
-from decimal import Decimal
 
 import process_report.invoices.invoice as invoice
-import process_report.invoices.discount_invoice as discount_invoice
 
 
 @dataclass
-class BUInternalInvoice(discount_invoice.DiscountInvoice):
+class BUInternalInvoice(invoice.Invoice):
     """
     Dependancies:
     - ValidateBillablePIsProcessor
@@ -20,31 +18,19 @@ class BUInternalInvoice(discount_invoice.DiscountInvoice):
         invoice.COST_FIELD,
         invoice.CREDIT_FIELD,
         invoice.SUBSIDY_FIELD,
-        invoice.BALANCE_FIELD,
+        invoice.PI_BALANCE_FIELD,
     ]
 
-    subsidy_amount: int
-
-    def _prepare(self):
-        def get_project(row):
-            project_alloc = row[invoice.PROJECT_FIELD]
-            if project_alloc.rfind("-") == -1:
-                return project_alloc
-            else:
-                return project_alloc[: project_alloc.rfind("-")]
+    exported_columns_map = {invoice.PI_BALANCE_FIELD: "Balance"}
 
+    def _prepare_export(self):
         self.data = self.data[
             self.data[invoice.IS_BILLABLE_FIELD] & ~self.data[invoice.MISSING_PI_FIELD]
         ]
         self.data = self.data[
             self.data[invoice.INSTITUTION_FIELD] == "Boston University"
-        ].copy()
-        self.data["Project"] = self.data.apply(get_project, axis=1)
-        self.data[invoice.SUBSIDY_FIELD] = Decimal(0)
-
-    def _process(self):
-        data_summed_projects = self._sum_project_allocations(self.data)
-        self.data = self._apply_subsidy(data_summed_projects, self.subsidy_amount)
+        ]
+        self.data = self._sum_project_allocations(self.data)
 
     def _sum_project_allocations(self, dataframe):
         """A project may have multiple allocations, and therefore multiple rows
@@ -52,7 +38,12 @@ def _sum_project_allocations(self, dataframe):
         each unique project, summing up its allocations' costs"""
         project_list = dataframe["Project"].unique()
         data_no_dup = dataframe.drop_duplicates("Project", inplace=False)
-        sum_fields = [invoice.COST_FIELD, invoice.CREDIT_FIELD, invoice.BALANCE_FIELD]
+        sum_fields = [
+            invoice.COST_FIELD,
+            invoice.CREDIT_FIELD,
+            invoice.SUBSIDY_FIELD,
+            invoice.PI_BALANCE_FIELD,
+        ]
         for project in project_list:
             project_mask = dataframe["Project"] == project
             no_dup_project_mask = data_no_dup["Project"] == project
@@ -61,18 +52,3 @@ def _sum_project_allocations(self, dataframe):
             data_no_dup.loc[no_dup_project_mask, sum_fields] = sum_fields_sums
 
         return data_no_dup
-
-    def _apply_subsidy(self, dataframe, subsidy_amount):
-        pi_list = dataframe[invoice.PI_FIELD].unique()
-
-        for pi in pi_list:
-            pi_projects = dataframe[dataframe[invoice.PI_FIELD] == pi]
-            self.apply_flat_discount(
-                dataframe,
-                pi_projects,
-                subsidy_amount,
-                invoice.SUBSIDY_FIELD,
-                invoice.BALANCE_FIELD,
-            )
-
-        return dataframe
diff --git a/process_report/invoices/discount_invoice.py b/process_report/invoices/discount_invoice.py
deleted file mode 100644
index 333b998..0000000
--- a/process_report/invoices/discount_invoice.py
+++ /dev/null
@@ -1,79 +0,0 @@
-from dataclasses import dataclass
-
-import pandas
-
-import process_report.invoices.invoice as invoice
-
-
-@dataclass
-class DiscountInvoice(invoice.Invoice):
-    """
-    Invoice class containing functions useful for applying discounts
-    on dataframes
-    """
-
-    @staticmethod
-    def apply_flat_discount(
-        invoice: pandas.DataFrame,
-        pi_projects: pandas.DataFrame,
-        discount_amount: int,
-        discount_field: str,
-        balance_field: str,
-        code_field: str = None,
-        discount_code: str = None,
-    ):
-        """
-        Takes in an invoice and a list of PI projects that are a subset of it,
-        and applies a flat discount to those PI projects. Note that this function
-        will change the provided `invoice` Dataframe directly. Therefore, it does
-        not return the changed invoice.
-
-        This function assumes that the balance field shows the remaining cost of the project,
-        or what the PI would pay before the flat discount is applied.
-
-        If the optional parameters `code_field` and `discount_code` are passed in,
-        `discount_code` will be comma-APPENDED to the `code_field` of projects where
-        the discount is applied
-
-        Returns the amount of discount used.
-
-        :param invoice: Dataframe containing all projects
-        :param pi_projects: A subset of `invoice`, containing all projects for a PI you want to apply the discount
-        :param discount_amount: The discount given to the PI
-        :param discount_field: Name of the field to put the discount amount applied to each project
-        :param balance_field: Name of the balance field
-        :param code_field: Name of the discount code field
-        :param discount_code: Code of the discount
-        """
-
-        def apply_discount_on_project(remaining_discount_amount, project_i, project):
-            remaining_project_balance = project[balance_field]
-            applied_discount = min(remaining_project_balance, remaining_discount_amount)
-            invoice.at[project_i, discount_field] = applied_discount
-            invoice.at[project_i, balance_field] = (
-                project[balance_field] - applied_discount
-            )
-            remaining_discount_amount -= applied_discount
-            return remaining_discount_amount
-
-        def apply_credit_code_on_project(project_i):
-            if code_field and discount_code:
-                if pandas.isna(invoice.at[project_i, code_field]):
-                    invoice.at[project_i, code_field] = discount_code
-                else:
-                    invoice.at[project_i, code_field] = (
-                        invoice.at[project_i, code_field] + "," + discount_code
-                    )
-
-        remaining_discount_amount = discount_amount
-        for i, row in pi_projects.iterrows():
-            if remaining_discount_amount == 0:
-                break
-            else:
-                remaining_discount_amount = apply_discount_on_project(
-                    remaining_discount_amount, i, row
-                )
-                apply_credit_code_on_project(i)
-
-        discount_used = discount_amount - remaining_discount_amount
-        return discount_used
diff --git a/process_report/invoices/invoice.py b/process_report/invoices/invoice.py
index 35c8ffb..7ffb6d8 100644
--- a/process_report/invoices/invoice.py
+++ b/process_report/invoices/invoice.py
@@ -37,6 +37,7 @@
 IS_BILLABLE_FIELD = "Is Billable"
 MISSING_PI_FIELD = "Missing PI"
 PI_BALANCE_FIELD = "PI Balance"
+PROJECT_NAME_FIELD = "Project"
 ###
 
 
diff --git a/process_report/process_report.py b/process_report/process_report.py
index cd65430..49b7644 100644
--- a/process_report/process_report.py
+++ b/process_report/process_report.py
@@ -21,6 +21,7 @@
     lenovo_processor,
     validate_billable_pi_processor,
     new_pi_credit_processor,
+    bu_subsidy_processor,
 )
 
 ### PI file field names
@@ -242,7 +243,12 @@ def main():
     )
     new_pi_credit_proc.process()
 
-    processed_data = new_pi_credit_proc.data
+    bu_subsidy_proc = bu_subsidy_processor.BUSubsidyProcessor(
+        "", invoice_month, new_pi_credit_proc.data.copy(), args.BU_subsidy_amount
+    )
+    bu_subsidy_proc.process()
+
+    processed_data = bu_subsidy_proc.data
 
     ### Initialize invoices
 
@@ -280,7 +286,6 @@ def main():
         name=args.BU_invoice_file,
         invoice_month=invoice_month,
         data=processed_data.copy(),
-        subsidy_amount=args.BU_subsidy_amount,
     )
 
     pi_inv = pi_specific_invoice.PIInvoice(
diff --git a/process_report/processors/bu_subsidy_processor.py b/process_report/processors/bu_subsidy_processor.py
index 976b8de..8dc35f7 100644
--- a/process_report/processors/bu_subsidy_processor.py
+++ b/process_report/processors/bu_subsidy_processor.py
@@ -7,6 +7,8 @@
 
 @dataclass
 class BUSubsidyProcessor(discount_processor.DiscountProcessor):
+    IS_DISCOUNT_BY_NERC = False
+
     subsidy_amount: int
 
     def _prepare(self):
@@ -17,43 +19,35 @@ def get_project(row):
             else:
                 return project_alloc[: project_alloc.rfind("-")]
 
-        self.data = self.data[
-            self.data[invoice.IS_BILLABLE_FIELD] & ~self.data[invoice.MISSING_PI_FIELD]
-        ]
-        self.data = self.data[
-            self.data[invoice.INSTITUTION_FIELD] == "Boston University"
-        ].copy()
-        self.data["Project"] = self.data.apply(get_project, axis=1)
+        self.data[invoice.PROJECT_NAME_FIELD] = self.data.apply(get_project, axis=1)
         self.data[invoice.SUBSIDY_FIELD] = Decimal(0)
 
     def _process(self):
-        data_summed_projects = self._sum_project_allocations(self.data)
-        self.data = self._apply_subsidy(data_summed_projects, self.subsidy_amount)
+        self.data = self._apply_subsidy(self.data, self.subsidy_amount)
 
-    def _sum_project_allocations(self, dataframe):
-        """A project may have multiple allocations, and therefore multiple rows
-        in the raw invoices. For BU-Internal invoice, we only want 1 row for
-        each unique project, summing up its allocations' costs"""
-        project_list = dataframe["Project"].unique()
-        data_no_dup = dataframe.drop_duplicates("Project", inplace=False)
-        sum_fields = [invoice.COST_FIELD, invoice.CREDIT_FIELD, invoice.BALANCE_FIELD]
-        for project in project_list:
-            project_mask = dataframe["Project"] == project
-            no_dup_project_mask = data_no_dup["Project"] == project
-
-            sum_fields_sums = dataframe[project_mask][sum_fields].sum().values
-            data_no_dup.loc[no_dup_project_mask, sum_fields] = sum_fields_sums
+    @staticmethod
+    def _get_subsidy_eligible_projects(data):
+        filtered_data = data[
+            data[invoice.IS_BILLABLE_FIELD] & ~data[invoice.MISSING_PI_FIELD]
+        ]
+        filtered_data = filtered_data[
+            filtered_data[invoice.INSTITUTION_FIELD] == "Boston University"
+        ].copy()
 
-        return data_no_dup
+        return filtered_data
 
     def _apply_subsidy(self, dataframe, subsidy_amount):
-        pi_list = dataframe[invoice.PI_FIELD].unique()
+        subsidy_eligible_projects = self._get_subsidy_eligible_projects(dataframe)
+        pi_list = subsidy_eligible_projects[invoice.PI_FIELD].unique()
 
         for pi in pi_list:
-            pi_projects = dataframe[dataframe[invoice.PI_FIELD] == pi]
+            pi_projects = subsidy_eligible_projects[
+                subsidy_eligible_projects[invoice.PI_FIELD] == pi
+            ]
             self.apply_flat_discount(
                 dataframe,
                 pi_projects,
+                invoice.PI_BALANCE_FIELD,
                 subsidy_amount,
                 invoice.SUBSIDY_FIELD,
                 invoice.BALANCE_FIELD,
diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py
index f09a7de..fb551e3 100644
--- a/process_report/tests/unit_tests.py
+++ b/process_report/tests/unit_tests.py
@@ -757,119 +757,179 @@ def test_apply_credit_error(self):
             test_invoice._get_pi_age(old_pi_df, "PI1", invoice_month)
 
 
-class TestBUSubsidy(TestCase):
-    def setUp(self):
-        data = {
-            "Invoice Month": [
-                "2024-03",
-                "2024-03",
-                "2024-03",
-                "2024-03",
-                "2024-03",
-                "2024-03",
-                "2024-03",
-                "2024-03",
-            ],
-            "Manager (PI)": ["PI1", "PI1", "PI2", "PI2", "PI3", "PI3", "PI4", "PI4"],
-            "Institution": [
-                "Boston University",
-                "Boston University",
+class TestBUSubsidyProcessor(TestCase):
+    def _assert_result_invoice(
+        self,
+        subsidy_amount,
+        test_invoice,
+        answer_invoice,
+        invoice_month="0000-00",
+    ):
+        new_bu_subsidy_proc = test_utils.new_bu_subsidy_processor(
+            invoice_month=invoice_month,
+            data=test_invoice,
+            subsidy_amount=subsidy_amount,
+        )
+        new_bu_subsidy_proc.process()
+        output_invoice = new_bu_subsidy_proc.data
+        answer_invoice = answer_invoice.astype(output_invoice.dtypes)
+        print(output_invoice)
+        print(answer_invoice)
+
+        self.assertTrue(output_invoice.equals(answer_invoice))
+
+    def _get_test_invoice(
+        self,
+        pi,
+        pi_balances,
+        balances=None,
+        project_names=None,
+        institution=None,
+        is_billable=None,
+        missing_pi=None,
+    ):
+        if not balances:
+            balances = pi_balances
+
+        if not project_names:
+            project_names = ["" for _ in range(len(pi))]
+
+        if not institution:
+            institution = ["Boston University" for _ in range(len(pi))]
+
+        if not is_billable:
+            is_billable = [True for _ in range(len(pi))]
+
+        if not missing_pi:
+            missing_pi = [False for _ in range(len(pi))]
+
+        return pandas.DataFrame(
+            {
+                "Manager (PI)": pi,
+                "Project - Allocation": project_names,
+                "PI Balance": pi_balances,
+                "Balance": balances,
+                "Institution": institution,
+                "Is Billable": is_billable,
+                "Missing PI": missing_pi,
+            }
+        )
+
+    def test_exclude_non_BU_pi(self):
+        """Are only BU PIs given the subsidy?"""
+
+        subsidy_amount = 100
+        test_invoice = self._get_test_invoice(
+            [str(i) for i in range(5)],
+            pi_balances=[subsidy_amount for _ in range(5)],
+            institution=[
                 "Boston University",
                 "Boston University",
-                "Harvard University",  # Test case for non-BU PIs
+                "boston university",
                 "Harvard University",
-                "Boston University",
-                "Boston University",
+                "BU",
             ],
-            "Project - Allocation": [
-                "ProjectA-e6413",
-                "ProjectA-t575e6",  # Test case for project with >1 allocation
-                "ProjectB-fddgfygg",
-                "ProjectB-5t143t",
-                "ProjectC-t14334",
-                "ProjectD",  # Test case for correctly extracting project name
-                "ProjectE-test-r25135",  # Test case for BU PI with >1 project
-                "ProjectF",
-            ],
-            "Cost": [1050, 500, 100, 925, 10000, 1000, 1050, 100],
-            "Credit": [
-                1000,
-                0,
-                100,
-                900,
-                0,
-                0,
-                1000,
-                0,
-            ],  # Test cases where PI does/dones't have credits alreadys
-            "Balance": [
-                50,
-                500,
-                0,
-                25,
-                10000,
-                1000,
-                50,
-                100,
-            ],  # Test case where subsidy does/doesn't cover fully balance
-            "Is Billable": [True, True, True, True, True, True, True, True],
-            "Missing PI": [False, False, False, False, False, False, False, False],
-        }
-        self.dataframe = pandas.DataFrame(data)
-        self.subsidy = 100
-
-    def test_apply_BU_subsidy(self):
-        test_invoice = test_utils.new_bu_internal_invoice(
-            data=self.dataframe, subsidy_amount=self.subsidy
         )
-        test_invoice.process()
-        output_df = test_invoice.data.reset_index()
 
-        self.assertTrue(
-            set(
-                [
-                    process_report.INVOICE_DATE_FIELD,
-                    "Project",
-                    process_report.PI_FIELD,
-                    process_report.COST_FIELD,
-                    process_report.CREDIT_FIELD,
-                    process_report.SUBSIDY_FIELD,
-                    process_report.BALANCE_FIELD,
-                ]
-            ).issubset(output_df)
+        answer_invoice = test_invoice.copy()
+        answer_invoice["Project"] = answer_invoice["Project - Allocation"]
+        answer_invoice["Subsidy"] = [subsidy_amount, subsidy_amount, 0, 0, 0]
+        answer_invoice["PI Balance"] = [
+            0,
+            0,
+            subsidy_amount,
+            subsidy_amount,
+            subsidy_amount,
+        ]
+
+        self._assert_result_invoice(subsidy_amount, test_invoice, answer_invoice)
+
+    def test_exclude_nonbillables(self):
+        """Are nonbillables excluded from the subsidy?"""
+        subsidy_amount = 100
+        test_invoice = self._get_test_invoice(
+            [str(i) for i in range(6)],
+            pi_balances=[subsidy_amount for _ in range(6)],
+            is_billable=[True, True, False, False, True, True],
+            missing_pi=[True, True, False, False, False, False],
         )
 
-        self.assertTrue(
-            set(["PI1", "PI2", "PI4"]).issubset(output_df["Manager (PI)"].unique())
+        answer_invoice = test_invoice.copy()
+        answer_invoice["Project"] = answer_invoice["Project - Allocation"]
+        answer_invoice["Subsidy"] = [0, 0, 0, 0, subsidy_amount, subsidy_amount]
+        answer_invoice["PI Balance"] = [
+            subsidy_amount,
+            subsidy_amount,
+            subsidy_amount,
+            subsidy_amount,
+            0,
+            0,
+        ]
+
+        self._assert_result_invoice(subsidy_amount, test_invoice, answer_invoice)
+
+    def test_one_pi_many_allocations(self):
+        """Is subsidy applied properly to BU PI with many allocations?"""
+
+        # Two projects, one allocation each
+        subsidy_amount = 100
+        test_invoice = self._get_test_invoice(
+            ["PI" for i in range(2)],
+            pi_balances=[60, 60],
+            project_names=["P1", "P2"],
         )
-        self.assertFalse("PI3" in output_df["Project"].unique())
 
-        self.assertTrue(
-            set(["ProjectA", "ProjectB", "ProjectE-test", "ProjectF"]).issubset(
-                output_df["Project"].unique()
-            )
+        answer_invoice = test_invoice.copy()
+        answer_invoice["Project"] = answer_invoice["Project - Allocation"]
+        answer_invoice["Subsidy"] = [60, 40]
+        answer_invoice["PI Balance"] = [0, 20]
+
+        self._assert_result_invoice(subsidy_amount, test_invoice, answer_invoice)
+
+        # Two projects, two allocations each
+        test_invoice = self._get_test_invoice(
+            ["PI" for i in range(4)],
+            pi_balances=[40, 40, 40, 40],
+            project_names=["P1-A1", "P1-A1-test", "P2", "P2-"],
         )
-        self.assertFalse(
-            set(["ProjectC-t14334", "ProjectC", "ProjectD"]).intersection(
-                output_df["Project"].unique()
-            )
+
+        answer_invoice = test_invoice.copy()
+        answer_invoice["Project"] = ["P1", "P1-A1", "P2", "P2"]
+        answer_invoice["Subsidy"] = [40, 40, 20, 0]
+        answer_invoice["PI Balance"] = [0, 0, 20, 40]
+
+        self._assert_result_invoice(subsidy_amount, test_invoice, answer_invoice)
+
+        # Two allocations, one where PI balance != NERC balance
+        test_invoice = self._get_test_invoice(
+            ["PI" for i in range(2)],
+            pi_balances=[80, 80],
+            project_names=["P1", "P2"],
+            balances=[100, 80],
         )
 
-        self.assertEqual(4, len(output_df.index))
-        self.assertEqual(1550, output_df.loc[0, "Cost"])
-        self.assertEqual(1025, output_df.loc[1, "Cost"])
-        self.assertEqual(1050, output_df.loc[2, "Cost"])
-        self.assertEqual(100, output_df.loc[3, "Cost"])
+        answer_invoice = test_invoice.copy()
+        answer_invoice["Project"] = answer_invoice["Project - Allocation"]
+        answer_invoice["Subsidy"] = [80, 20]
+        answer_invoice["PI Balance"] = [0, 60]
+
+        self._assert_result_invoice(subsidy_amount, test_invoice, answer_invoice)
 
-        self.assertEqual(100, output_df.loc[0, "Subsidy"])
-        self.assertEqual(25, output_df.loc[1, "Subsidy"])
-        self.assertEqual(50, output_df.loc[2, "Subsidy"])
-        self.assertEqual(50, output_df.loc[3, "Subsidy"])
+    def test_two_pi(self):
+        """Is subsidy applied to more than one PI?"""
+        # Each PI has two allocations
+        subsidy_amount = 100
+        test_invoice = self._get_test_invoice(
+            ["PI1", "PI1", "PI2", "PI2"],
+            pi_balances=[80, 80, 40, 40],
+        )
+
+        answer_invoice = test_invoice.copy()
+        answer_invoice["Project"] = answer_invoice["Project - Allocation"]
+        answer_invoice["Subsidy"] = [80, 20, 40, 40]
+        answer_invoice["PI Balance"] = [0, 60, 0, 0]
 
-        self.assertEqual(450, output_df.loc[0, "Balance"])
-        self.assertEqual(0, output_df.loc[1, "Balance"])
-        self.assertEqual(0, output_df.loc[2, "Balance"])
-        self.assertEqual(50, output_df.loc[3, "Balance"])
+        self._assert_result_invoice(subsidy_amount, test_invoice, answer_invoice)
 
 
 class TestLenovoProcessor(TestCase):
diff --git a/process_report/tests/util.py b/process_report/tests/util.py
index 6c2cb50..a022395 100644
--- a/process_report/tests/util.py
+++ b/process_report/tests/util.py
@@ -3,7 +3,6 @@
 from process_report.invoices import (
     invoice,
     billable_invoice,
-    bu_internal_invoice,
     pi_specific_invoice,
 )
 
@@ -13,6 +12,7 @@
     lenovo_processor,
     validate_billable_pi_processor,
     new_pi_credit_processor,
+    bu_subsidy_processor,
 )
 
 
@@ -40,14 +40,6 @@ def new_billable_invoice(
     )
 
 
-def new_bu_internal_invoice(
-    name="", invoice_month="0000-00", data=pandas.DataFrame(), subsidy_amount=0
-):
-    return bu_internal_invoice.BUInternalInvoice(
-        name, invoice_month, data, subsidy_amount
-    )
-
-
 def new_pi_specific_invoice(
     name="",
     invoice_month="0000-00",
@@ -106,3 +98,14 @@ def new_new_pi_credit_processor(
     return new_pi_credit_processor.NewPICreditProcessor(
         name, invoice_month, data, old_pi_filepath, limit_new_pi_credit_to_partners
     )
+
+
+def new_bu_subsidy_processor(
+    name="",
+    invoice_month="0000-00",
+    data=pandas.DataFrame(),
+    subsidy_amount=0,
+):
+    return bu_subsidy_processor.BUSubsidyProcessor(
+        name, invoice_month, data, subsidy_amount
+    )