Skip to content

Commit

Permalink
Merge pull request #96 from QuanMPhm/94/limit_credits
Browse files Browse the repository at this point in the history
Allow limiting New-PI credit to partner institutions
  • Loading branch information
naved001 authored Sep 30, 2024
2 parents 5d3638b + 696407c commit bc4f599
Show file tree
Hide file tree
Showing 8 changed files with 171 additions and 46 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@ RUN pip install -r requirements.txt

COPY tools/ tools/
COPY process_report/process_report.py process_report/
COPY process_report/institute_map.json process_report/
COPY process_report/institute_list.yaml process_report/

CMD ["tools/clone_nonbillables_and_process_invoice.sh"]
67 changes: 67 additions & 0 deletions process_report/institute_list.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
- display_name: Northeastern University
domains:
- northeastern.edu
- display_name: Boston University
domains:
- bu.edu
- display_name: Bentley
domains:
- bentley.edu
- display_name: University of Rhode Island
domains:
- uri.edu
- display_name: Red Hat
domains:
- redhat.com
- display_name: Boston Childrens Hospital
domains:
- childrens.harvard.edu
- display_name: McLean Hospital
domains:
- mclean.harvard.edu
- display_name: Massachusetts Eye & Ear
domains:
- meei.harvard.edu
- display_name: Dana-Farber Cancer Institute
domains:
- dfci.harvard.edu
- display_name: Brigham and Women's Hospital
domains:
- bwh.harvard.edu
- display_name: Beth Israel Deaconess Medical Center
domains:
- bidmc.harvard.edu
- display_name: Harvard University
domains:
- harvard.edu
- chemistry.harvard.edu
- display_name: Worcester Polytechnic Institute
domains:
- wpi.edu
- display_name: Massachusetts Institute of Technology
domains:
- mit.edu
- display_name: University of Massachusetts Amherst
domains:
- umass.edu
- display_name: University of Massachusetts Lowell
domains:
- uml.edu
- display_name: Code For Boston
domains:
- codeforboston.org
- display_name: Yale University
domains:
- yale.edu
- display_name: Dartmouth College
domains:
- dartmouth.edu
- display_name: Photrek
domains:
- photrek.io
- display_name: Positron Networks
domains:
- positronnetworks.com
- display_name: Next Generation Justice
domains:
- nextgenjustice.llc
35 changes: 29 additions & 6 deletions process_report/invoices/billable_invoice.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class BillableInvoice(discount_invoice.DiscountInvoice):
nonbillable_pis: list[str]
nonbillable_projects: list[str]
old_pi_filepath: str
limit_new_pi_credit_to_partners: bool = False

@staticmethod
def _load_old_pis(old_pi_filepath) -> pandas.DataFrame:
Expand Down Expand Up @@ -115,6 +116,28 @@ def export_s3(self, s3_bucket):
super().export_s3(s3_bucket)
s3_bucket.upload_file(self.old_pi_filepath, self.PI_S3_FILEPATH)

def _filter_partners(self, data):
    """Keep only the rows whose institution is an active MGHPCC partner.

    An institute-list entry counts as an active partner when it has a
    truthy ``mghpcc_partnership_start_date`` and
    ``util.get_month_diff(self.invoice_month, start_date)`` is
    non-negative. Entries without a start date are never partners.

    Returns the subset of ``data`` whose institution column holds the
    ``display_name`` of one of those active partners.
    """
    partner_names = [
        entry["display_name"]
        for entry in util.load_institute_list()
        if (start_date := entry.get("mghpcc_partnership_start_date"))
        and util.get_month_diff(self.invoice_month, start_date) >= 0
    ]

    is_partner_row = data[invoice.INSTITUTION_FIELD].isin(partner_names)
    return data[is_partner_row]

def _filter_excluded_su_types(self, data):
    """Drop the rows whose SU type is listed in ``self.EXCLUDE_SU_TYPES``."""
    is_excluded = data[invoice.SU_TYPE_FIELD].isin(self.EXCLUDE_SU_TYPES)
    return data[~is_excluded]

def _get_credit_eligible_projects(self, data: pandas.DataFrame):
    """Return the rows of ``data`` that may receive the New-PI credit.

    Rows with an excluded SU type are always removed. When
    ``self.limit_new_pi_credit_to_partners`` is truthy, the remainder is
    further restricted to institutions with an active partnership.
    """
    eligible = self._filter_excluded_su_types(data)
    if not self.limit_new_pi_credit_to_partners:
        return eligible

    return self._filter_partners(eligible)

def _apply_credits_new_pi(
self, data: pandas.DataFrame, old_pi_df: pandas.DataFrame
):
Expand All @@ -140,19 +163,19 @@ def get_initial_credit_amount(
)
print(f"New PI Credit set at {new_pi_credit_amount} for {self.invoice_month}")

current_pi_set = set(data[invoice.PI_FIELD])
credit_eligible_projects = self._get_credit_eligible_projects(data)
current_pi_set = set(credit_eligible_projects[invoice.PI_FIELD])
for pi in current_pi_set:
credit_eligible_projects = data[
(data[invoice.PI_FIELD] == pi)
& ~(data[invoice.SU_TYPE_FIELD].isin(self.EXCLUDE_SU_TYPES))
pi_projects = credit_eligible_projects[
credit_eligible_projects[invoice.PI_FIELD] == pi
]
pi_age = self._get_pi_age(old_pi_df, pi, self.invoice_month)
pi_old_pi_entry = old_pi_df.loc[
old_pi_df[invoice.PI_PI_FIELD] == pi
].squeeze()

if pi_age > 1:
for i, row in credit_eligible_projects.iterrows():
for i, row in pi_projects.iterrows():
data.at[i, invoice.BALANCE_FIELD] = row[invoice.COST_FIELD]
else:
if pi_age == 0:
Expand Down Expand Up @@ -180,7 +203,7 @@ def get_initial_credit_amount(

credits_used = self.apply_flat_discount(
data,
credit_eligible_projects,
pi_projects,
remaining_credit,
invoice.CREDIT_FIELD,
invoice.BALANCE_FIELD,
Expand Down
45 changes: 15 additions & 30 deletions process_report/process_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
import sys
import datetime

import json
import pandas
import pyarrow
from nerc_rates import load_from_url

from process_report.util import get_invoice_bucket, process_and_export_invoices
from process_report import util
from process_report.invoices import (
lenovo_invoice,
nonbillable_invoice,
Expand Down Expand Up @@ -51,26 +51,6 @@
ALIAS_S3_FILEPATH = "PIs/alias.csv"


def get_institution_from_pi(institute_map, pi_uname):
    """Look up the institution name for a PI username.

    The text after the last "@" is used as the domain. If the domain is
    not a key of ``institute_map``, its leftmost label is dropped and the
    lookup is retried, until the labels are exhausted.

    Returns the mapped name, or "" (after printing a warning) when no
    suffix of the domain matches.
    """
    domain = pi_uname.split("@")[-1]
    attempts_left = domain.count(".") + 1
    name = ""
    while attempts_left > 0:
        name = institute_map.get(domain, "")
        if name:
            break
        # Strip the leftmost label, e.g. "chemistry.harvard.edu" -> "harvard.edu".
        domain = domain[domain.find(".") + 1 :]
        attempts_left -= 1

    if name == "":
        print(f"Warning: PI name {pi_uname} does not match any institution!")

    return name


def load_institute_map() -> dict:
    """Read ``process_report/institute_map.json`` and return the parsed JSON."""
    with open("process_report/institute_map.json", "r") as map_file:
        return json.load(map_file)


def load_alias(alias_file):
alias_dict = dict()

Expand Down Expand Up @@ -236,16 +216,20 @@ def main():
if args.upload_to_s3:
backup_to_s3_old_pi_file(old_pi_file)

rates_info = load_from_url()
billable_inv = billable_invoice.BillableInvoice(
name=args.output_file,
invoice_month=invoice_month,
data=merged_dataframe.copy(),
nonbillable_pis=pi,
nonbillable_projects=projects,
old_pi_filepath=old_pi_file,
limit_new_pi_credit_to_partners=rates_info.get_value_at(
"Limit New PI Credit to MGHPCC Partners", invoice_month
),
)

process_and_export_invoices(
util.process_and_export_invoices(
[lenovo_inv, nonbillable_inv, billable_inv], args.upload_to_s3
)

Expand All @@ -266,15 +250,15 @@ def main():
name=args.output_folder, invoice_month=invoice_month, data=billable_inv.data
)

process_and_export_invoices(
util.process_and_export_invoices(
[nerc_total_inv, bu_internal_inv, pi_inv], args.upload_to_s3
)


def fetch_s3_invoices(invoice_month):
"""Fetches usage invoices from S3 given invoice month"""
s3_invoice_list = list()
invoice_bucket = get_invoice_bucket()
invoice_bucket = util.get_invoice_bucket()
for obj in invoice_bucket.objects.filter(
Prefix=f"Invoices/{invoice_month}/Service Invoices/"
):
Expand Down Expand Up @@ -339,20 +323,20 @@ def validate_pi_aliases(dataframe: pandas.DataFrame, alias_dict: dict):

def fetch_s3_alias_file():
    """Download the PI alias CSV from the invoice bucket.

    The object at ``ALIAS_S3_FILEPATH`` is saved as ``alias.csv`` in the
    current working directory.

    Returns the local file name.
    """
    local_name = "alias.csv"
    # The bucket helper lives in the util module; the bare name is not imported here.
    invoice_bucket = util.get_invoice_bucket()
    invoice_bucket.download_file(ALIAS_S3_FILEPATH, local_name)
    return local_name


def fetch_s3_old_pi_file():
    """Download the old-PI CSV from the invoice bucket.

    The object at ``PI_S3_FILEPATH`` is saved as ``PI.csv`` in the current
    working directory.

    Returns the local file name.
    """
    local_name = "PI.csv"
    # The bucket helper lives in the util module; the bare name is not imported here.
    invoice_bucket = util.get_invoice_bucket()
    invoice_bucket.download_file(PI_S3_FILEPATH, local_name)
    return local_name


def backup_to_s3_old_pi_file(old_pi_file):
    """Upload ``old_pi_file`` to the bucket's PI archive.

    The object key embeds the value of ``get_iso8601_time()`` so each
    backup gets its own name under ``PIs/Archive/``.
    """
    # The bucket helper lives in the util module; the bare name is not imported here.
    invoice_bucket = util.get_invoice_bucket()
    invoice_bucket.upload_file(old_pi_file, f"PIs/Archive/PI {get_iso8601_time()}.csv")


Expand All @@ -368,14 +352,15 @@ def add_institution(dataframe: pandas.DataFrame):
The list of mappings is defined in `institute_list.yaml`.
"""
institute_map = load_institute_map()
institute_list = util.load_institute_list()
institute_map = util.get_institute_mapping(institute_list)
dataframe = dataframe.astype({INSTITUTION_FIELD: "str"})
for i, row in dataframe.iterrows():
pi_name = row[PI_FIELD]
if pandas.isna(pi_name):
print(f"Project {row[PROJECT_FIELD]} has no PI")
else:
dataframe.at[i, INSTITUTION_FIELD] = get_institution_from_pi(
dataframe.at[i, INSTITUTION_FIELD] = util.get_institution_from_pi(
institute_map, pi_name
)

Expand Down
39 changes: 37 additions & 2 deletions process_report/tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ def test_get_pi_institution(self):

for pi_email, answer in answers.items():
self.assertEqual(
process_report.get_institution_from_pi(institute_map, pi_email), answer
util.get_institution_from_pi(institute_map, pi_email), answer
)


Expand Down Expand Up @@ -789,7 +789,7 @@ def test_process_lenovo(self):


class TestUploadToS3(TestCase):
@mock.patch("process_report.process_report.get_invoice_bucket")
@mock.patch("process_report.util.get_invoice_bucket")
@mock.patch("process_report.util.get_iso8601_time")
def test_upload_to_s3(self, mock_get_time, mock_get_bucket):
mock_bucket = mock.MagicMock()
Expand Down Expand Up @@ -833,3 +833,38 @@ def test_upload_to_s3(self, mock_get_time, mock_get_bucket):

for i, call_args in enumerate(mock_bucket.upload_file.call_args_list):
self.assertTrue(answers[i] in call_args)


class TestNERCRates(TestCase):
    """Checks the partner filter that backs the New-PI credit limit flag."""

    @mock.patch("process_report.util.load_institute_list")
    def test_flag_limit_new_pi_credit(self, mock_load_institute_list):
        """`_filter_partners` keeps only institutions whose partnership has started."""
        partnerships = [("BU", "2024-02"), ("HU", "2024-6"), ("NEU", "2024-11")]
        mock_load_institute_list.return_value = [
            {"display_name": name, "mghpcc_partnership_start_date": start}
            for name, start in partnerships
        ]
        usage_df = pandas.DataFrame(
            {
                "Institution": ["BU", "HU", "NEU", "MIT", "BC"],
            }
        )
        billable = test_utils.new_billable_invoice(
            limit_new_pi_credit_to_partners=True
        )

        # When no partnerships are active
        billable.invoice_month = "2024-01"
        self.assertTrue(billable._filter_partners(usage_df).empty)

        # When some, then all, partnerships are active
        expectations = (
            ("2024-06", ["BU", "HU"]),
            ("2024-12", ["BU", "HU", "NEU"]),
        )
        for month, expected_partners in expectations:
            billable.invoice_month = month
            filtered = billable._filter_partners(usage_df)
            expected = pandas.DataFrame({"Institution": expected_partners})
            self.assertTrue(filtered.equals(expected))
2 changes: 2 additions & 0 deletions process_report/tests/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def new_billable_invoice(
nonbillable_pis=[],
nonbillable_projects=[],
old_pi_filepath="",
limit_new_pi_credit_to_partners=False,
):
return billable_invoice.BillableInvoice(
name,
Expand All @@ -31,6 +32,7 @@ def new_billable_invoice(
nonbillable_pis,
nonbillable_projects,
old_pi_filepath,
limit_new_pi_credit_to_partners,
)


Expand Down
26 changes: 19 additions & 7 deletions process_report/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
import datetime
import json
import yaml
import logging
import functools

Expand All @@ -27,21 +27,33 @@ def get_invoice_bucket():
return s3_resource.Bucket(os.environ.get("S3_BUCKET_NAME", "nerc-invoicing"))


def get_institute_mapping(institute_list: list):
    """Build a ``{domain: display_name}`` dict from the institute list.

    Each entry of ``institute_list`` is a mapping with a ``display_name``
    and a ``domains`` list. If a domain appears in several entries, the
    last entry wins.
    """
    return {
        domain: entry["display_name"]
        for entry in institute_list
        for domain in entry["domains"]
    }


def get_institution_from_pi(institute_map, pi_uname):
    """Return the institution display name for a PI username.

    The text after the last "@" in ``pi_uname`` is used as the domain. If
    that domain is not a key of ``institute_map``, its leftmost label is
    dropped and the lookup retried, so ``pi@chemistry.harvard.edu`` can
    fall back to an entry for ``harvard.edu``.

    Args:
        institute_map: dict mapping a domain to an institution display name.
        pi_uname: the PI's username, normally an email address.

    Returns:
        The matching display name, or "" when no suffix of the domain is
        in the map (a warning is logged in that case).
    """
    institution_domain = pi_uname.split("@")[-1]
    institution_name = ""
    # One attempt per dot-separated label: full domain first, then each parent.
    for _ in range(institution_domain.count(".") + 1):
        if institution_name := institute_map.get(institution_domain, ""):
            break
        institution_domain = institution_domain[institution_domain.find(".") + 1 :]

    if institution_name == "":
        # Logger.warn is a deprecated alias; Logger.warning is the supported call.
        logger.warning("PI name %s does not match any institution!", pi_uname)

    return institution_name


def load_institute_list():
    """Load the institute definitions from ``process_report/institute_list.yaml``.

    The path is relative to the current working directory.

    Returns:
        The parsed YAML content, or an empty list when the file contains
        no document, so callers can always iterate the result.
    """
    # Explicit encoding keeps parsing independent of the platform's locale default.
    with open("process_report/institute_list.yaml", "r", encoding="utf-8") as f:
        institute_list = yaml.safe_load(f)

    # yaml.safe_load returns None for an empty document.
    return institute_list or []


def get_iso8601_time():
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
git+https://github.com/CCI-MOC/nerc-rates@74eb4a7#egg=nerc_rates
pandas
pyarrow
boto3

0 comments on commit bc4f599

Please sign in to comment.