Skip to content

Commit

Permalink
Initial refactoring of process_report
Browse files Browse the repository at this point in the history
This refactor commit is the first of a few, laying out the initial structure

A new submodule, `invoices`, is added, containing a base class `Invoice` which is inherited by all other invoices.
Currently, only the Lenovo and nonbillable invoices have classes that inherit from `Invoice`
Also created and partially populated a `util.py` file,
containing functions placed above `main()` in `process_report.py`
After the refactoring process is fully complete,
these utility functions will be completely removed from `process_report.py`
  • Loading branch information
QuanMPhm committed Jun 6, 2024
1 parent 0400557 commit 537ab88
Show file tree
Hide file tree
Showing 6 changed files with 188 additions and 56 deletions.
78 changes: 78 additions & 0 deletions process_report/invoices/invoice.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from dataclasses import dataclass
import pandas

import process_report.util as util


### Invoice field names
# String constants naming the columns of an invoice DataFrame. Invoice
# subclasses reference them as `invoice.<NAME>` when selecting or filtering
# columns, so the literal header text lives in one place.
INVOICE_DATE_FIELD = "Invoice Month"
PROJECT_FIELD = "Project - Allocation"
PROJECT_ID_FIELD = "Project - Allocation ID"
PI_FIELD = "Manager (PI)"
INVOICE_EMAIL_FIELD = "Invoice Email"
INVOICE_ADDRESS_FIELD = "Invoice Address"
INSTITUTION_FIELD = "Institution"
INSTITUTION_ID_FIELD = "Institution - Specific Code"
SU_HOURS_FIELD = "SU Hours (GBhr or SUhr)"
SU_TYPE_FIELD = "SU Type"
COST_FIELD = "Cost"
CREDIT_FIELD = "Credit"
CREDIT_CODE_FIELD = "Credit Code"
SUBSIDY_FIELD = "Subsidy"
BALANCE_FIELD = "Balance"
###


@dataclass
class Invoice:
    """Base class for an invoice backed by a pandas DataFrame.

    Subclasses customise behaviour by overriding the three hooks
    ``_prepare``, ``_process`` and ``_prepare_export``; ``process`` runs
    them in that order. The base implementations do nothing, so a plain
    ``Invoice`` exports its data unchanged.
    """

    # Base name of the invoice; used in the output file name and S3 keys.
    name: str
    # Invoice month string; used in the output file name and S3 keys.
    invoice_month: str
    # The invoice rows. Hooks may modify or replace this DataFrame.
    data: pandas.DataFrame

    @property
    def output_path(self) -> str:
        """Local CSV file name: ``"<name> <invoice_month>.csv"``."""
        return f"{self.name} {self.invoice_month}.csv"

    @property
    def output_s3_key(self) -> str:
        """S3 key of the current copy of this invoice."""
        return f"Invoices/{self.invoice_month}/{self.name} {self.invoice_month}.csv"

    @property
    def output_s3_archive_key(self):
        """S3 key of the archived copy.

        A timestamp from ``util.get_iso8601_time()`` is embedded, so each
        access may yield a different key.
        """
        return f"Invoices/{self.invoice_month}/Archive/{self.name} {self.invoice_month} {util.get_iso8601_time()}.csv"

    def process(self):
        """Run the prepare, process and prepare-export hooks, in order."""
        for stage in (self._prepare, self._process, self._prepare_export):
            stage()

    def _prepare(self):
        """Hook: get the data ready for processing.

        Override in a subclass when needed, e.g. to add or drop columns
        or rows, validate the data, or perform simple substitutions.
        """

    def _process(self):
        """Hook: perform the invoice calculations.

        Override in a subclass when needed, e.g. to apply subsidies or
        credits.
        """

    def _prepare_export(self):
        """Hook: shape the data for export.

        Override in a subclass when needed, e.g. to add or drop columns
        or rows that should or should not appear in the exported file.
        """

    def export(self):
        """Write ``data`` as CSV to ``output_path``."""
        self.data.to_csv(self.output_path)

    def export_s3(self, s3_bucket):
        """Upload the exported CSV to its current and archive S3 keys.

        ``s3_bucket`` must provide ``upload_file(local_path, key)``. The
        local file at ``output_path`` is expected to exist already
        (see ``export``).
        """
        local_file = self.output_path
        s3_bucket.upload_file(local_file, self.output_s3_key)
        s3_bucket.upload_file(local_file, self.output_s3_archive_key)
28 changes: 28 additions & 0 deletions process_report/invoices/lenovo_invoice.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from dataclasses import dataclass

import process_report.invoices.invoice as invoice


@dataclass
class LenovoInvoice(invoice.Invoice):
    """Invoice restricted to the Lenovo SU types, with a per-row charge."""

    # SU types whose rows are kept in this invoice.
    LENOVO_SU_TYPES = ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"]
    # Value written to every row of the "SU Charge" column.
    SU_CHARGE_MULTIPLIER = 1

    def _prepare(self):
        """Keep only Lenovo SU rows and the columns needed for the invoice.

        The SU hours column is renamed to "SU Hours" and an "SU Charge"
        column is appended as the last column.
        """
        kept_columns = [
            invoice.INVOICE_DATE_FIELD,
            invoice.PROJECT_FIELD,
            invoice.INSTITUTION_FIELD,
            invoice.SU_HOURS_FIELD,
            invoice.SU_TYPE_FIELD,
        ]
        is_lenovo_su = self.data[invoice.SU_TYPE_FIELD].isin(self.LENOVO_SU_TYPES)
        self.data = self.data[is_lenovo_su][kept_columns].copy()

        self.data = self.data.rename(columns={invoice.SU_HOURS_FIELD: "SU Hours"})
        # Insert at position == current column count, i.e. append at the end.
        self.data.insert(self.data.shape[1], "SU Charge", self.SU_CHARGE_MULTIPLIER)

    def _process(self):
        """Add a "Charge" column equal to "SU Hours" times "SU Charge"."""
        su_hours = self.data["SU Hours"]
        su_charge = self.data["SU Charge"]
        self.data["Charge"] = su_hours * su_charge
15 changes: 15 additions & 0 deletions process_report/invoices/nonbillable_invoice.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from dataclasses import dataclass

import process_report.invoices.invoice as invoice


@dataclass
class NonbillableInvoice(invoice.Invoice):
    """Invoice listing only the rows that should not be billed."""

    # PIs whose rows are nonbillable.
    nonbillable_pis: list[str]
    # Projects whose rows are nonbillable.
    nonbillable_projects: list[str]

    def _prepare_export(self):
        """Keep rows whose PI or project is on a nonbillable list."""
        has_nonbillable_pi = self.data[invoice.PI_FIELD].isin(self.nonbillable_pis)
        has_nonbillable_project = self.data[invoice.PROJECT_FIELD].isin(
            self.nonbillable_projects
        )
        self.data = self.data[has_nonbillable_pi | has_nonbillable_project]
57 changes: 20 additions & 37 deletions process_report/process_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
import boto3
import pyarrow

from process_report.invoices import lenovo_invoice, nonbillable_invoice


### PI file field names
PI_PI_FIELD = "PI"
Expand Down Expand Up @@ -190,7 +192,7 @@ def main():
parser.add_argument(
"--nonbillable-file",
required=False,
default="nonbillable.csv",
default="nonbillable",
help="Name of nonbillable file",
)
parser.add_argument(
Expand Down Expand Up @@ -220,7 +222,7 @@ def main():
parser.add_argument(
"--Lenovo-file",
required=False,
default="Lenovo.csv",
default="Lenovo",
help="Name of output csv for Lenovo SU Types invoice",
)
parser.add_argument(
Expand Down Expand Up @@ -278,8 +280,22 @@ def main():

merged_dataframe = validate_pi_aliases(merged_dataframe, alias_dict)
merged_dataframe = add_institution(merged_dataframe)
export_lenovo(merged_dataframe, args.Lenovo_file)
remove_billables(merged_dataframe, pi, projects, args.nonbillable_file)
lenovo_inv = lenovo_invoice.LenovoInvoice(
name=args.Lenovo_file, invoice_month=invoice_month, data=merged_dataframe.copy()
)
nonbillable_inv = nonbillable_invoice.NonbillableInvoice(
name=args.nonbillable_file,
invoice_month=invoice_month,
data=merged_dataframe.copy(),
nonbillable_pis=pi,
nonbillable_projects=projects,
)
for invoice in [lenovo_inv, nonbillable_inv]:
invoice.process()
invoice.export()
if args.upload_to_s3:
bucket = get_invoice_bucket()
invoice.export_s3(bucket)

billable_projects = remove_non_billables(merged_dataframe, pi, projects)
billable_projects = validate_pi_names(billable_projects)
Expand All @@ -295,9 +311,7 @@ def main():

if args.upload_to_s3:
invoice_list = [
args.nonbillable_file,
args.output_file,
args.Lenovo_file,
]

for pi_invoice in os.listdir(args.output_folder):
Expand Down Expand Up @@ -371,17 +385,6 @@ def remove_non_billables(dataframe, pi, projects):
return filtered_dataframe


def remove_billables(dataframe, pi, projects, output_file):
    """Write the nonbillable rows of ``dataframe`` to ``output_file``.

    A row is kept when its PI is in ``pi`` or its project is in
    ``projects``; every other (billable) row is dropped. The result is
    written as CSV without the index column. Nothing is returned and
    ``dataframe`` itself is not modified.
    """
    has_nonbillable_pi = dataframe[PI_FIELD].isin(pi)
    has_nonbillable_project = dataframe[PROJECT_FIELD].isin(projects)
    nonbillable_rows = dataframe[has_nonbillable_pi | has_nonbillable_project]
    nonbillable_rows.to_csv(output_file, index=False)


def validate_pi_names(dataframe):
invalid_pi_projects = dataframe[pandas.isna(dataframe[PI_FIELD])]
for i, row in invalid_pi_projects.iterrows():
Expand Down Expand Up @@ -614,26 +617,6 @@ def export_HU_BU(dataframe, output_file):
HU_BU_projects.to_csv(output_file)


def export_lenovo(dataframe: pandas.DataFrame, output_file):
    """Write the Lenovo SU-type rows of ``dataframe`` to ``output_file``.

    Only rows whose SU type is one of the Lenovo types are kept, reduced
    to the invoice date, project, institution, SU hours and SU type
    columns. The SU hours column is renamed to "SU Hours", and "SU Charge"
    and "Charge" (hours times charge) columns are appended before the
    result is written as CSV. ``dataframe`` itself is not modified.
    """
    LENOVO_SU_TYPES = ["OpenShift GPUA100SXM4", "OpenStack GPUA100SXM4"]
    SU_CHARGE_MULTIPLIER = 1

    kept_columns = [
        INVOICE_DATE_FIELD,
        PROJECT_FIELD,
        INSTITUTION_FIELD,
        SU_HOURS_FIELD,
        SU_TYPE_FIELD,
    ]
    is_lenovo_su = dataframe[SU_TYPE_FIELD].isin(LENOVO_SU_TYPES)
    lenovo_df = dataframe[is_lenovo_su][kept_columns].copy()

    lenovo_df = lenovo_df.rename(columns={SU_HOURS_FIELD: "SU Hours"})
    # Insert at position == current column count, i.e. append at the end.
    lenovo_df.insert(lenovo_df.shape[1], "SU Charge", SU_CHARGE_MULTIPLIER)

    su_hours = lenovo_df["SU Hours"]
    su_charge = lenovo_df["SU Charge"]
    lenovo_df["Charge"] = su_hours * su_charge
    lenovo_df.to_csv(output_file)


def upload_to_s3(invoice_list: list, invoice_month):
invoice_bucket = get_invoice_bucket()
for invoice_filename in invoice_list:
Expand Down
31 changes: 12 additions & 19 deletions process_report/tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from textwrap import dedent

from process_report import process_report
from process_report.invoices import lenovo_invoice, nonbillable_invoice


class TestGetInvoiceDate(TestCase):
Expand Down Expand Up @@ -73,6 +74,9 @@ def setUp(self):

self.pi_to_exclude = ["PI2", "PI3"]
self.projects_to_exclude = ["ProjectB", "ProjectD"]
self.nonbillable_invoice = nonbillable_invoice.NonbillableInvoice(
"Foo", "Foo", self.dataframe, self.pi_to_exclude, self.projects_to_exclude
)

self.output_file = tempfile.NamedTemporaryFile(delete=False)
self.output_file2 = tempfile.NamedTemporaryFile(delete=False)
Expand Down Expand Up @@ -106,14 +110,8 @@ def test_remove_non_billables(self):
self.assertIn("ProjectE", result_df["Project - Allocation"].tolist())

def test_remove_billables(self):
process_report.remove_billables(
self.dataframe,
self.pi_to_exclude,
self.projects_to_exclude,
self.output_file2.name,
)

result_df = pandas.read_csv(self.output_file2.name)
self.nonbillable_invoice.process()
result_df = self.nonbillable_invoice.data

self.assertIn("PI2", result_df["Manager (PI)"].tolist())
self.assertIn("PI3", result_df["Manager (PI)"].tolist())
Expand Down Expand Up @@ -754,18 +752,13 @@ def setUp(self):
"OpenStack GPUK80",
],
}
self.dataframe = pandas.DataFrame(data)

output_file = tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=".csv")
self.output_file = output_file.name

def tearDown(self):
os.remove(self.output_file)

def test_apply_credit_0002(self):
process_report.export_lenovo(self.dataframe, self.output_file)
output_df = pandas.read_csv(self.output_file)
self.lenovo_invoice = lenovo_invoice.LenovoInvoice(
"Lenovo", "2023-01", pandas.DataFrame(data)
)
self.lenovo_invoice.process()

def test_process_lenovo(self):
output_df = self.lenovo_invoice.data
self.assertTrue(
set(
[
Expand Down
35 changes: 35 additions & 0 deletions process_report/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import datetime
import json
import logging


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# NOTE: this configures the root logger as a side effect of importing the
# module. basicConfig does nothing if the root logger already has handlers.
logging.basicConfig(level=logging.INFO)


def get_institution_from_pi(institute_map, pi_uname):
    """Return the institution name for a PI username.

    The lookup key is the part of ``pi_uname`` after the last "@"
    (the whole string when it contains no "@").

    Args:
        institute_map: Mapping from lookup key to institution name.
        pi_uname: PI username, e.g. ``"user@bu.edu"``.

    Returns:
        The mapped institution name, or ``""`` when the key is missing
        (or maps to an empty string); a warning is logged in that case.
    """
    institution_key = pi_uname.split("@")[-1]
    institution_name = institute_map.get(institution_key, "")

    if institution_name == "":
        # Logger.warn is a deprecated alias; warning() is the supported
        # spelling. Lazy %-args produce the same message text.
        logger.warning("PI name %s does not match any institution!", pi_uname)

    return institution_name


def load_institute_map() -> dict:
    """Load and return the institute map from its JSON file.

    The file is read from ``process_report/institute_map.json``, resolved
    relative to the current working directory.

    Returns:
        The parsed JSON content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open("process_report/institute_map.json", "r", encoding="utf-8") as f:
        institute_map = json.load(f)

    return institute_map


def get_iso8601_time():
    """Return the current UTC time as a compact ISO 8601 string.

    The format is ``YYYYMMDDTHHMMSSZ``. The trailing "Z" designates UTC,
    so the time is taken from an aware UTC clock rather than the naive
    local clock, which would mislabel local time as UTC.
    """
    return datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%dT%H%M%SZ")


def compare_invoice_month(month_1, month_2):
    """Return True if ``month_1`` is strictly later than ``month_2``.

    Both arguments are ``"YYYY-MM"`` strings; a string in any other
    format raises ``ValueError``.
    """
    month_format = "%Y-%m"
    first, second = (
        datetime.datetime.strptime(month, month_format)
        for month in (month_1, month_2)
    )
    return first > second

0 comments on commit 537ab88

Please sign in to comment.