Skip to content

Commit

Permalink
Merge pull request #12 from DLBD-Department/861n1umv4-create-the-brio…
Browse files Browse the repository at this point in the history
…-python-library

861n1umv4 create the brio python library
  • Loading branch information
ChristianQuaggioAlkemy authored Sep 7, 2023
2 parents 7186c4d + d771032 commit a385d39
Show file tree
Hide file tree
Showing 17 changed files with 3,615 additions and 1,536 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,8 @@ dmypy.json

# Pyre type checker
.pyre/
/bin/
/include/
/lib/
/share/
/pyvenv.cfg
38 changes: 30 additions & 8 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,30 +1,52 @@
IMAGE_NAME="brio_frontend"
VERSION=1.0
CONTAINER_NAME="brio"

.PHONY: help build test shell stop

help:
@echo "- make build Build docker image"
@echo "- make frontend Start the frontend application"
@echo "- make shell Open a shell inside docker image"
@echo "- make stop Stop the application"
@echo "- make build Build docker image"
@echo "- make frontend Start the frontend application is a container"
@echo "- make shell Open a shell inside docker container"
@echo "- make stop Stop the docker container"


.DEFAULT_GOAL := help

.PHONY: build
build:
@docker build --tag ${IMAGE_NAME}:latest --tag ${IMAGE_NAME}:${VERSION} .

@docker build \
--tag ${IMAGE_NAME}:latest \
--tag ${IMAGE_NAME}:$$(cat VERSION.txt) \
.

.PHONY: frontend
frontend: build
@docker run -dp 5000:5000 \
--name ${CONTAINER_NAME} \
--env HOST_IP=$(shell hostname -I | cut -d ' ' -f1) \
${IMAGE_NAME}

shell:
.PHONY: shell
shell:
@docker exec -it ${CONTAINER_NAME} /bin/bash

.PHONY: stop
stop:
@docker stop ${CONTAINER_NAME}
@docker rm ${CONTAINER_NAME}

.PHONY: test
test:
@python3 -m tests.unit.TestBiasDetector

.PHONY: venv
venv:
python3 -m virtualenv .
. bin/activate; pip install -Ur requirements.txt
. bin/activate; pip install -Ur requirements-dev.txt

.PHONY: clean
clean:
-rm -rf build dist
-rm -rf *.egg-info
-rm -rf bin lib share pyvenv.cfg
1 change: 1 addition & 0 deletions VERSION.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0.0.1
21 changes: 10 additions & 11 deletions brio/bias/BiasDetector.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
from itertools import chain
from itertools import combinations as itertools_combinations

class BiasDetector:

class BiasDetector:

def powerset(self, iterable):
'''
Expand All @@ -12,15 +12,14 @@ def powerset(self, iterable):
'''
"powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
s = list(iterable)
return chain.from_iterable(itertools_combinations(s, r) for r in range(len(s)+1))

return chain.from_iterable(itertools_combinations(s, r) for r in range(len(s) + 1))

def get_frequencies_list(self,
dataframe,
target_variable,
target_variable_labels,
root_variable,
root_variable_labels):
dataframe,
target_variable,
target_variable_labels,
root_variable,
root_variable_labels):
'''
This function builds a list of numpy arrays,
each of them containing the distribution target_variable | root_variable.
Expand All @@ -34,7 +33,7 @@ def get_frequencies_list(self,
abs_freq_list = []
for label in root_variable_labels:
dataframe_subset = dataframe.loc[
dataframe[root_variable]==label
dataframe[root_variable] == label
]

freq_list.append(
Expand All @@ -52,6 +51,6 @@ def get_frequencies_list(self,
)

return freq_list, abs_freq_list





58 changes: 58 additions & 0 deletions brio/bias/KLDivergence.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from scipy.stats import entropy
from itertools import combinations

class KLDivergence:
    '''
    Kullback-Leibler divergence between conditional distributions
    target_variable | root_variable.

    Each distribution is an array with one entry per label of the target
    variable; a collection of distributions has one array per category of
    the root variable.
    '''

    def __init__(self, aggregating_function=max):
        # Callable that collapses the list of pairwise divergences into a
        # single number (used when more than two distributions are compared).
        self.aggregating_function = aggregating_function

    def compute_distance_from_reference(self,
                                        observed_distribution,
                                        reference_distribution):
        '''
        Compare every observed distribution with its reference.

        observed_distribution: list of numpy arrays,
            each of them containing the distribution target_variable | root_variable,
            e.g. [ array(female_0, female_1), array(male_0, male_1) ].
            The length of the list is given by the number of categories of the root variable.
            The shape of each array is given by the number of labels of target_variable.
        reference_distribution: list of numpy arrays with the same layout,
            e.g. [ array(female_ref_0, female_ref_1), array(male_ref_0, male_ref_1) ].

        Returns a list with one divergence per (reference, observed) pair,
        taken in order. The reference is passed as `pk` and the observation
        as `qk`, i.e. each value is KL(reference || observed).
        '''
        divergences = []
        for reference, observed in zip(reference_distribution,
                                       observed_distribution):
            divergences.append(entropy(pk=reference, qk=observed))

        return divergences

    def compute_distance_between_frequencies(self, observed_distribution):
        '''
        Compare the observed distributions with each other.

        observed_distribution: list of numpy arrays,
            each of them containing the distribution target_variable | root_variable,
            e.g. [ array(female_0, female_1), array(male_0, male_1) ].
            The length of the list is given by the number of categories of the root variable.
            The shape of each array is given by the number of labels of target_variable.

        It works for any number of labels of the target variable and any
        number of classes for the root variable: for every unordered pair of
        distributions the two directed divergences are averaged, and the
        resulting list is reduced with self.aggregating_function.
        '''
        # KL is not symmetric, so each pair contributes the mean of both directions.
        symmetric_divergences = [
            (entropy(pk=first, qk=second) + entropy(pk=second, qk=first)) / 2
            for first, second in combinations(observed_distribution, 2)
        ]

        return self.aggregating_function(symmetric_divergences)
59 changes: 59 additions & 0 deletions brio/bias/TotalVariationDistance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from itertools import combinations

class TotalVariationDistance:
    '''
    Distance between conditional distributions target_variable | root_variable,
    measured as the largest absolute difference between the probabilities
    assigned to the same label.
    '''

    def __init__(self, aggregating_function=max):
        # Callable that collapses the list of pairwise distances into a
        # single number (used when more than two distributions are compared).
        self.aggregating_function = aggregating_function

    def compute_distance_from_reference(self,
                                        observed_distribution,
                                        reference_distribution):
        '''
        Compare every observed distribution with its reference.

        observed_distribution: list of numpy arrays,
            each of them containing the distribution target_variable | root_variable,
            e.g. [ array(female_0, female_1), array(male_0, male_1) ].
            The length of the list is given by the number of categories of the root variable.
            The shape of each array is given by the number of labels of target_variable.
        reference_distribution: list of numpy arrays with the same layout,
            e.g. [ array(female_ref_0, female_ref_1), array(male_ref_0, male_ref_1) ].

        Returns a list with one distance per (reference, observed) pair,
        taken in order.
        '''
        distances = []
        for reference, observed in zip(reference_distribution,
                                       observed_distribution):
            distances.append(max(abs(reference - observed)))

        return distances

    def compute_distance_between_frequencies(self, observed_distribution):
        '''
        Compare the observed distributions with each other.

        observed_distribution: list of numpy arrays,
            each of them containing the distribution target_variable | root_variable,
            e.g. [ array(female_0, female_1), array(male_0, male_1) ].
            The length of the list is given by the number of categories of the root variable.
            The shape of each array is given by the number of labels of target_variable.

        It works for any number of labels of the target variable and any
        number of classes for the root variable: one distance is computed for
        every unordered pair of distributions and the resulting list is
        reduced with self.aggregating_function.
        It breaks if aggregating_function=stdev when the root_variable is
        binary (we would have a stdev of a single number).
        '''
        pairwise_distances = []
        for first, second in combinations(observed_distribution, 2):
            # Largest per-label gap between the two distributions.
            pairwise_distances.append(max(abs(first - second)))

        return self.aggregating_function(pairwise_distances)
132 changes: 132 additions & 0 deletions brio/data_processing/Preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler


class Preprocessing:
    '''
    Loading and feature preparation for a tabular classification dataset
    whose columns follow the x1_limit_bal ... x23_pay_amt6 naming and whose
    label is stored in the y_default_payment_next_month column.
    '''

    # Columns treated as categorical (mode-imputed, optionally one-hot encoded).
    CATEGORICAL_COLS = (
        'x2_sex',
        'x3_education',
        'x4_marriage',
        'x6_pay_0',
        'x7_pay_2',
        'x8_pay_3',
        'x9_pay_4',
        'x10_pay_5',
        'x11_pay_6',
    )

    # Columns treated as numerical (mean-imputed, optionally standardised).
    NUMERICAL_COLS = (
        'x1_limit_bal',
        'x5_age',
        'x12_bill_amt1',
        'x13_bill_amt2',
        'x14_bill_amt3',
        'x15_bill_amt4',
        'x16_bill_amt5',
        'x17_bill_amt6',
        'x18_pay_amt1',
        'x19_pay_amt2',
        'x20_pay_amt3',
        'x21_pay_amt4',
        'x22_pay_amt5',
        'x23_pay_amt6',
    )

    def __init__(self, input_data_path, target_classification):
        '''
        input_data_path: location of the CSV file read by read_dataframe.
        target_classification: name given to the label column.
        '''
        self.input_data_path = input_data_path
        self.target = target_classification

    def read_dataframe(self):
        '''
        Read the CSV at self.input_data_path and split it into features and label.

        Returns (X, Y): X is the dataframe without the self.target column,
        Y is a Series of 0/1 integers named self.target.
        '''
        df = pd.read_csv(self.input_data_path)
        df[self.target] = df['y_default_payment_next_month']

        # Compare by value rather than identity: the previous `y is True`
        # test mapped every row to 0 when the label is stored as 0/1 integers.
        # Boolean labels are encoded exactly as before.
        Y = df[self.target].eq(True).astype(int)

        # NOTE(review): when self.target differs from
        # 'y_default_payment_next_month' the raw label column remains in X.
        # Confirm that callers do not feed it to a model.
        X = df.drop(columns=[self.target])

        return X, Y

    @staticmethod
    def _new_one_hot_encoder():
        '''Create a dense-output OneHotEncoder on both old and new scikit-learn.'''
        try:
            # scikit-learn >= 1.2 spelling; the `sparse` keyword was removed in 1.4.
            return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        except TypeError:
            # scikit-learn < 1.2 only knows the older keyword.
            return OneHotEncoder(handle_unknown='ignore', sparse=False)

    def preprocess_for_classification(self,
                                      df,
                                      fit_ohe=False,
                                      fitted_ohe=None,
                                      perform_scaling=False,
                                      fitted_scaler=None):
        '''
        Impute missing values and optionally scale / one-hot encode the features.

        df: dataframe containing the categorical and numerical feature columns.
        fit_ohe: when True the categorical columns are one-hot encoded.
        fitted_ohe: encoder to reuse; a new one is fitted on df when it is None.
        perform_scaling: when True the numerical columns are standardised.
        fitted_scaler: scaler to reuse; a new one is fitted on df when it is None.

        Returns (X, fitted_ohe, fitted_scaler). The encoder / scaler are
        returned as received (possibly None) when the corresponding step is
        not requested. The input dataframe is not modified.
        '''
        # Plain lists are required: a tuple would be read as a single column key.
        categorical_cols = list(self.CATEGORICAL_COLS)
        numerical_cols = list(self.NUMERICAL_COLS)

        # Imputation values are computed from the dataframe passed in.
        numeric_means = df[numerical_cols].mean()
        categ_modes = df[categorical_cols].mode().iloc[0]

        df = df.fillna(numeric_means).fillna(categ_modes)
        X = df

        if perform_scaling:
            if fitted_scaler is None:
                fitted_scaler = StandardScaler().fit(df[numerical_cols])
            scaled = fitted_scaler.transform(df[numerical_cols])
            scaled_df = pd.DataFrame(
                scaled,
                columns=fitted_scaler.get_feature_names_out(input_features=numerical_cols),
                index=df.index,
            )
            # Only the categorical and scaled numerical columns are kept from here on.
            df = pd.concat([df[categorical_cols], scaled_df], axis=1)
            X = df

        if fit_ohe:
            if fitted_ohe is None:
                fitted_ohe = self._new_one_hot_encoder().fit(df[categorical_cols])

            cat_ohe = fitted_ohe.transform(df[categorical_cols])
            ohe_df = pd.DataFrame(
                cat_ohe,
                columns=fitted_ohe.get_feature_names_out(input_features=categorical_cols),
                index=df.index,
            )
            # Numerical columns (scaled if requested above) plus the encoded categoricals.
            X = pd.concat([df[numerical_cols], ohe_df], axis=1)

        return X, fitted_ohe, fitted_scaler


















































1 change: 1 addition & 0 deletions frontend/app.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from flask import Flask, render_template
from flask_cors import CORS
import os

from frontend.views import bias, opacity

app = Flask(__name__)
Expand Down
Loading

0 comments on commit a385d39

Please sign in to comment.