-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #12 from DLBD-Department/861n1umv4-create-the-brio…
…-python-library 861n1umv4 create the brio python library
- Loading branch information
Showing
17 changed files
with
3,615 additions
and
1,536 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -140,3 +140,8 @@ dmypy.json | |
|
||
# Pyre type checker | ||
.pyre/ | ||
/bin/ | ||
/include/ | ||
/lib/ | ||
/share/ | ||
/pyvenv.cfg |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,30 +1,52 @@ | ||
# Docker image / container identifiers used by the targets below.
IMAGE_NAME="brio_frontend"
# NOTE: the image version tag is read from VERSION.txt in the build target;
# this variable is kept so existing command-line overrides keep working.
VERSION=1.0
CONTAINER_NAME="brio"

.PHONY: help build test shell stop

# Print a short description of every available target.
help:
	@echo "- make build      Build docker image"
	@echo "- make frontend   Start the frontend application in a container"
	@echo "- make shell      Open a shell inside docker container"
	@echo "- make stop       Stop the docker container"
	@echo "- make test       Run the unit tests"
	@echo "- make venv       Create a virtualenv in the repository root and install requirements"
	@echo "- make clean      Remove build artifacts and the virtualenv"

.DEFAULT_GOAL := help

# Build the image and tag it both as latest and with the content of VERSION.txt.
.PHONY: build
build:
	@docker build \
		--tag ${IMAGE_NAME}:latest \
		--tag ${IMAGE_NAME}:$$(cat VERSION.txt) \
		.

# Rebuild the image, then start it detached with port 5000 published and the
# first address reported by `hostname -I` exported as HOST_IP.
.PHONY: frontend
frontend: build
	@docker run -dp 5000:5000 \
		--name ${CONTAINER_NAME} \
		--env HOST_IP=$(shell hostname -I | cut -d ' ' -f1) \
		${IMAGE_NAME}

# Open an interactive bash session in the running container.
.PHONY: shell
shell:
	@docker exec -it ${CONTAINER_NAME} /bin/bash

# Stop the container and remove it so that `make frontend` can reuse the name.
.PHONY: stop
stop:
	@docker stop ${CONTAINER_NAME}
	@docker rm ${CONTAINER_NAME}

.PHONY: test
test:
	@python3 -m tests.unit.TestBiasDetector

# Create the virtualenv directly in the repository root (bin/, include/, lib/, ...).
.PHONY: venv
venv:
	python3 -m virtualenv .
	. bin/activate; pip install -Ur requirements.txt
	. bin/activate; pip install -Ur requirements-dev.txt

# The leading '-' lets make continue when a path does not exist.
.PHONY: clean
clean:
	-rm -rf build dist
	-rm -rf *.egg-info
	-rm -rf bin include lib share pyvenv.cfg
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
0.0.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
from scipy.stats import entropy | ||
from itertools import combinations | ||
|
||
class KLDivergence:
    """Kullback-Leibler divergences between conditional distributions.

    Every distribution handled here is ``target_variable | root_variable``
    for one category of the root variable, and is passed straight to
    ``scipy.stats.entropy``.
    """

    def __init__(self, aggregating_function=max):
        # Callable that collapses the list of pairwise divergences into a
        # single number when several root-variable categories are compared.
        self.aggregating_function = aggregating_function

    def compute_distance_from_reference(self,
                                        observed_distribution,
                                        reference_distribution):
        """Return one divergence per (reference, observed) pair.

        observed_distribution: list of numpy arrays, one per category of the
            root variable, e.g. [array(female_0, female_1),
            array(male_0, male_1)]. Each array has one entry per label of the
            target variable.
        reference_distribution: list of numpy arrays laid out like
            observed_distribution, holding the reference values, e.g.
            [array(female_ref_0, female_ref_1), array(male_ref_0, male_ref_1)].

        The two lists are walked in lockstep with ``zip`` and each pair is
        evaluated as ``entropy(pk=reference, qk=observed)``. The result is a
        list in the same order as the inputs.
        """
        divergences = []
        for reference, observed in zip(reference_distribution,
                                       observed_distribution):
            divergences.append(entropy(pk=reference, qk=observed))
        return divergences

    def compute_distance_between_frequencies(self, observed_distribution):
        """Return the aggregated symmetric divergence over all pairs.

        observed_distribution: list of numpy arrays, one per category of the
            root variable, e.g. [array(female_0, female_1),
            array(male_0, male_1)]. Each array has one entry per label of the
            target variable.

        For every unordered pair of distributions the divergence is computed
        in both directions and averaged; the list of these values is then
        reduced with ``self.aggregating_function``.
        """
        symmetric_divergences = [
            (entropy(pk=first, qk=second) + entropy(pk=second, qk=first)) / 2
            for first, second in combinations(observed_distribution, 2)
        ]
        return self.aggregating_function(symmetric_divergences)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
from itertools import combinations | ||
|
||
class TotalVariationDistance:
    """Distances based on the largest per-label probability gap.

    Every distribution handled here is ``target_variable | root_variable``
    for one category of the root variable.

    NOTE(review): each distance is computed as max(|p - q|) over the labels.
    With two labels this coincides with the total variation distance; with
    more labels it is the largest single-label gap -- confirm this is intended.
    """

    def __init__(self, aggregating_function=max):
        # Callable that collapses the list of pairwise distances into a
        # single number when several root-variable categories are compared.
        self.aggregating_function = aggregating_function

    def compute_distance_from_reference(self,
                                        observed_distribution,
                                        reference_distribution):
        """Return one distance per (reference, observed) pair.

        observed_distribution: list of numpy arrays, one per category of the
            root variable, e.g. [array(female_0, female_1),
            array(male_0, male_1)]. Each array has one entry per label of the
            target variable.
        reference_distribution: list of numpy arrays laid out like
            observed_distribution, holding the reference values, e.g.
            [array(female_ref_0, female_ref_1), array(male_ref_0, male_ref_1)].

        The two lists are walked in lockstep with ``zip``; the result is a
        list in the same order as the inputs.
        """
        distances = []
        for reference, observed in zip(reference_distribution,
                                       observed_distribution):
            gaps = abs(reference - observed)
            distances.append(max(gaps))
        return distances

    def compute_distance_between_frequencies(self, observed_distribution):
        """Return the aggregated distance over all pairs of distributions.

        observed_distribution: list of numpy arrays, one per category of the
            root variable, e.g. [array(female_0, female_1),
            array(male_0, male_1)]. Each array has one entry per label of the
            target variable.

        One distance is computed for every unordered pair of distributions
        and the resulting list is reduced with ``self.aggregating_function``.
        With a binary root variable that list holds a single value, so an
        aggregating function such as a standard deviation, which needs at
        least two values, fails.
        """
        pairwise_distances = []
        for first, second in combinations(observed_distribution, 2):
            pairwise_distances.append(max(abs(first - second)))
        return self.aggregating_function(pairwise_distances)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
import pandas as pd | ||
from sklearn.preprocessing import OneHotEncoder, StandardScaler | ||
|
||
|
||
class Preprocessing:
    """Load the credit-default CSV and prepare its features for a classifier."""

    # Name of the label column expected in the input CSV.
    SOURCE_TARGET_COLUMN = 'y_default_payment_next_month'

    # Columns that are one-hot encoded.
    CATEGORICAL_COLS = (
        'x2_sex',
        'x3_education',
        'x4_marriage',
        'x6_pay_0',
        'x7_pay_2',
        'x8_pay_3',
        'x9_pay_4',
        'x10_pay_5',
        'x11_pay_6',
    )

    # Columns that are mean-imputed and optionally standardised.
    NUMERICAL_COLS = (
        'x1_limit_bal',
        'x5_age',
        'x12_bill_amt1',
        'x13_bill_amt2',
        'x14_bill_amt3',
        'x15_bill_amt4',
        'x16_bill_amt5',
        'x17_bill_amt6',
        'x18_pay_amt1',
        'x19_pay_amt2',
        'x20_pay_amt3',
        'x21_pay_amt4',
        'x22_pay_amt5',
        'x23_pay_amt6',
    )

    def __init__(self, input_data_path, target_classification):
        """Store the CSV location and the name to give the target column.

        input_data_path: path (or anything ``pandas.read_csv`` accepts) of
            the input CSV.
        target_classification: name of the column under which the label is
            stored after loading.
        """
        self.input_data_path = input_data_path
        self.target = target_classification

    @staticmethod
    def _to_binary_label(value):
        """Map a raw label to 1 (positive) or 0 (anything else)."""
        if value is True:
            return 1
        try:
            # Equality instead of identity, so that integer / float / numpy
            # encodings of the positive class (1, 1.0, numpy.True_) count too.
            return 1 if value == 1 else 0
        except (TypeError, ValueError):
            # Values whose comparison has no truth value (e.g. pandas.NA).
            return 0

    def read_dataframe(self):
        """Read the CSV and split it into features and a 0/1 target.

        Returns:
            (X, Y): ``X`` is the loaded frame without the ``self.target``
            column; ``Y`` is a Series of 0/1 integers built from the
            ``y_default_payment_next_month`` column.
        """
        df = pd.read_csv(self.input_data_path)
        df[self.target] = df[self.SOURCE_TARGET_COLUMN]

        Y = df[self.target].apply(self._to_binary_label)
        # NOTE(review): when self.target differs from SOURCE_TARGET_COLUMN the
        # raw label column is still part of X. Kept as-is so the returned
        # columns do not change -- confirm whether it should be dropped too.
        X = df.drop(columns=[self.target])

        return X, Y

    def preprocess_for_classification(self,
                                      df,
                                      fit_ohe=False,
                                      fitted_ohe=None,
                                      perform_scaling=False,
                                      fitted_scaler=None):
        """Impute missing values and optionally scale / one-hot encode.

        df: DataFrame containing every column listed in CATEGORICAL_COLS and
            NUMERICAL_COLS.
        fit_ohe: when True the categorical columns are one-hot encoded; a new
            encoder is fitted only if ``fitted_ohe`` is None.
        fitted_ohe: already fitted OneHotEncoder to reuse, or None.
        perform_scaling: when True the numerical columns are standardised; a
            new scaler is fitted only if ``fitted_scaler`` is None.
        fitted_scaler: already fitted StandardScaler to reuse, or None.

        Returns:
            (X, fitted_ohe, fitted_scaler): the processed frame followed by
            the encoder and scaler that were used (None when the
            corresponding step was skipped and none was supplied).
        """
        categorical_cols = list(self.CATEGORICAL_COLS)
        numerical_cols = list(self.NUMERICAL_COLS)

        # Numerical gaps get the column mean, categorical gaps the column mode.
        numeric_means = df[numerical_cols].mean()
        categ_modes = df[categorical_cols].mode().iloc[0]
        df = df.fillna(numeric_means).fillna(categ_modes)
        X = df

        if perform_scaling:
            if fitted_scaler is None:
                fitted_scaler = StandardScaler().fit(df[numerical_cols])
            scaled = fitted_scaler.transform(df[numerical_cols])
            scaled_df = pd.DataFrame(
                scaled,
                columns=fitted_scaler.get_feature_names_out(
                    input_features=numerical_cols),
                index=df.index)
            # From here on only the categorical and scaled numerical columns
            # are kept; any other column of the input frame is dropped.
            df = pd.concat([df[categorical_cols], scaled_df], axis=1)
            X = df

        if fit_ohe:
            if fitted_ohe is None:
                try:
                    encoder = OneHotEncoder(handle_unknown='ignore',
                                            sparse_output=False)
                except TypeError:
                    # scikit-learn < 1.2 only accepts the older keyword.
                    encoder = OneHotEncoder(handle_unknown='ignore',
                                            sparse=False)
                fitted_ohe = encoder.fit(df[categorical_cols])

            cat_ohe = fitted_ohe.transform(df[categorical_cols])
            ohe_df = pd.DataFrame(
                cat_ohe,
                columns=fitted_ohe.get_feature_names_out(
                    input_features=categorical_cols),
                index=df.index)
            X = pd.concat([df[numerical_cols], ohe_df], axis=1)

        return X, fitted_ohe, fitted_scaler
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.