Merge pull request #12 from DLBD-Department/861n1umv4-create-the-brio…

…-python-library 861n1umv4 create the brio python library
DLBD-Department · Sep 7, 2023 · a385d39 · a385d39
2 parents 7186c4d + d771032
commit a385d39
Show file tree

Hide file tree

Showing 17 changed files with 3,615 additions and 1,536 deletions.
diff --git a/.gitignore b/.gitignore
@@ -140,3 +140,8 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+/bin/
+/include/
+/lib/
+/share/
+/pyvenv.cfg
diff --git a/Makefile b/Makefile
@@ -1,30 +1,59 @@
 IMAGE_NAME="brio_frontend"
-VERSION=1.0
 CONTAINER_NAME="brio"
 
 .PHONY: help build test shell stop
 
 help:
-	@echo "- make build                 Build docker image"
-	@echo "- make frontend              Start the frontend application"
-	@echo "- make shell		    Open a shell inside docker image"
-	@echo "- make stop		    Stop the application"	
+	@echo "- make build         Build docker image"
+	@echo "- make frontend      Start the frontend application in a container"
+	@echo "- make shell         Open a shell inside docker container"
+	@echo "- make stop          Stop and remove the docker container"
+	@echo "- make test          Run the BiasDetector unit tests"
+	@echo "- make venv          Create a virtualenv in this directory and install requirements"
+	@echo "- make clean         Remove build artifacts and the virtualenv"
 
 
 .DEFAULT_GOAL := help
 
+# Tag the image both as `latest` and with the version stored in VERSION.txt.
+.PHONY: build
 build:
-	@docker build --tag ${IMAGE_NAME}:latest --tag ${IMAGE_NAME}:${VERSION} .
-
+	@docker build \
+		--tag ${IMAGE_NAME}:latest \
+		--tag ${IMAGE_NAME}:$$(cat VERSION.txt) \
+		.
 
+.PHONY: frontend
 frontend: build
 	@docker run -dp 5000:5000 \
 		--name ${CONTAINER_NAME} \
 		--env HOST_IP=$(shell hostname -I | cut -d ' ' -f1) \
 		${IMAGE_NAME}
 
-shell: 
+.PHONY: shell
+shell:
 	@docker exec -it ${CONTAINER_NAME} /bin/bash
+
+.PHONY: stop
 stop:
 	@docker stop ${CONTAINER_NAME}
 	@docker rm ${CONTAINER_NAME}
+
+.PHONY: test
+test:
+	@python3 -m tests.unit.TestBiasDetector
+
+# The virtualenv is created in the repository root itself.
+# `&&` makes pip run only after the activation script was sourced successfully.
+.PHONY: venv
+venv:
+	python3 -m virtualenv .
+	. bin/activate && pip install -Ur requirements.txt
+	. bin/activate && pip install -Ur requirements-dev.txt
+
+# Leading `-` keeps the cleanup best-effort: a failing rm does not abort the rest.
+.PHONY: clean
+clean:
+	-rm -rf build dist
+	-rm -rf *.egg-info
+	-rm -rf bin include lib share pyvenv.cfg
diff --git a/VERSION.txt b/VERSION.txt
@@ -0,0 +1 @@
+0.0.1
diff --git a/brio/bias/BiasDetector.py b/brio/bias/BiasDetector.py
@@ -2,8 +2,8 @@
 from itertools import chain
 from itertools import combinations as itertools_combinations
 
-class BiasDetector:
 
+class BiasDetector:
 
     def powerset(self, iterable):
         '''
@@ -12,15 +12,14 @@ def powerset(self, iterable):
         '''
         "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
         s = list(iterable)
-        return chain.from_iterable(itertools_combinations(s, r) for r in range(len(s)+1))
-
+        return chain.from_iterable(itertools_combinations(s, r) for r in range(len(s) + 1))
 
     def get_frequencies_list(self,
-            dataframe,
-            target_variable,
-            target_variable_labels,
-            root_variable,
-            root_variable_labels):
+                             dataframe,
+                             target_variable,
+                             target_variable_labels,
+                             root_variable,
+                             root_variable_labels):
         '''
         This function builds a list of numpy arrays, 
         each of them containing the distribution target_variable | root_variable. 
@@ -34,7 +33,7 @@ def get_frequencies_list(self,
         abs_freq_list = []
         for label in root_variable_labels:
             dataframe_subset = dataframe.loc[
-                dataframe[root_variable]==label
+                dataframe[root_variable] == label
             ]
 
             freq_list.append(
@@ -52,6 +51,6 @@ def get_frequencies_list(self,
             )
 
         return freq_list, abs_freq_list
-
 
-
+
+
diff --git a/brio/bias/KLDivergence.py b/brio/bias/KLDivergence.py
@@ -0,0 +1,58 @@
+from scipy.stats import entropy
+from itertools import combinations
+
+class KLDivergence:
+
+    def __init__(self, 
+        aggregating_function=max):
+
+        # function needed to aggregate distances for multi-class comparisons
+        self.aggregating_function = aggregating_function
+
+
+    def compute_distance_from_reference(self, 
+            observed_distribution,
+            reference_distribution):
+        '''
+        observed_distribution: list of numpy arrays, 
+            each of them containing the distribution target_variable | root_variable. 
+            e.g. [ array(female_0, female_1), array(male_0, male_1) ]  
+            The length of the list is given by the number of categories of the root variable.
+            The shape of each array is given by the number of labels of target_variable.
+        reference_distribution: list of numpy arrays, 
+            each of them containing a reference distribution target_variable | root_variable. 
+            e.g. [ array(female_ref_0, female_ref_1), array(male_ref_0, male_ref_1) ]  
+            The length of the list is given by the number of categories of the root variable.
+            The shape of each array is given by the number of labels of target_variable.
+        '''
+
+        divergences = [
+                entropy(pk=ref, qk=obs) for ref, obs in zip(
+                    reference_distribution, observed_distribution
+                    )
+                ]
+
+        return divergences
+
+
+    def compute_distance_between_frequencies(self, 
+            observed_distribution):
+        '''
+        observed_distribution: list of numpy arrays, 
+            each of them containing the distribution target_variable | root_variable. 
+            e.g. [ array(female_0, female_1), array(male_0, male_1) ]  
+            The length of the list is given by the number of categories of the root variable.
+            The shape of each array is given by the number of labels of target_variable.
+            
+        It works for any number of labels of the target variable and any number of classes for the root variable. 
+        The final distance is given by self.aggregating_function. 
+        '''
+
+        divergences = []
+        for pair in combinations(observed_distribution, 2):
+            divergence = ( entropy(pk=pair[0], qk=pair[1]) + entropy(pk=pair[1], qk=pair[0]) )/2
+            divergences.append(divergence)
+
+        divergence = self.aggregating_function(divergences)
+
+        return divergence
diff --git a/brio/bias/TotalVariationDistance.py b/brio/bias/TotalVariationDistance.py
@@ -0,0 +1,59 @@
+from itertools import combinations
+
+class TotalVariationDistance:
+
+    def __init__(self, 
+        aggregating_function=max):
+
+        # function needed to aggregate distances for multi-class comparisons
+        self.aggregating_function = aggregating_function
+
+
+    def compute_distance_from_reference(self, 
+            observed_distribution,
+            reference_distribution):
+        '''
+        observed_distribution: list of numpy arrays, 
+            each of them containing the distribution target_variable | root_variable. 
+            e.g. [ array(female_0, female_1), array(male_0, male_1) ]  
+            The length of the list is given by the number of categories of the root variable.
+            The shape of each array is given by the number of labels of target_variable.
+        reference_distribution: list of numpy arrays, 
+            each of them containing a reference distribution target_variable | root_variable. 
+            e.g. [ array(female_ref_0, female_ref_1), array(male_ref_0, male_ref_1) ]  
+            The length of the list is given by the number of categories of the root variable.
+            The shape of each array is given by the number of labels of target_variable.
+        '''
+
+        distances = [
+                max(abs(ref - obs)) for ref, obs in zip(
+                    reference_distribution, observed_distribution
+                    )
+                ]
+
+        return distances
+
+
+    def compute_distance_between_frequencies(self, 
+            observed_distribution):
+        '''
+        observed_distribution: list of numpy arrays, 
+            each of them containing the distribution target_variable | root_variable. 
+            e.g. [ array(female_0, female_1), array(male_0, male_1) ]  
+            The length of the list is given by the number of categories of the root variable.
+            The shape of each array is given by the number of labels of target_variable.
+            
+        It works for any number of labels of the target variable and any number of classes for the root variable. 
+        The final distance is given by self.aggregating_function. 
+
+        It breaks if aggregating_function=stdev when the root_variable is binary (we would have a stdev
+        of a single number)
+        '''
+
+        # Computing the TVD for each pair of distributions
+        distance = self.aggregating_function(
+                # TVD
+                [max( abs( pair[0]-pair[1] ) ) for pair in combinations(observed_distribution, 2)]
+            )
+
+        return distance
diff --git a/brio/data_processing/Preprocessing.py b/brio/data_processing/Preprocessing.py
@@ -0,0 +1,128 @@
+import pandas as pd
+from sklearn.preprocessing import OneHotEncoder, StandardScaler
+
+
+class Preprocessing:
+    '''
+    Loads the credit-default csv and prepares its features for classification.
+    '''
+
+    def __init__(self, input_data_path, target_classification):
+        '''
+        input_data_path: path of the csv file read by read_dataframe.
+        target_classification: name of the column that will hold the target.
+        '''
+        self.input_data_path = input_data_path
+        self.target = target_classification
+
+    def read_dataframe(self):
+        '''
+        Reads the csv and splits it into features and target.
+
+        The 'y_default_payment_next_month' column is copied into self.target
+        and encoded as 1/0.
+
+        Returns X (all the columns but self.target) and Y (the encoded target).
+        Note that, unless self.target is 'y_default_payment_next_month' itself,
+        that column is still part of X.
+        '''
+        df = pd.read_csv(self.input_data_path)
+        df[self.target] = df['y_default_payment_next_month']
+
+        # `y == 1` is satisfied by True, numpy booleans and 1/1.0, so the target
+        # is encoded correctly whether the csv stores it as True/False or as 1/0
+        # (`y is True` would turn every integer label into 0).
+        Y = df[self.target].apply(lambda y: 1 if y == 1 else 0)
+        X = df.drop(columns=[self.target])
+
+        return X, Y
+
+    def preprocess_for_classification(self,
+            df,
+            fit_ohe=False,
+            fitted_ohe=None,
+            perform_scaling=False,
+            fitted_scaler=None):
+        '''
+        Fills missing values, then optionally scales the numerical columns
+        and one-hot encodes the categorical ones.
+
+        df: dataframe containing the categorical and numerical columns listed below.
+        fit_ohe: if True the categorical columns are one-hot encoded, with
+            fitted_ohe when it is given, otherwise with an encoder fitted on df.
+        fitted_ohe: an already fitted OneHotEncoder, or None.
+        perform_scaling: if True the numerical columns are standardised, with
+            fitted_scaler when it is given, otherwise with a scaler fitted on df.
+        fitted_scaler: an already fitted StandardScaler, or None.
+
+        Returns X, fitted_ohe, fitted_scaler. When scaling and/or encoding are
+        performed, X only keeps the columns listed below (or derived from them).
+        '''
+
+        categorical_cols = [
+                'x2_sex',
+                'x3_education',
+                'x4_marriage',
+                'x6_pay_0',
+                'x7_pay_2',
+                'x8_pay_3',
+                'x9_pay_4',
+                'x10_pay_5',
+                'x11_pay_6']
+
+        numerical_cols = [
+                'x1_limit_bal',
+                'x5_age',
+                'x12_bill_amt1',
+                'x13_bill_amt2',
+                'x14_bill_amt3',
+                'x15_bill_amt4',
+                'x16_bill_amt5',
+                'x17_bill_amt6',
+                'x18_pay_amt1',
+                'x19_pay_amt2',
+                'x20_pay_amt3',
+                'x21_pay_amt4',
+                'x22_pay_amt5',
+                'x23_pay_amt6']
+
+        # missing values: column mean for numerical, most frequent value for categorical
+        numeric_means = df[numerical_cols].mean()
+        categ_modes = df[categorical_cols].mode().iloc[0]
+
+        df = df.fillna(numeric_means).fillna(categ_modes)
+        X = df
+
+        if perform_scaling:
+            if fitted_scaler is None:
+                fitted_scaler = StandardScaler().fit(df[numerical_cols])
+            scaled = fitted_scaler.transform(df[numerical_cols])
+            scaled_df = pd.DataFrame(
+                    scaled,
+                    columns=fitted_scaler.get_feature_names_out(input_features=numerical_cols),
+                    index=df.index)
+            df = pd.concat([df[categorical_cols], scaled_df], axis=1)
+            X = df
+
+        if fit_ohe:
+            if fitted_ohe is None:
+                try:
+                    # scikit-learn >= 1.2 names the parameter `sparse_output`
+                    encoder = OneHotEncoder(
+                            handle_unknown='ignore',
+                            sparse_output=False)
+                except TypeError:
+                    # older releases only accept `sparse`
+                    encoder = OneHotEncoder(
+                            handle_unknown='ignore',
+                            sparse=False)
+                fitted_ohe = encoder.fit(df[categorical_cols])
+
+            cat_ohe = fitted_ohe.transform(df[categorical_cols])
+            ohe_df = pd.DataFrame(
+                    cat_ohe,
+                    columns=fitted_ohe.get_feature_names_out(input_features=categorical_cols),
+                    index=df.index)
+            X = pd.concat([df[numerical_cols], ohe_df], axis=1)
+
+        return X, fitted_ohe, fitted_scaler
diff --git a/frontend/app.py b/frontend/app.py
@@ -1,6 +1,7 @@
 from flask import Flask, render_template
 from flask_cors import CORS
 import os
+
 from frontend.views import bias, opacity
 
 app = Flask(__name__)