Merge pull request #3 from ymurong/dev

Dev

ymurong authored Jan 26, 2023
2 parents 9a34352 + 4c3459c commit 94d0cf1
Showing 118 changed files with 443,553 additions and 304,522 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -6,4 +6,7 @@ __pycache__
 transactions.sqlite3
 test_transactions.sqlite3
 backend/venv
+backend/venv10
+backend/venv11
+backend/dsp_backend.egg-info
 node_modules
15 changes: 10 additions & 5 deletions EDP.ipynb

Large diffs are not rendered by default.

26 changes: 16 additions & 10 deletions README.md
@@ -5,10 +5,10 @@
 - [Presentation Slides](#presentation-slides)
 - [Explorative Analysis](#explorative-analysis)
 - [Visualization \& Inferential Analysis](#visualization--inferential-analysis)
-  - [Feature Engineering](#feature-engineering)
-  - [Classifier Training/Evaluation](#classifier-trainingevaluation)
-  - [Backend](#backend)
-  - [Dashboard](#dashboard)
+- [Feature Engineering](#feature-engineering)
+- [Classifier Training/Evaluation](#classifier-trainingevaluation)
+- [Backend](#backend)
+- [Dashboard](#dashboard)

## UvA Deadlines

@@ -55,31 +55,37 @@ Related Notebooks:
 * EUR amount outliers for a given past time range


-### Feature Engineering
+## Feature Engineering

 Check the [FE_README.md](./feature-engineering/README.md) for details.

-### Classifier Training/Evaluation
+## Classifier Training/Evaluation

 Check the [CLASSIFIER_README.md](./classifier/README.md) for details.


-### Backend
+## Backend

 Check the [BACKEND_README.md](./backend/README.md) for details.

+The backend provides online OpenAPI documentation at http://127.0.0.1:8000/docs


-### Dashboard
+## Dashboard

 Check the [DASHBOARD_README.md](./dashboard-front/coreui/README.md) for details.


 ## References

 [APATE: A novel approach for automated credit card transaction fraud detection using network-based extensions, by Véronique Van Vlasselaer, Cristián Bravo, Olivier Caelen, Tina Eliassi-Rad, Leman Akoglu, Monique Snoeck, and Bart Baesens](https://reader.elsevier.com/reader/sd/pii/S0167923615000846?token=E3DFBEAF6A07EFC01346AC1C4E53E845177BDDA1FEC3D7840030E6AF757DF39126AD174F5A946F33EC1616CFED756B6A&originRegion=eu-west-1&originCreation=20230124164812)

 [A graph-based, semi-supervised, credit card fraud detection system](https://b-lebichot.github.io/publications/A%20graph-based,%20semi-supervised,%20credit%20card%20fraud%20detection%20system%20-%20preprint.pdf)

 [Towards automated feature engineering for credit card fraud detection using multi-perspective HMMs](https://hal.science/hal-02278223/file/S0167739X19300664.pdf)

 [Assessment and mitigation of fairness issues in credit-card default models](https://fairlearn.org/main/auto_examples/plot_credit_loan_decisions.html#sphx-glr-auto-examples-plot-credit-loan-decisions-py)
14 changes: 11 additions & 3 deletions backend/README.md
@@ -8,11 +8,11 @@
 # Setup Local Dev Environment

 ## 1.virtualenv setup
-The following example use python 3.11. The version of python must be greater than 3.6.
+The following example uses Python 3.10. The Python version must be greater than 3.6.
 Make sure you run the following commands in the backend directory so that the venv directory sits alongside the src directory.
 ```bash
-pip3.11 install virtualenv
-virtualenv venv --python=python3.11
+pip3.10 install virtualenv
+virtualenv venv --python=python3.10
 ```

## 2.dependencies installations
@@ -71,3 +71,11 @@ Run the following command to test.
 ```bash
 pytest
 ```
+# How to generate dump files
+#### 1. predictions_dump.csv
+We use [prediction_generator.py](../classifier/prediction_generator.py) to generate prediction probabilities from a given model (currently random forest, which performed best in our experiments).
+#### 2. transactions_dump.csv
+We use [transactions_dump.ipynb](./transactions_dump.ipynb) to dump all the historic transactions provided by Adyen.
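For orientation, here is a minimal sketch of what prediction_generator.py presumably does. The script itself is not shown in this diff, so treat the paths as assumptions; the column layout (psp_reference, predict_proba, created_at, updated_at) mirrors the `__main__` block of RFClassifierPipeline.py added in this commit:

```python
# Sketch only: mirrors the __main__ block of src/common/RFClassifierPipeline.py;
# prediction_generator.py itself is not part of this diff.
import datetime
import pandas as pd
from src.common.RFClassifierPipeline import RFClassifierPipeline
from src.resources.conf import INPUT_FEATURES

df = pd.read_csv("src/resources/test_dataset_december.csv")  # assumed input
pipeline = RFClassifierPipeline(
    model_file_name="../resources/pretrained_models/RandomForest.pickle")

# one fraud probability per transaction, keyed by psp_reference
proba = pd.Series(pipeline.predict_proba(df[INPUT_FEATURES]), name="predict_proba")
dump = pd.concat([df["psp_reference"], proba], axis=1)
dump["created_at"] = datetime.datetime.now()
dump["updated_at"] = datetime.datetime.now()
dump.to_csv("predictions_dump.csv", index=False)
```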
81,583 changes: 69,932 additions & 11,651 deletions backend/predictions_dump.csv

Large diffs are not rendered by default.

11,652 changes: 11,652 additions & 0 deletions backend/predictions_dump_december.csv

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions backend/requirements.txt
@@ -8,6 +8,7 @@ chardet==5.1.0
 click==8.1.3
 colorama==0.4.6
 cryptography==39.0.0
+dill==0.3.6
 dnspython==2.2.1
 ecdsa==0.18.0
 email-validator==1.3.0
@@ -19,6 +20,8 @@ idna==3.4
 importlib-metadata==6.0.0
 iniconfig==1.1.1
 Jinja2==3.1.2
+lime==0.2.0.1
+lightgbm==3.3.4
 MarkupSafe==2.1.1
 matplotlib==3.6.2
 packaging==22.0
@@ -37,6 +40,7 @@ requests==2.28.1
 rsa==4.9
 six==1.16.0
 seaborn==0.12.2
+scikit-learn==1.2.0
 SQLAlchemy==1.4.42
 starlette==0.22.0
 toml==0.10.2
Binary file not shown.
84 changes: 73 additions & 11 deletions backend/src/common/BasePipeline.py
@@ -1,8 +1,19 @@
 from abc import ABCMeta, abstractmethod
 import logging
+import numpy as np
+import dill as pickle
+import os
+from typing import Dict
+import matplotlib.pyplot as plt
+import seaborn as sns
 import pandas as pd
 from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, f1_score, roc_curve, recall_score, \
-    precision_score
+    precision_score, balanced_accuracy_score
+from fairlearn.metrics import (
+    false_positive_rate,
+    false_negative_rate,
+)
+from fairlearn.metrics import MetricFrame

 logging.getLogger(__name__)

@@ -11,22 +22,53 @@ class BasePipeline(metaclass=ABCMeta):
     def __init__(self, model_file_name, model_training=False, **kwargs):
         self.model_file_name = model_file_name
         self.metrics = None
+        self.fairness_metrics = None
+        self.explainer = None

     @abstractmethod
-    def predict(self, X):
+    def explain(self, transaction_sample: np.ndarray, ) -> dict:
         pass

-    @abstractmethod
-    def save_pipeline(self):
-        pass
+    def load_explainer(self, explainer_file_name):
+        dir = os.path.dirname(os.path.abspath(__file__))
+        fname = os.path.join(dir, explainer_file_name)
+        with open(fname, 'rb') as handle:
+            pickled_explainer = pickle.load(handle)
+        self.explainer = pickled_explainer

-    @abstractmethod
-    def load_pipeline(self):
-        pass
+    def predict_proba(self, X_test: pd.DataFrame):
+        """
+        :param X_test:
+        :return: probability of being positive
+        """
+        return self.pipeline.predict_proba(X_test.copy())[:, 1]
+
+    def predict(self, X_test: pd.DataFrame, threshold: float = 0.5) -> pd.DataFrame:
+        y_predict = (self.pipeline.predict_proba(X_test.copy())[:, 1] >= threshold).astype(bool)
+        return y_predict

-    def _check_dir(self, directory):
-        if not os.path.isdir(directory):
-            os.makedirs(directory)
+    def load_pipeline(self, **kwargs) -> None:
+        dir = os.path.dirname(os.path.abspath(__file__))
+        fname = os.path.join(dir, self.model_file_name)
+        with open(fname, 'rb') as handle:
+            pickled_model = pickle.load(handle)
+        self.pipeline = pickled_model
+
+    def save_pipeline(self) -> None:
+        dir = os.path.dirname(os.path.abspath(__file__))
+        fname = os.path.join(dir, self.model_file_name)
+        with open(fname, 'wb') as handle:
+            pickle.dump(self.pipeline, handle)
+
+    def eval(self, X_test: pd.DataFrame, y_test: pd.DataFrame, threshold: float = 0.5):
+        predicted = self.predict(X_test=X_test.copy(), threshold=threshold)
+        self.metrics_sklearn(y_true=y_test, y_pred=predicted)
+        return self.metrics
+
+    def eval_fairness(self, X_test: pd.DataFrame, y_test: pd.DataFrame, A_test: pd.DataFrame, threshold: float = 0.5):
+        predicted = self.predict(X_test=X_test.copy(), threshold=threshold)
+        self.metrics_fairlearn(y_true=y_test, y_pred=predicted, sensitive_features=A_test)
+        return self.fairness_metrics

     def metrics_sklearn(self, y_true, y_pred):
         fpr, tpr, thresholds = roc_curve(y_true, y_pred)
@@ -45,5 +87,25 @@ def metrics_sklearn(self, y_true, y_pred):
         }
         return self.metrics

+    def metrics_fairlearn(self, y_true, y_pred, sensitive_features) -> Dict:
+        fairness_metrics = {
+            "balanced_accuracy": balanced_accuracy_score,
+            "false_positive_rate": false_positive_rate,
+            "false_negative_rate": false_negative_rate,
+        }
+        metricframe_unmitigated = MetricFrame(
+            metrics=fairness_metrics,
+            y_true=y_true,
+            y_pred=y_pred,
+            sensitive_features=sensitive_features,
+        )
+
+        self.fairness_metrics = metricframe_unmitigated.by_group.to_dict()
+        return self.fairness_metrics
+
+    def plot_confusion_matrix(self, h=15, w=15):
+        assert self.metrics is not None, "You must evaluate the model with test data before plotting the results"
+        fig, ax = plt.subplots(figsize=(h, w))  # Sample figsize in inches
+        sns.set(font_scale=4)
+        sns.heatmap(self.metrics["confusion_matrix"], annot=True, linewidths=.5, fmt='g', ax=ax)
+        plt.show()
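Taken together, these changes move model I/O, thresholded prediction, and evaluation out of the subclasses into the base class. A usage sketch of the new API (file paths and feature constants as in the `__main__` block of RFClassifierPipeline.py below; the threshold value is illustrative):

```python
# Usage sketch for the extended BasePipeline API, via the RF subclass below.
import pandas as pd
from src.common.RFClassifierPipeline import RFClassifierPipeline
from src.resources.conf import INPUT_FEATURES, OUTPUT_FEATURE, SENSITIVE_FEATURE

df_test = pd.read_csv("../resources/test_dataset_december.csv")
X_test, y_test = df_test[INPUT_FEATURES], df_test[OUTPUT_FEATURE]
A_test = df_test[SENSITIVE_FEATURE]

pipeline = RFClassifierPipeline(
    model_file_name="../resources/pretrained_models/RandomForest.pickle")

# predict() binarizes predict_proba() at the given threshold (default 0.5)
metrics = pipeline.eval(X_test, y_test, threshold=0.5)
print(metrics)

# eval_fairness() scores each group of the sensitive feature separately;
# by_group.to_dict() yields {metric_name: {group_value: score}}
fairness = pipeline.eval_fairness(X_test, y_test, A_test)
print(fairness["balanced_accuracy"])
```

Centralizing these methods is what lets XGBClassifierPipeline further down shed most of its body, as that diff shows.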
81 changes: 81 additions & 0 deletions backend/src/common/RFClassifierPipeline.py
@@ -0,0 +1,81 @@
+import pandas as pd
+import logging
+import numpy as np
+from src.common.BasePipeline import BasePipeline
+from src.resources.conf import INPUT_FEATURES, OUTPUT_FEATURE, EXPLAINABLE_CATEGORIES, SENSITIVE_FEATURE
+import datetime
+
+logging.getLogger(__name__)
+
+
+class RFClassifierPipeline(BasePipeline):
+    def __init__(self, model_file_name,
+                 model_training=False, model_params={},
+                 **kwargs):
+        super(RFClassifierPipeline, self).__init__(model_file_name=model_file_name, model_training=model_training)
+        self.model_params = model_params
+        if model_training:
+            self.pipeline = None
+        else:
+            self.load_pipeline()
+
+    def explain(self, transaction_sample: np.ndarray, ) -> dict:
+        def get_feature_name(feature_array_exp):
+            for element in feature_array_exp:
+                if element in INPUT_FEATURES:
+                    return element
+            return feature_array_exp[0]
+
+        def get_explainable_group(feature_name):
+            for key in EXPLAINABLE_CATEGORIES:
+                if feature_name in EXPLAINABLE_CATEGORIES[key]:
+                    return key
+            return "general_evidences"
+
+        if self.explainer is not None:
+            predict_fn_rf = lambda x: self.pipeline.predict_proba(x).astype(float)
+            exp = self.explainer.explain_instance(transaction_sample, predict_fn_rf, num_features=100)
+            explanability_scores = {
+                "ip_risk": 0.5,
+                "email_risk": 0.5,
+                "risk_card_behaviour": 0.5,
+                "risk_card_amount": 0.5,
+                "general_evidences": 0.5
+            }
+            for feature in exp.as_list():
+                feature_name = get_feature_name(feature[0].split(" "))
+                score = feature[1]
+                explainable_group = get_explainable_group(feature_name)
+                explanability_scores[explainable_group] += score
+            return explanability_scores
+        raise RuntimeError("explainer needs to be loaded first by invoking load_explainer method")
+
+
+if __name__ == '__main__':
+    # load test data
+    df_test = pd.read_csv("../resources/test_dataset_december.csv")
+    X_test = df_test[INPUT_FEATURES]
+    y_test = df_test[OUTPUT_FEATURE]
+    A_test = df_test[SENSITIVE_FEATURE]
+
+    # evaluate
+    pipeline = RFClassifierPipeline(model_file_name="../resources/pretrained_models/RandomForest.pickle")
+    pipeline.load_explainer("../resources/pretrained_models/RandomForest_LIME.pickle")
+    metrics = pipeline.eval(X_test, y_test)
+    fairness_metrics = pipeline.eval_fairness(X_test, y_test, A_test)
+    print({**metrics, **fairness_metrics})
+
+    # plot confusion matrix
+    pipeline.plot_confusion_matrix()
+
+    # explain the model
+    transaction_sample = X_test.sample().values[0]
+    explanability_scores = pipeline.explain(transaction_sample)
+    print(explanability_scores)
+
+    # produce prediction probability results with psp_reference
+    y_predict_proba = pd.Series(pipeline.predict_proba(X_test), name="predict_proba")
+    df_pred_prob = pd.concat([df_test["psp_reference"], y_predict_proba], axis=1)
+    df_pred_prob["created_at"] = pd.Series([datetime.datetime.now()] * df_pred_prob.shape[0])
+    df_pred_prob["updated_at"] = pd.Series([datetime.datetime.now()] * df_pred_prob.shape[0])
+    df_pred_prob.to_csv("../../predictions_dump_december.csv", index=False)
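load_explainer expects a pickled LIME tabular explainer (RandomForest_LIME.pickle), but nothing in this diff creates that artifact. A plausible construction, assuming the standard lime package API and a training CSV with the same INPUT_FEATURES columns (the path and class names here are hypothetical):

```python
# Assumption: RandomForest_LIME.pickle was produced along these lines; the
# actual generation script is not part of this commit.
import dill as pickle
import pandas as pd
from lime.lime_tabular import LimeTabularExplainer
from src.resources.conf import INPUT_FEATURES

df_train = pd.read_csv("../resources/train_dataset.csv")  # hypothetical path
explainer = LimeTabularExplainer(
    training_data=df_train[INPUT_FEATURES].values,
    feature_names=INPUT_FEATURES,
    class_names=["non_fraud", "fraud"],
    mode="classification",
)

# dill (imported as pickle in BasePipeline) can serialize the closures that
# LIME objects may carry, which plain pickle can choke on
with open("../resources/pretrained_models/RandomForest_LIME.pickle", "wb") as handle:
    pickle.dump(explainer, handle)
```

Note that explain() seeds each category score at 0.5 and adds the signed LIME weights on top, so values above 0.5 suggest evidence toward the positive (fraud) class for that group of features.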
58 changes: 7 additions & 51 deletions backend/src/common/XGBClassifierPipeline.py
@@ -1,11 +1,7 @@
-import numpy as np
 import pandas as pd
 import logging
-import os
-import matplotlib.pyplot as plt
-import seaborn as sns
-import pickle
 from src.common.BasePipeline import BasePipeline
+from src.resources.conf import INPUT_FEATURES, OUTPUT_FEATURE
 import datetime

 logging.getLogger(__name__)
@@ -21,58 +17,18 @@ def __init__(self, model_file_name, model_training=False, model_params={},
             self.load_pipeline()
         self.model_params = model_params

-    def predict(self, X_test: pd.DataFrame, threshold: float = 0.5) -> pd.DataFrame:
-        y_predict = (self.pipeline.predict_proba(X_test.copy())[:, 1] >= threshold).astype(bool)
-        return y_predict
-
-    def load_pipeline(self, **kwargs) -> None:
-        dir = os.path.dirname(os.path.abspath(__file__))
-        fname = os.path.join(dir, self.model_file_name)
-        with open(fname, 'rb') as handle:
-            pickled_model = pickle.load(handle)
-        self.pipeline = pickled_model
-
-    def save_pipeline(self) -> None:
-        dir = os.path.dirname(os.path.abspath(__file__))
-        fname = os.path.join(dir, self.model_file_name)
-        with open(fname, 'wb') as handle:
-            pickle.dump(self.pipeline, handle)
-
-    def eval(self, X_test: pd.DataFrame, y_test: pd.DataFrame, threshold: float = 0.5):
-        predicted = self.predict(X_test=X_test.copy(), threshold=threshold)
-        self.metrics_sklearn(y_true=y_test, y_pred=predicted)
-        return self.metrics
-
-    def plot_confusion_matrix(self, h=15, w=15):
-        assert self.metrics is not None, "You must evaluate the model with test data before plotting the results"
-        fig, ax = plt.subplots(figsize=(h, w))  # Sample figsize in inches
-        sns.set(font_scale=4)
-        sns.heatmap(self.metrics["confusion_matrix"], annot=True, linewidths=.5, fmt='g', ax=ax)
-        plt.show()
-
-    def predict_proba(self, X_test: pd.DataFrame):
-        """
-        :param X_test:
-        :return: probability of being positive
-        """
-        return self.pipeline.predict_proba(X_test.copy())[:, 1]
+    def explain(self):
+        raise NotImplementedError


 if __name__ == '__main__':
     # load test data
     df_test = pd.read_csv("../resources/test_dataset_december.csv")
-    columns = ['ip_node_degree', 'card_node_degree', 'email_node_degree', 'is_credit',
-               'ip_address_woe', 'email_address_woe', 'card_number_woe', 'no_ip',
-               'no_email', 'same_country', 'merchant_Merchant B',
-               'merchant_Merchant C', 'merchant_Merchant D', 'merchant_Merchant E',
-               'card_scheme_MasterCard', 'card_scheme_Other', 'card_scheme_Visa',
-               'device_type_Linux', 'device_type_MacOS', 'device_type_Other',
-               'device_type_Windows', 'device_type_iOS', 'shopper_interaction_POS']
-    X_test = df_test[columns]
-    y_test = df_test["has_fraudulent_dispute"]
+    X_test = df_test[INPUT_FEATURES]
+    y_test = df_test[OUTPUT_FEATURE]

     # evaluate
-    pipeline = XGBClassifierPipeline(model_file_name="../resources/pretrained_models/xgboost_classifier_model.pkl")
+    pipeline = XGBClassifierPipeline(model_file_name="../resources/pretrained_models/XGBoost.pickle")
     metrics = pipeline.eval(X_test, y_test)
     print(metrics)

@@ -84,4 +40,4 @@ def predict_proba(self, X_test: pd.DataFrame):
     df_pred_prob = pd.concat([df_test["psp_reference"], y_predict_proba], axis=1)
     df_pred_prob["created_at"] = pd.Series([datetime.datetime.now()] * df_pred_prob.shape[0])
     df_pred_prob["updated_at"] = pd.Series([datetime.datetime.now()] * df_pred_prob.shape[0])
-    df_pred_prob.to_csv("../../predictions_dump.csv", index=False)
+    df_pred_prob.to_csv("../../predictions_dump_december.csv", index=False)