joshuawe · joshuawe · Nov 10, 2023 · Nov 10, 2023 · Nov 10, 2023 · Nov 10, 2023
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -55,11 +55,6 @@ jobs:
       - uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
-      - name: Debugging
-        run: |
-          ls -la
-          cat Makefile
-          make virtualenv
       - name: Install project
         run: |
           make virtualenv

diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
@@ -21,5 +21,5 @@ jobs:
         pip install -r requirements.txt
     - name: Analysing the code with pylint
       run: |
-        pylint --fail-under=4 $(git ls-files '*.py')
+        pylint --fail-under=6 $(git ls-files '*.py')
         # pylint $(git ls-files '*.py')
diff --git a/README.md b/README.md
@@ -55,9 +55,9 @@ These could include visualizing the results for a binary classifier, for which p
 |:--------------------------------------------------:|:----------------------------------------------------------:|:-------------------------------------------------:|
 |                    Calibration Curve               |                  Classification Report                     |                 Confusion Matrix                 |
 
-| <img src="https://github.com/joshuawe/plots_and_graphs/blob/main/images/roc_curve.png?raw=true" width="300" alt="Your Image">        | <img src="https://github.com/joshuawe/plots_and_graphs/blob/main/images/roc_curve_bootstrap.png?raw=true" width="300" alt="Your Image">        | <img src="https://github.com/joshuawe/plots_and_graphs/blob/main/images/y_prob_histogram.png?raw=true" width="300" alt="Your Image">  |
+| <img src="https://github.com/joshuawe/plots_and_graphs/blob/main/images/roc_curve_bootstrap.png?raw=true" width="300" alt="Your Image">        | <img src="https://github.com/joshuawe/plots_and_graphs/blob/main/images/pr_curve.png?raw=true" width="300" alt="Your Image">        | <img src="https://github.com/joshuawe/plots_and_graphs/blob/main/images/y_prob_histogram.png?raw=true" width="300" alt="Your Image">  |
 |:--------------------------------------------------:|:----------------------------------------------------------:|:-------------------------------------------------:|
-|                    ROC Curve (AUROC)               |                  ROC Curve (AUROC) with bootstrapping                          |                  y_prob histogram                                 |
+|                    ROC Curve (AUROC) with bootstrapping             |                 Precision-Recall Curve                          |                  y_prob histogram                                 |
 
 
 | <img src="https://github.com/joshuawe/plots_and_graphs/blob/main/images/raincloud.png?raw=true" width="300" alt="Your Image">        |  <img src="data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7" width="300" height="300" alt=""> | <img src="data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7" width="300" height="300" alt=""> |
@@ -82,7 +82,7 @@ Install the package via pip.
 pip install plotsandgraphs
 ```
 
-Alternativelynstall the package from git.
+Alternatively, install the package from git.
 ```bash
 git clone https://github.com/joshuawe/plots_and_graphs
 cd plots_and_graphs

diff --git a/images/pr_curve.png b/images/pr_curve.png
diff --git a/images/y_prob_histogram.png b/images/y_prob_histogram.png
diff --git a/plotsandgraphs/binary_classifier.py b/plotsandgraphs/binary_classifier.py
@@ -1,7 +1,8 @@
+from pathlib import Path
+from typing import Optional
 import matplotlib.pyplot as plt
 from matplotlib.colors import to_rgba
 from matplotlib.figure import Figure
-import seaborn as sns
 import numpy as np
 import pandas as pd
 from sklearn.metrics import (
@@ -15,9 +16,7 @@
 )
 from sklearn.calibration import calibration_curve
 from sklearn.utils import resample
-from pathlib import Path
 from tqdm import tqdm
-from typing import Optional
 
 
 def plot_accuracy(y_true, y_pred, name="", save_fig_path=None) -> Figure:
@@ -28,27 +27,25 @@
    # for t in range(max_seq_len):
    #     accuracy += accuracy_score( y[:,t,0].round()  , y_pred[:,t] )
    # accuracy = accuracy / max_seq_len
    fig = plt.figure(figsize=(4, 5))
    plt.bar(np.array([0]), np.array([accuracy]))
    # axs[0].set_xticks(ticks=range(2))
    # axs[0].set_xticklabels(["train", "test"])
    plt.ylabel("Accuracy")
    plt.ylim([0, 1])
    # axs[0].set_xlabel('Features')
    title = "Predictor model: {}".format(name)
     plt.title(title)
     plt.tight_layout()
 
-    if save_fig_path != None:
+    if save_fig_path is not None:
         path = Path(save_fig_path)
         path.parent.mkdir(parents=True, exist_ok=True)
         fig.savefig(save_fig_path, bbox_inches="tight")
-    return fig, accuracy
+    return fig
 
 
-def plot_confusion_matrix(
-    y_true: np.ndarray, y_pred: np.ndarray, save_fig_path=None
-) -> Figure:
+def plot_confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray, save_fig_path=None) -> Figure:
     import matplotlib.colors as colors
 
     # Compute the confusion matrix
@@ -57,16 +54,14 @@
     cm = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]
 
     # Create the ConfusionMatrixDisplay instance and plot it
-    cmd = ConfusionMatrixDisplay(
-        cm, display_labels=["class 0\nnegative", "class 1\npositive"]
-    )
+    cmd = ConfusionMatrixDisplay(cm, display_labels=["class 0\nnegative", "class 1\npositive"])
     fig, ax = plt.subplots(figsize=(4, 4))
     cmd.plot(
         cmap="YlOrRd",
         values_format="",
         colorbar=False,
         ax=ax,
-        text_kw={"visible": False},
+        # text_kw={"visible": False},
     )
     cmd.texts_ = []
     cmd.text_ = []
@@ -106,7 +101,7 @@
     cbar.outline.set_visible(False)
     plt.tight_layout()
 
-    if save_fig_path != None:
+    if save_fig_path is not None:
         path = Path(save_fig_path)
         path.parent.mkdir(parents=True, exist_ok=True)
         fig.savefig(save_fig_path, bbox_inches="tight")
@@ -115,7 +110,7 @@
 
 
 def plot_classification_report(
-    y_test: np.ndarray,
+    y_true: np.ndarray,
     y_pred: np.ndarray,
     title="Classification Report",
     figsize=(8, 4),
@@ -152,32 +147,27 @@
     import matplotlib as mpl
     import matplotlib.colors as colors
     import seaborn as sns
-    import pathlib
 
     fig, ax = plt.subplots(figsize=figsize)
 
     cmap = "YlOrRd"
 
-    clf_report = classification_report(y_test, y_pred, output_dict=True, **kwargs)
-    keys_to_plot = [
-        key
-        for key in clf_report.keys()
-        if key not in ("accuracy", "macro avg", "weighted avg")
-    ]
+    clf_report = classification_report(y_true, y_pred, output_dict=True, **kwargs)
+    keys_to_plot = [key for key in clf_report.keys() if key not in ("accuracy", "macro avg", "weighted avg")]
     df = pd.DataFrame(clf_report, columns=keys_to_plot).T
     # the following line ensures that dataframe are sorted from the majority classes to the minority classes
     df.sort_values(by=["support"], inplace=True)

    # first, let's plot the heatmap by masking the 'support' column
    rows, cols = df.shape
    mask = np.zeros(df.shape)
     mask[:, cols - 1] = True
 
     bounds = np.linspace(0, 1, 11)
-    cmap = plt.cm.get_cmap("YlOrRd", len(bounds) + 1)
-    norm = colors.BoundaryNorm(bounds, cmap.N) # type: ignore[attr-defined]
+    cmap = plt.cm.get_cmap("YlOrRd", len(bounds) + 1)  # type: ignore[assignment]
+    norm = colors.BoundaryNorm(bounds, cmap.N)  # type: ignore[attr-defined]
 
     ax = sns.heatmap(
         df,
        mask=mask,
        annot=False,
@@ -190,16 +180,16 @@
        linecolor="white",
    )
    cbar = ax.collections[0].colorbar
    cbar.ax.yaxis.set_ticks_position("both")

    cmap_min, cmap_max = cbar.cmap(0), cbar.cmap(1.0)

    # add text annotation to heatmap
    dx, dy = 0.5, 0.5
    for i in range(rows):
        for j in range(cols - 1):
            text = f"{df.iloc[i, j]:.2%}"  # if (j<cols) else f"{df.iloc[i, j]:.0f}"
            ax.text(
                j + dx,
                i + dy,
                text,
@@ -211,9 +201,9 @@

    # then, let's add the support column by normalizing the colors in this column
    mask = np.zeros(df.shape)
    mask[:, : cols - 1] = True

    ax = sns.heatmap(
        df,
        mask=mask,
        annot=False,
@@ -229,10 +219,10 @@

    cmap_min, cmap_max = cbar.cmap(0), cbar.cmap(1.0)
    for i in range(rows):
        j = cols - 1
        text = f"{df.iloc[i, j]:.0f}"  # if (j<cols) else f"{df.iloc[i, j]:.0f}"
        color = (df.iloc[i, j]) / (df["support"].sum())
        ax.text(
            j + dx,
            i + dy,
            text,
@@ -243,16 +233,16 @@
        )

    plt.title(title)
    plt.xticks(rotation=45)
     plt.yticks(rotation=360)
     plt.tight_layout()
 
-    if save_fig_path != None:
+    if save_fig_path is not None:
         path = Path(save_fig_path)
         path.parent.mkdir(parents=True, exist_ok=True)
         fig.savefig(save_fig_path, bbox_inches="tight")

    return fig, ax


 def plot_roc_curve(
@@ -332,9 +322,7 @@
             auc_upper = np.quantile(bootstrap_aucs, CI_upper)
             auc_lower = np.quantile(bootstrap_aucs, CI_lower)
             label = f"{confidence_interval:.0%} CI: [{auc_lower:.2f}, {auc_upper:.2f}]"
-            plt.fill_between(
-                base_fpr, tprs_lower, tprs_upper, alpha=0.3, label=label, zorder=2
-            )
+            plt.fill_between(base_fpr, tprs_lower, tprs_upper, alpha=0.3, label=label, zorder=2)
 
         if highlight_roc_area is True:
             print(
@@ -366,9 +354,7 @@
     return fig
 
 
-def plot_calibration_curve(
-    y_prob: np.ndarray, y_true: np.ndarray, n_bins=10, save_fig_path=None
-) -> Figure:
+def plot_calibration_curve(y_prob: np.ndarray, y_true: np.ndarray, n_bins=10, save_fig_path=None) -> Figure:
     """
     Creates calibration plot for a binary classifier and calculates the ECE.
 
@@ -390,26 +376,24 @@
     ece : float
         The expected calibration error.
     """
-    prob_true, prob_pred = calibration_curve(
-        y_true, y_prob, n_bins=n_bins, strategy="uniform"
-    )
+    prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=n_bins, strategy="uniform")
 
     # Find the number of samples in each bin
     bin_counts = np.histogram(y_prob, bins=n_bins, range=(0, 1))[0]

    # Calculate the weighted absolute difference (ECE)
    ece = np.abs(prob_pred - prob_true) * (bin_counts / len(y_prob))
    ece = ece.sum().round(2)

    fig = plt.figure(figsize=(5, 5))
    ax = fig.add_subplot(111)

    # Evenly spaced bar locations on the x-axis and reduced bar width for spacing
    bar_centers = np.linspace(0, 1, n_bins, endpoint=False) + 0.5 / n_bins
    bar_width = 1.0 / n_bins  # * 0.9  # 90% of the bin width to create gaps

    # Plotting
    ax.bar(
        bar_centers,
        prob_true,
        width=bar_width,
@@ -420,7 +404,7 @@
        linewidth=2,
        label=f"True Calibration",
    )
    ax.bar(
        bar_centers,
        prob_pred - prob_true,
        bottom=prob_true,
@@ -434,7 +418,7 @@
        label=f"Mean ECE = {ece}",
        hatch="//",
    )
    ax.plot(
        [0, 1],
        [0, 1],
        linestyle="--",
@@ -444,28 +428,28 @@
    )

    # Labels and titles
    ax.set(xlabel="Predicted probability", ylabel="True probability")
    plt.xlim([0.0, 1.005])
    plt.ylim([-0.01, 1.0])
    ax.legend(loc="upper left", frameon=False)

    # show y-grid
    ax.spines[:].set_visible(False)
    ax.grid(True, linestyle="-", linewidth=0.5, color="grey", alpha=0.5)
    ax.set_yticks(np.arange(0, 1.1, 0.2))
    ax.set_xticks(np.arange(0, 1.1, 0.2))
    plt.tight_layout()

    # save plot
    if save_fig_path is not None:
        path = Path(save_fig_path)
        path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(save_fig_path, bbox_inches="tight")

     return fig
 
 
-def plot_y_prob_histogram(y_prob: np.ndarray, y_true: Optional[np.ndarray]=None, save_fig_path=None) -> Figure:
+def plot_y_prob_histogram(y_prob: np.ndarray, y_true: Optional[np.ndarray] = None, save_fig_path=None) -> Figure:
     """
     Provides a histogram for the predicted probabilities of a binary classifier. If ```y_true``` is provided, it divides the ```y_prob``` values into the two classes and plots them jointly into the same plot with different colors.
 
@@ -485,16 +469,32 @@
     """
     fig = plt.figure(figsize=(5, 5))
     ax = fig.add_subplot(111)
-    
+
     if y_true is None:
         ax.hist(y_prob, bins=10, alpha=0.9, edgecolor="midnightblue", linewidth=2, rwidth=1)
         # same histogram as above, but with border lines
         # ax.hist(y_prob, bins=10, alpha=0.5, edgecolor='black', linewidth=1.2)
     else:
         alpha = 0.6
-        ax.hist(y_prob[y_true==0], bins=10, alpha=alpha, edgecolor="midnightblue", linewidth=2, rwidth=1, label="$\\hat{y} = 0$")
-        ax.hist(y_prob[y_true==1], bins=10, alpha=alpha, edgecolor="darkred", linewidth=2, rwidth=1, label="$\\hat{y} = 1$")
-
+        ax.hist(
+            y_prob[y_true == 0],
+            bins=10,
+            alpha=alpha,
+            edgecolor="midnightblue",
+            linewidth=2,
+            rwidth=1,
+            label="$\\hat{y} = 0$",
+        )
+        ax.hist(
+            y_prob[y_true == 1],
+            bins=10,
+            alpha=alpha,
+            edgecolor="darkred",
+            linewidth=2,
+            rwidth=1,
+            label="$\\hat{y} = 1$",
+        )
+
         plt.legend()
     ax.set(xlabel="Predicted probability [-]", ylabel="Count [-]", xlim=(-0.01, 1.0))
     ax.set_title("Histogram of predicted probabilities")
@@ -505,7 +505,7 @@
     plt.tight_layout()
 
     # save plot
-    if save_fig_path != None:
+    if save_fig_path is not None:
         path = Path(save_fig_path)
         path.parent.mkdir(parents=True, exist_ok=True)
         fig.savefig(save_fig_path, bbox_inches="tight")

diff --git a/plotsandgraphs/compare_distributions.py b/plotsandgraphs/compare_distributions.py
@@ -1,8 +1,8 @@
+from typing import List, Tuple, Optional
 import numpy as np
 import matplotlib.pyplot as plt
 import matplotlib as mpl
 import pandas as pd
-from typing import List, Tuple, Optional
 
 
 def plot_raincloud(
@@ -46,15 +46,15 @@

    # if colors are none, use distinct colors for each group
    if colors is None:
        cmap = plt.get_cmap("tab10")
        colors = [mpl.colors.to_hex(cmap(i)) for i in np.linspace(0, 1, len(order))]
    else:
        assert len(colors) == len(order), "colors and order must be the same length"
        colors = colors

    # Boxplot
    if show_boxplot:
        bp = ax.boxplot(
            [df[df[y_col] == grp][x_col].values for grp in order],
            patch_artist=True,
            vert=False,
@@ -63,17 +63,17 @@
        )

        # Customize boxplot colors
        for patch, color in zip(bp["boxes"], colors):
            patch.set_facecolor(color)
            patch.set_alpha(0.8)

        # Set median line color to black
        for median in bp["medians"]:
            median.set_color("black")

    # Violinplot
    if show_violin:
        vp = ax.violinplot(
            [df[df[y_col] == grp][x_col].values for grp in order],
            positions=np.arange(1 + offset, len(order) + 1 + offset),
            showmeans=False,
@@ -83,8 +83,8 @@
        )

        # Customize violinplot colors
        for idx, b in enumerate(vp["bodies"]):
            b.get_paths()[0].vertices[:, 1] = np.clip(
                b.get_paths()[0].vertices[:, 1], idx + 1 + offset, idx + 2 + offset
            )
            b.set_color(colors[idx])
@@ -96,7 +96,7 @@
            y = np.full(len(features), idx + 1 - offset)
            jitter_amount = 0.12
            y += np.random.uniform(low=-jitter_amount, high=jitter_amount, size=len(y))
            plt.scatter(features, y, s=10, c=colors[idx], alpha=0.3, facecolors="none")

    # Labels
    plt.yticks(np.arange(1, len(order) + 1), order)
@@ -105,10 +105,10 @@
        x_label = x_col
    plt.xlabel(x_label)
    if title:
        plt.title(title + "\n")
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.spines["left"].set_visible(False)
    ax.xaxis.grid(True)

    if x_range:

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,5 +4,8 @@ max-line-length = 120
 [tool.pylint."BASIC"]
 variable-rgx = "[a-z_][a-z0-9_]{0,30}$|[a-z0-9_]+([A-Z][a-z0-9_]+)*$"  # Allow snake case and camel case for variable names
 
+[tool.pylint."MESSAGES CONTROL"]
+disable = "W0621"  # Allow redefining names in outer scope
+
 [flake8]
 max-line-length = 120
diff --git a/src/binary_classifier.py b/src/binary_classifier.py
@@ -1,15 +1,14 @@
+from typing import Optional
+from pathlib import Path
 import matplotlib.pyplot as plt
 from matplotlib.colors import to_rgba
 from matplotlib.figure import Figure
-import seaborn as sns
 import numpy as np
 import pandas as pd
 from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_curve, auc, accuracy_score, precision_recall_curve
 from sklearn.calibration import calibration_curve
 from sklearn.utils import resample
-from pathlib import Path
 from tqdm import tqdm
-from typing import Optional
 
 
 def plot_accuracy(y_true, y_pred, name='', save_fig_path=None) -> Figure:
@@ -381,7 +380,7 @@ def plot_y_prob_histogram(y_prob: np.ndarray, save_fig_path=None) -> Figure:
     plt.tight_layout()
 
     # save plot
-    if (save_fig_path != None):
+    if (save_fig_path is not None):
         path = Path(save_fig_path)
         path.parent.mkdir(parents=True, exist_ok=True)
         fig.savefig(save_fig_path, bbox_inches='tight')

diff --git a/src/compare_distributions.py b/src/compare_distributions.py
@@ -7,14 +7,14 @@
 
 def plot_raincloud(df: pd.DataFrame,
                    x_col: str,
-                   y_col: str, 
-                   colors: List[str] = None, 
-                   order: List[str] = None, 
-                   title: str = None, 
-                   x_label: str = None, 
-                   x_range: Tuple[float, float] = None, 
-                   show_violin = True, 
-                   show_scatter = True, 
+                   y_col: str,
+                   colors: List[str] = None,
+                   order: List[str] = None,
+                   title: str = None,
+                   x_label: str = None,
+                   x_range: Tuple[float, float] = None,
+                   show_violin = True,
+                   show_scatter = True,
                    show_boxplot = True):
 
     """
@@ -49,7 +49,6 @@ def plot_raincloud(df: pd.DataFrame,
         colors = [mpl.colors.to_hex(cmap(i)) for i in np.linspace(0, 1, len(order))]
     else:
         assert len(colors) == len(order), 'colors and order must be the same length'
-        colors = colors
 
     # Boxplot
     if show_boxplot:

diff --git a/tests/__init__.py b/tests/__init__.py
@@ -0,0 +1,7 @@
+import os
+
+TEST_RESULTS_PATH = os.path.join(os.path.dirname(__file__), "test_results")
+
+# print cwd in console
+
+# print os.path.dirname(__file__)