Skip to content

Commit

Permalink
Merge pull request #35 from Baukebrenninkmeijer/fix/re-enable-kde
Browse files Browse the repository at this point in the history
  • Loading branch information
Bauke Brenninkmeijer authored Aug 20, 2023
2 parents 9809c39 + cb4384e commit 96ad9b8
Show file tree
Hide file tree
Showing 6 changed files with 119 additions and 115 deletions.
116 changes: 59 additions & 57 deletions example_table_evaluator.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name="table-evaluator",
version="v1.6.0",
version="v1.6.1",
author="Bauke Brenninkmeijer",
author_email="[email protected]",
description="A package to evaluate how close a synthetic data set is to real data.",
Expand Down
26 changes: 12 additions & 14 deletions table_evaluator/metrics.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import pandas as pd
from typing import Any, Dict, List

import numpy as np
import pandas as pd
import scipy.stats as ss
from dython.nominal import theils_u, cramers_v
from sklearn.metrics import mean_squared_error
from scipy.spatial.distance import jensenshannon
from dython.nominal import cramers_v, theils_u
from joblib import Parallel, delayed
from typing import Dict, Any, List
from scipy.spatial.distance import jensenshannon
from scipy.stats import ks_2samp
from sklearn.metrics import mean_squared_error


def mean_absolute_error(y_true: np.ndarray, y_pred: np.ndarray):
"""
Expand Down Expand Up @@ -90,36 +92,32 @@ def column_correlations(dataset_a, dataset_b, categorical_columns, theil_u=True)

def js_distance_df(real: pd.DataFrame, fake: pd.DataFrame, numerical_columns: List) -> pd.DataFrame:
    """
    Compute the Jensen-Shannon distance between `real` and `fake` for each numerical column.

    Every column is passed to `jensenshannon_distance` as a separate joblib task.

    :param real: DataFrame with the real data.
    :param fake: DataFrame with the synthetic data; its columns must equal those of `real`, in the same order.
    :param numerical_columns: names of the columns to compare.
    :return: DataFrame indexed by `col_name`, one row per entry in `numerical_columns`.
    :raises AssertionError: if the column lists of `real` and `fake` differ.
    """
    assert real.columns.tolist() == fake.columns.tolist(), 'Columns are not identical between `real` and `fake`. '
    # Select the columns by name rather than via `DataFrame.iteritems()`, which was removed in pandas 2.0.
    distances = Parallel(n_jobs=-1)(
        delayed(jensenshannon_distance)(col, real[col], fake[col]) for col in numerical_columns
    )
    return pd.DataFrame(distances).set_index('col_name')


def jensenshannon_distance(colname: str, real_col: pd.Series, fake_col: pd.Series, bins: int = 25) -> Dict[str, Any]:
    """
    Compute the Jensen-Shannon distance between the binned distributions of a single column.

    The bin edges are derived from `real_col` only and reused for `fake_col`, so both histograms
    are defined over the same intervals.

    :param colname: name of the column; echoed back in the result.
    :param real_col: values of the column in the real data.
    :param fake_col: values of the column in the synthetic data.
    :param bins: maximum number of equal-width bins; capped at the number of rows in `real_col`.
    :return: dict with the keys `col_name` and `js_distance`.
    """
    n_bins = min(bins, len(real_col))
    binned_real, bin_edges = pd.cut(real_col, bins=n_bins, retbins=True)
    # sort=False keeps the counts in bin order so both probability vectors line up element-wise.
    probs_real = binned_real.value_counts(normalize=True, sort=False)
    # NOTE: fake values outside the range covered by the real bins fall in no interval; `pd.cut`
    # marks them NaN and `value_counts` drops them before normalising.
    probs_fake = pd.cut(fake_col, bins=bin_edges).value_counts(normalize=True, sort=False)
    return {'col_name': colname, 'js_distance': jensenshannon(probs_real, probs_fake)}


def kolmogorov_smirnov_test(col_name: str, real_col: pd.Series, fake_col: pd.Series) -> Dict[str, Any]:
    """
    Run a two-sample Kolmogorov-Smirnov test on one column of the real and the fake data.

    :param col_name: name of the column; echoed back in the result.
    :param real_col: values of the column in the real data.
    :param fake_col: values of the column in the synthetic data.
    :return: dict with the keys `col_name`, `statistic`, `p-value` and `equality`, where `equality`
        is 'identical' when the p-value is above 0.01 and 'different' otherwise.
    """
    statistic, p_value = ks_2samp(real_col, fake_col)
    # A p-value above the 1% level means the test found no evidence that the samples differ.
    equality = 'identical' if p_value > 0.01 else 'different'
    return {'col_name': col_name, 'statistic': statistic, 'p-value': p_value, 'equality': equality}

def kolmogorov_smirnov_df(real: pd.DataFrame, fake: pd.DataFrame, numerical_columns: List) -> pd.DataFrame:
    """
    Run the two-sample Kolmogorov-Smirnov test between `real` and `fake` for each numerical column.

    Every column is passed to `kolmogorov_smirnov_test` as a separate joblib task.

    :param real: DataFrame with the real data.
    :param fake: DataFrame with the synthetic data; its columns must equal those of `real`, in the same order.
    :param numerical_columns: names of the columns to compare.
    :return: DataFrame indexed by `col_name`, one row per entry in `numerical_columns`.
    :raises AssertionError: if the column lists of `real` and `fake` differ.
    """
    assert real.columns.tolist() == fake.columns.tolist(), 'Columns are not identical between `real` and `fake`. '
    # Select the columns by name rather than via `DataFrame.iteritems()`, which was removed in pandas 2.0.
    results = Parallel(n_jobs=-1)(
        delayed(kolmogorov_smirnov_test)(col, real[col], fake[col]) for col in numerical_columns
    )
    return pd.DataFrame(results).set_index('col_name')
30 changes: 17 additions & 13 deletions table_evaluator/table_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ def plot_distributions(self, nr_cols=3, fname=None):
for i, col in enumerate(self.real.columns):
if col not in self.categorical_columns:
plot_df = pd.DataFrame({col: pd.concat([self.real[col], self.fake[col]], axis=0), 'kind': ['real'] * self.n_samples + ['fake'] * self.n_samples})
fig = sns.histplot(plot_df, x=col, hue='kind', ax=axes[i], stat='probability', legend=True)
fig = sns.histplot(plot_df, x=col, hue='kind', ax=axes[i], stat='probability', legend=True, kde=True)
axes[i].set_autoscaley_on(True)
else:
real = self.real.copy()
Expand Down Expand Up @@ -220,13 +220,12 @@ def custom_cosine(a, b):
else:
raise ValueError(f'`how` parameter must be in [euclidean, mae, rmse]')

real_corr = associations(self.real, nominal_columns=self.categorical_columns, nom_nom_assoc='theil', compute_only=True)
fake_corr = associations(self.fake, nominal_columns=self.categorical_columns, nom_nom_assoc='theil', compute_only=True)

real_corr = associations(self.real, nominal_columns=self.categorical_columns, nom_nom_assoc='theil', compute_only=True)['corr'] # type: ignore
fake_corr = associations(self.fake, nominal_columns=self.categorical_columns, nom_nom_assoc='theil', compute_only=True)['corr'] # type: ignore
return distance_func(
real_corr.values,
fake_corr.values
)
) # type: ignore

def plot_pca(self, fname=None):
"""
Expand Down Expand Up @@ -435,8 +434,9 @@ def correlation_correlation(self) -> float:
total_metrics = pd.DataFrame()
for ds_name in ['real', 'fake']:
ds = getattr(self, ds_name)
corr_df = associations(ds, nominal_columns=self.categorical_columns, nom_nom_assoc='theil', compute_only=True)
corr_df: pd.DataFrame = associations(ds, nominal_columns=self.categorical_columns, nom_nom_assoc='theil', compute_only=True)['corr']
values = corr_df.values
# print(values, type(values))
values = values[~np.eye(values.shape[0], dtype=bool)].reshape(values.shape[0], -1)
total_metrics[ds_name] = values.flatten()

Expand Down Expand Up @@ -470,14 +470,18 @@ def convert_numerical_one_hot(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
:return: Real and fake dataframe with categorical columns one-hot encoded and binary columns factorized.
"""
real = numerical_encoding(self.real, nominal_columns=self.categorical_columns)
columns = sorted(real.columns.tolist())
real = real[columns]
fake = numerical_encoding(self.fake, nominal_columns=self.categorical_columns)
for col in columns:
if col not in fake.columns.tolist():
real: pd.DataFrame = numerical_encoding(self.real, nominal_columns=self.categorical_columns)
real = real.sort_index(axis=1)
fake: pd.DataFrame = numerical_encoding(self.fake, nominal_columns=self.categorical_columns)
for col in real.columns:
if col not in fake:
fake[col] = 0
fake = fake[columns]
fake = fake.sort_index(axis=1)

# Cast True/False columns to 0/1.
bool_cols = real.select_dtypes('bool').columns
real[bool_cols] = real[bool_cols].astype(float)
fake[bool_cols] = fake[bool_cols].astype(float)

return real, fake

Expand Down
45 changes: 15 additions & 30 deletions tests/create_test_data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,9 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"IPython not installed.\n"
]
}
],
"outputs": [],
"source": [
"import sys\n",
"sys.path.append('..')\n",
Expand All @@ -26,7 +18,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -35,7 +27,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 17,
"metadata": {},
"outputs": [
{
Expand All @@ -44,7 +36,7 @@
"'2.0.3'"
]
},
"execution_count": 3,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -62,7 +54,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -73,7 +65,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -90,7 +82,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -100,7 +92,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -117,7 +109,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -130,7 +122,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 24,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -279,18 +271,18 @@
"trans_date 0.031496 0.093090 0.107945 1.000000 "
]
},
"execution_count": 11,
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.read_parquet(test_data_folder/'real_associations.csv')"
"pd.read_parquet(test_data_folder/'real_associations.parquet')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -307,7 +299,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -464,13 +456,6 @@
"source": [
"pd.read_parquet(test_data_folder/'real_associations_theil.parquet')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
15 changes: 15 additions & 0 deletions tests/metrics_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,18 @@ def test_numerical_encoding():
num_encoding = numerical_encoding(fake, nominal_columns=cat_cols)
stored_encoding = pd.read_parquet(test_data_folder/'fake_test_sample_numerical_encoded.parquet')
pd.testing.assert_frame_equal(num_encoding, stored_encoding)


def test_jensenshannon_distance():
    """`jensenshannon_distance` echoes the column name and returns the expected JS distance."""
    # Sample data: the fake column is the real column shifted by two.
    colname = "age"
    real_col = pd.Series([20, 25, 30, 35, 40])
    fake_col = pd.Series([22, 27, 32, 37, 42])
    expected_distance = 0.2736453208486386

    result = jensenshannon_distance(colname, real_col, fake_col)

    assert isinstance(result, dict)
    assert result["col_name"] == colname
    # Compare with a tolerance: exact float equality is brittle across numpy/scipy versions.
    assert abs(result["js_distance"] - expected_distance) < 1e-9

0 comments on commit 96ad9b8

Please sign in to comment.