0.68.0

bagustris · Nov 7, 2023 · 27f99fb · 27f99fb
1 parent 1997317
commit 27f99fb
Show file tree

Hide file tree

Showing 8 changed files with 849 additions and 93 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,8 +1,12 @@
 Changelog
 =========
 
+Version 0.68.0
+--------------
+* added stratification framework for split balancing
+
 Version 0.67.0
----------------
+--------------
 * added first version of spotlight integration
 
 Version 0.66.13

diff --git a/nkululeko/constants.py b/nkululeko/constants.py
@@ -1,2 +1,2 @@
-VERSION="0.67.0"
-SAMPLING_RATE = 16000
+VERSION="0.68.0"
+SAMPLING_RATE = 16000
diff --git a/nkululeko/data/dataset.py b/nkululeko/data/dataset.py
diff --git a/nkululeko/split/__init__.py b/nkululeko/split/__init__.py
@@ -0,0 +1,3 @@
+from nkululeko.constants import VERSION
+
+__version__ = VERSION
diff --git a/nkululeko/split/example_binning.py b/nkululeko/split/example_binning.py
@@ -0,0 +1,27 @@
+"""
+Code copyright by Uwe Reichel
+"""
+
+import numpy as np
+from split_utils import binning, optimize_traindevtest_split
+
+np.random.seed(42)
+y = np.random.rand(10)
+
+# intrinsic binning by equidistant percentiles
+yci = binning(y, nbins=3)
+
+# extrinsic binning by explicit lower boundaries
+yce = binning(y, lower_boundaries=[0, 0.3, 0.8])
+
+print("yci:", yci)
+print("yce:", yce)
+
+"""
+ yci: [0 2 2 1 0 0 0 2 1 2]
+ yce: [1 2 1 1 0 0 0 2 1 1]
+
+ now yci or yce can be used for stratification, e.g.
+stratify_on = {"target": yci, ...}
+... = optimize_traindevtest_split(..., y=y, stratify_on=stratify_on, ...)
+"""
diff --git a/nkululeko/split/example_trainDevTestSplit.py b/nkululeko/split/example_trainDevTestSplit.py
@@ -0,0 +1,81 @@
+"""
+Code copyright by Uwe Reichel
+"""
+
+# import json
+import pandas as pd
+import audb
+from split_utils import optimize_traindevtest_split
+
+# define a train/dev/test split on emodb that is:
+#   - speaker-disjoint
+#   - optimally stratified on emotion
+#   - optimally stratified on gender
+#   - optimally stratified on transcriptions
+#   - containing 20% of the speakers in each of the dev and test sets
+#   - and approximately 20% of the files in each of the dev and test sets
+#     (see dev_size / test_size = 0.2 below)
+
+
+# data
+db = audb.load(
+    "emodb", version="1.3.0", format="wav", sampling_rate=16000, mixdown=True
+)
+df_emotion = db["emotion"].get()
+df_files = db["files"].get()
+df_speaker = db["speaker"].get()
+
+df = pd.concat([df_emotion, df_files], axis=1, join="inner")
+
+
+def spk2gender(x):
+    if x in [8, 9, 13, 14, 16]:
+        return "female"
+    return "male"
+
+
+df["gender"] = df["speaker"].map(spk2gender)
+
+# seed, dev and test proportion, number of different splits
+seed = 42
+dev_size = 0.2
+test_size = 0.2
+k = 30
+
+# targets
+emotion = df["emotion"].to_numpy()
+
+# on which variable to split
+speaker = df["speaker"].to_numpy()
+
+# on which variables (targets, groupings) to stratify
+stratif_vars = {
+    "emotion": emotion,
+    "gender": df["gender"].to_numpy(),
+    "transcription": df["transcription"].to_numpy(),
+}
+
+# weights for all stratify_on variables
+# and for the dev and test proportion match ("size_diff").
+# Give target variable EMOTION more weight than groupings.
+weight = {"emotion": 2, "gender": 1, "transcription": 1, "size_diff": 1}
+
+# find optimal train, dev and test indices TRAIN_I, DEV_I and TEST_I in DF
+# info: dict with goodness of split information
+train_i, dev_i, test_i, info = optimize_traindevtest_split(
+    X=df,
+    y=emotion,
+    split_on=speaker,
+    stratify_on=stratif_vars,
+    weight=weight,
+    dev_size=dev_size,
+    test_size=test_size,
+    k=k,
+    seed=seed,
+)
+
+print("dev split of DF:")
+print(df.iloc[dev_i])
+print("dev split of target variable:")
+print(emotion[dev_i])
+print("goodness of split:")
+print(info)
diff --git a/nkululeko/split/example_trainTestSplit.py b/nkululeko/split/example_trainTestSplit.py
@@ -0,0 +1,77 @@
+"""
+Code copyright by Uwe Reichel
+"""
+
+import pandas as pd
+import audb
+from split_utils import optimize_traintest_split
+
+# define a test set on emodb that is:
+#   - speaker-disjoint
+#   - optimally stratified on emotion
+#   - optimally stratified on gender
+#   - optimally stratified on transcriptions
+#   - containing 20% of the speakers
+#   - and approximately 20% of the files
+#     (see test_size = 0.2 below)
+
+# data
+db = audb.load(
+    "emodb", version="1.3.0", format="wav", sampling_rate=16000, mixdown=True
+)
+df_emotion = db["emotion"].get()
+df_files = db["files"].get()
+df_speaker = db["speaker"].get()
+df = pd.concat([df_emotion, df_files], axis=1, join="inner")
+
+
+def spk2gender(x):
+    if x in [8, 9, 13, 14, 16]:
+        return "female"
+    return "male"
+
+
+df["gender"] = df["speaker"].map(spk2gender)
+
+# seed, test proportion, number of different splits
+seed = 42
+test_size = 0.2
+k = 30
+
+# targets
+emotion = df["emotion"].to_numpy()
+
+# on which variable to split
+
+speaker = df["speaker"].to_numpy()
+
+# on which variables (targets, groupings) to stratify
+stratif_vars = {
+    "emotion": emotion,
+    "gender": df["gender"].to_numpy(),
+    "transcription": df["transcription"].to_numpy(),
+}
+
+# weights for all stratify_on variables
+# and for the test proportion match ("size_diff").
+# Give target variable EMOTION more weight than groupings.
+weight = {"emotion": 2, "gender": 1, "transcription": 1, "size_diff": 1}
+
+# find optimal train and test indices TRAIN_I and TEST_I in DF
+# info: dict with goodness of split information
+train_i, test_i, info = optimize_traintest_split(
+    X=df,
+    y=emotion,
+    split_on=speaker,
+    stratify_on=stratif_vars,
+    weight=weight,
+    test_size=test_size,
+    k=k,
+    seed=seed,
+)
+
+print("test split of DF:")
+print(df.iloc[test_i])
+print("test split of target variable:")
+print(emotion[test_i])
+print("goodness of split:")
+print(info)