Skip to content

Commit

Permalink
0.68.0
Browse files Browse the repository at this point in the history
  • Loading branch information
FBurkhardt committed Nov 7, 2023
1 parent 1997317 commit 27f99fb
Show file tree
Hide file tree
Showing 8 changed files with 849 additions and 93 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
Changelog
=========

Version 0.68.0
--------------
* added stratification framework for split balancing

Version 0.67.0
---------------
--------------
* added first version of spotlight integration

Version 0.66.13
Expand Down
4 changes: 2 additions & 2 deletions nkululeko/constants.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
VERSION="0.67.0"
SAMPLING_RATE = 16000
VERSION="0.68.0"
SAMPLING_RATE = 16000
216 changes: 126 additions & 90 deletions nkululeko/data/dataset.py

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions nkululeko/split/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from nkululeko.constants import VERSION

# expose the VERSION constant imported from nkululeko.constants
# as this package's __version__
__version__ = VERSION
27 changes: 27 additions & 0 deletions nkululeko/split/example_binning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""
Code copyright by Uwe Reichel
"""

import numpy as np
from split_utils import binning, optimize_traindevtest_split

# fix the seed of numpy's global random generator so that the
# sample data below is the same on every run
np.random.seed(42)
# ten random sample values to be binned
y = np.random.rand(10)

# intrinsic binning by equidistant percentiles
yci = binning(y, nbins=3)

# extrinsic binning by explicit lower boundaries
yce = binning(y, lower_boundaries=[0, 0.3, 0.8])

print("yci:", yci)
print("yce:", yce)

# NOTE(review): the bare string below is not executed as code; it
# presumably records the output of the two prints above together with
# a usage hint for optimize_traindevtest_split -- confirm against
# split_utils
"""
yci: [0 2 2 1 0 0 0 2 1 2]
yce: [1 2 1 1 0 0 0 2 1 1]
now yci or yce can be used for stratification, e.g.
stratify_on = {"target": yci, ...}
... = optimize_traindevtest_split(..., y=y, stratify_on=stratify_on, ...)
"""
81 changes: 81 additions & 0 deletions nkululeko/split/example_trainDevTestSplit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
"""
Code copyright by Uwe Reichel
"""

# import json
import pandas as pd
import audb
from split_utils import optimize_traindevtest_split

# define train/dev/testset split on emodb, that is:
# - speaker disjoint
# - optimally stratified on emotion
# - optimally stratified on gender
# - optimally stratified on transcriptions
# - that contains 20% of the speakers in both dev and test set
# - and approximately 20% of the files in both dev and test set


# data: load emodb version 1.3.0 through audb, requesting wav format,
# a sampling rate of 16000 and mixdown
db = audb.load(
"emodb", version="1.3.0", format="wav", sampling_rate=16000, mixdown=True
)
# fetch the "emotion", "files" and "speaker" entries of the database
# NOTE(review): presumably .get() returns pandas objects sharing a
# common index -- the pd.concat below relies on that; confirm
df_emotion = db["emotion"].get()
df_files = db["files"].get()
# NOTE(review): df_speaker is not referenced again in this script
df_speaker = db["speaker"].get()

# join emotion and files column-wise, keeping only index entries
# present in both (join="inner")
df = pd.concat([df_emotion, df_files], axis=1, join="inner")


def spk2gender(x):
    """Map a speaker ID to a gender label.

    Args:
        x: speaker ID as found in the "speaker" column of the data.

    Returns:
        "female" if x equals one of 8, 9, 13, 14 or 16, otherwise "male".
    """
    # Speaker IDs labelled as female; every other value falls through
    # to "male", including values that are not valid speaker IDs.
    female_speakers = (8, 9, 13, 14, 16)
    if x in female_speakers:
        return "female"
    return "male"


# derive a gender column from the speaker IDs
df["gender"] = df["speaker"].map(spk2gender)

# seed, dev and test proportion, number of different splits
seed = 42
dev_size = 0.2
test_size = 0.2
k = 30

# targets
emotion = df["emotion"].to_numpy()

# on which variable to split
speaker = df["speaker"].to_numpy()

# on which variables (targets, groupings) to stratify
stratif_vars = {
"emotion": emotion,
"gender": df["gender"].to_numpy(),
"transcription": df["transcription"].to_numpy(),
}

# weights for all stratify_on variables and for the dev and test
# proportion match (presumably the "size_diff" entry -- confirm
# against split_utils). Give target variable EMOTION more weight
# than groupings.
weight = {"emotion": 2, "gender": 1, "transcription": 1, "size_diff": 1}

# find optimal train, dev and test indices TRAIN_I, DEV_I and TEST_I in DF
# info: dict with goodness of split information
train_i, dev_i, test_i, info = optimize_traindevtest_split(
X=df,
y=emotion,
split_on=speaker,
stratify_on=stratif_vars,
weight=weight,
dev_size=dev_size,
test_size=test_size,
k=k,
seed=seed,
)

# report the dev partition only; train_i and test_i are not printed
print("dev split of DF:")
print(df.iloc[dev_i])
print("dev split of target variable:")
print(emotion[dev_i])
print("goodness of split:")
print(info)
77 changes: 77 additions & 0 deletions nkululeko/split/example_trainTestSplit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""
Code copyright by Uwe Reichel
"""

import pandas as pd
import audb
from split_utils import optimize_traintest_split

# define testset on emodb, that is:
# - speaker disjoint
# - optimally stratified on emotion
# - optimally stratified on gender
# - optimally stratified on transcriptions
# - that contains 20% of the speakers
# - and approximately 20% of the files

# data: load emodb version 1.3.0 through audb, requesting wav format,
# a sampling rate of 16000 and mixdown
db = audb.load(
"emodb", version="1.3.0", format="wav", sampling_rate=16000, mixdown=True
)
# fetch the "emotion", "files" and "speaker" entries of the database
# NOTE(review): presumably .get() returns pandas objects sharing a
# common index -- the pd.concat below relies on that; confirm
df_emotion = db["emotion"].get()
df_files = db["files"].get()
# NOTE(review): df_speaker is not referenced again in this script
df_speaker = db["speaker"].get()
# join emotion and files column-wise, keeping only index entries
# present in both (join="inner")
df = pd.concat([df_emotion, df_files], axis=1, join="inner")


def spk2gender(x):
    """Map a speaker ID to a gender label.

    Args:
        x: speaker ID as found in the "speaker" column of the data.

    Returns:
        "female" if x equals one of 8, 9, 13, 14 or 16, otherwise "male".
    """
    # Speaker IDs labelled as female; every other value falls through
    # to "male", including values that are not valid speaker IDs.
    female_speakers = (8, 9, 13, 14, 16)
    if x in female_speakers:
        return "female"
    return "male"


# derive a gender column from the speaker IDs
df["gender"] = df["speaker"].map(spk2gender)

# seed, test proportion, number of different splits
seed = 42
test_size = 0.2
k = 30

# targets
emotion = df["emotion"].to_numpy()

# on which variable to split

speaker = df["speaker"].to_numpy()

# on which variables (targets, groupings) to stratify
stratif_vars = {
"emotion": emotion,
"gender": df["gender"].to_numpy(),
"transcription": df["transcription"].to_numpy(),
}

# weights for all stratify_on variables and for the test proportion
# match (presumably the "size_diff" entry -- confirm against
# split_utils). Give target variable EMOTION more weight than
# groupings.
weight = {"emotion": 2, "gender": 1, "transcription": 1, "size_diff": 1}

# find optimal train and test indices TRAIN_I and TEST_I in DF
# info: dict with goodness of split information
train_i, test_i, info = optimize_traintest_split(
X=df,
y=emotion,
split_on=speaker,
stratify_on=stratif_vars,
weight=weight,
test_size=test_size,
k=k,
seed=seed,
)

# report the test partition only; train_i is not printed
print("test split of DF:")
print(df.iloc[test_i])
print("test split of target variable:")
print(emotion[test_i])
print("goodness of split:")
print(info)
Loading

0 comments on commit 27f99fb

Please sign in to comment.