Apply Viterbi algorithm to predict voiced/unvoiced state of every frame based on confidence array #26

Open · wants to merge 5 commits into master
README.md: 5 additions & 0 deletions

@@ -125,6 +125,11 @@
from scipy.io import wavfile
sr, audio = wavfile.read('/path/to/audiofile.wav')
time, frequency, confidence, activation = crepe.predict(audio, sr, viterbi=True)
```
The Viterbi algorithm can also be used to predict which frames are unvoiced. The following commands will set the frequency of such frames to zero:
```python
is_voiced = crepe.predict_voicing(confidence)
frequency *= is_voiced
```

## Argmax-local Weighted Averaging

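For reference, a minimal end-to-end sketch assembled from the two README snippets above; the audio path is a placeholder:

```python
# Minimal sketch combining the README snippets above; the path is a placeholder.
import crepe
from scipy.io import wavfile

sr, audio = wavfile.read('/path/to/audiofile.wav')
time, frequency, confidence, activation = crepe.predict(audio, sr, viterbi=True)

# Decode the voiced/unvoiced state of every frame from the confidence
# curve, then zero out the frequency of frames predicted as unvoiced.
is_voiced = crepe.predict_voicing(confidence)
frequency *= is_voiced
```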
crepe/__init__.py: 1 addition & 1 deletion

@@ -1,2 +1,2 @@
 from .version import version as __version__
-from .core import get_activation, predict, process_file
+from .core import get_activation, predict, predict_voicing, process_file
crepe/cli.py: 13 additions & 1 deletion

@@ -10,7 +10,8 @@

 def run(filename, output=None, model_capacity='full', viterbi=False,
         save_activation=False, save_plot=False, plot_voicing=False,
-        no_centering=False, step_size=10, verbose=True):
+        apply_voicing=False, no_centering=False, step_size=10,
+        verbose=True):
     """
     Collect the WAV files to process and run the model

@@ -36,6 +37,10 @@ def run(filename, output=None, model_capacity='full', viterbi=False,
        Include a visual representation of the voicing activity detection in
        the plot of the output activation matrix. False by default, only
        relevant if save_plot is True.
    apply_voicing : bool
        Apply the Viterbi algorithm to predict, for every frame, whether it
        is voiced or unvoiced. Zero out unvoiced frames and save the
        resulting frequency array to a .npy file.
    no_centering : bool
        Don't pad the signal, meaning frames will begin at their timestamp
        instead of being centered around their timestamp (which is the
@@ -81,6 +86,7 @@ def run(filename, output=None, model_capacity='full', viterbi=False,
                     save_activation=save_activation,
                     save_plot=save_plot,
                     plot_voicing=plot_voicing,
                     apply_voicing=apply_voicing,
                     step_size=step_size,
                     verbose=verbose)

@@ -143,6 +149,11 @@ def main():
    parser.add_argument('--plot-voicing', '-v', action='store_true',
                        help='Plot the voicing prediction on top of the '
                             'output activation matrix plot')
    parser.add_argument('--apply-voicing', '-P', action='store_true',
                        help='Apply the Viterbi algorithm to predict for '
                             'every frame whether it was voiced or unvoiced. '
                             'Zero out unvoiced frames and save the resulting '
                             'frequency array to a .npy file.')
    parser.add_argument('--no-centering', '-n', action='store_true',
                        help="Don't pad the signal, meaning frames will begin "
                             "at their timestamp instead of being centered "
@@ -168,6 +179,7 @@ def main():
        save_activation=args.save_activation,
        save_plot=args.save_plot,
        plot_voicing=args.plot_voicing,
        apply_voicing=args.apply_voicing,
        no_centering=args.no_centering,
        step_size=args.step_size,
        verbose=not args.quiet)
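As an illustration of the new option, a hypothetical invocation through the Python API; the file name is a placeholder and this assumes the branch is installed:

```python
# Hypothetical usage of the new apply_voicing option; 'audio.wav' is a placeholder.
from crepe.cli import run

# Writes audio.f0.csv as before, plus audio.voiced_frequency.npy in which
# frames predicted as unvoiced have their frequency set to zero.
# The equivalent CLI call would use the --apply-voicing (-P) flag added above.
run('audio.wav', apply_voicing=True)
```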
crepe/core.py: 57 additions & 1 deletion

@@ -152,6 +152,45 @@ def to_viterbi_cents(salience):
                     range(len(observations))])


def predict_voicing(confidence):
    """
    Find the Viterbi path for voiced versus unvoiced frames.

    Parameters
    ----------
    confidence : np.ndarray [shape=(N,)]
        voicing confidence array, i.e. the confidence in the presence of
        a pitch

    Returns
    -------
    voicing_states : np.ndarray [shape=(N,)]
        HMM predictions for each frame's state: 0 if unvoiced, 1 if
        voiced
    """
    from hmmlearn import hmm

    # uniform prior on the voicing state
    starting = np.array([0.5, 0.5])

    # transition probabilities that induce continuity in the voicing state
    transition = np.array([[0.99, 0.01], [0.01, 0.99]])

    # mean and variance of the confidence for the unvoiced and voiced states
    means = np.array([[0.0], [1.0]])
    variances = np.array([[0.25], [0.25]])

    # fix the model parameters because we are not optimizing the model
    model = hmm.GaussianHMM(n_components=2)
    model.startprob_ = starting
    model.transmat_ = transition
    model.means_ = means
    model.covars_ = variances
    model.n_features = 1

    # find the Viterbi path
    voicing_states = model.predict(confidence.reshape(-1, 1), [len(confidence)])

    return np.array(voicing_states)


def get_activation(audio, sr, model_capacity='full', center=True, step_size=10,
                   verbose=1):
    """
@@ -271,7 +310,8 @@ def predict(audio, sr, model_capacity='full',

 def process_file(file, output=None, model_capacity='full', viterbi=False,
                  center=True, save_activation=False, save_plot=False,
-                 plot_voicing=False, step_size=10, verbose=True):
+                 plot_voicing=False, apply_voicing=False, step_size=10,
+                 verbose=True):
     """
     Use the input model to perform pitch estimation on the input file.

@@ -300,6 +340,10 @@ def process_file(file, output=None, model_capacity='full', viterbi=False,
        Include a visual representation of the voicing activity detection in
        the plot of the output activation matrix. False by default, only
        relevant if save_plot is True.
    apply_voicing : bool
        Apply the Viterbi algorithm to predict, for every frame, whether it
        is voiced or unvoiced. Zero out unvoiced frames and save the
        resulting frequency array to a .npy file.
    step_size : int
        The step size in milliseconds for running pitch estimation.
    verbose : bool
@@ -323,6 +367,18 @@ def process_file(file, output=None, model_capacity='full', viterbi=False,
        step_size=step_size,
        verbose=1 * verbose)

    # predict voiced and unvoiced states, zero out unvoiced frames, and
    # save the resulting frequency array to a .npy file
    if apply_voicing:
        is_voiced = predict_voicing(confidence)
        voiced_frequency = frequency * is_voiced
        voiced_frequency_path = output_path(file, ".voiced_frequency.npy",
                                            output)
        np.save(voiced_frequency_path, voiced_frequency)
        if verbose:
            print("CREPE: Saved the voiced frequency array at {}".format(
                voiced_frequency_path))

    # write prediction as TSV
    f0_file = output_path(file, ".f0.csv", output)
    f0_data = np.vstack([time, frequency, confidence]).transpose()
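Finally, a hypothetical sketch of consuming the saved array; the base file name is a placeholder, while the `.voiced_frequency.npy` suffix comes from the diff above:

```python
# Hypothetical: load the array written when apply_voicing is enabled.
import numpy as np

voiced_frequency = np.load('audio.voiced_frequency.npy')
# Unvoiced frames are exactly zero, so the nonzero entries are the
# pitch estimates for frames predicted as voiced.
voiced_only = voiced_frequency[voiced_frequency > 0]
```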