diff --git a/README.md b/README.md
index fcd98ee..032a799 100644
--- a/README.md
+++ b/README.md
@@ -125,6 +125,11 @@ from scipy.io import wavfile
 sr, audio = wavfile.read('/path/to/audiofile.wav')
 time, frequency, confidence, activation = crepe.predict(audio, sr, viterbi=True)
 ```
+The Viterbi algorithm can also be used to predict which frames are voiced or unvoiced. The following lines set the frequency of the unvoiced frames to zero:
+```python
+is_voiced = crepe.predict_voicing(confidence)
+frequency *= is_voiced
+```
 
 ## Argmax-local Weighted Averaging
 
diff --git a/crepe/__init__.py b/crepe/__init__.py
index 52eacea..f4d969c 100755
--- a/crepe/__init__.py
+++ b/crepe/__init__.py
@@ -1,2 +1,2 @@
 from .version import version as __version__
-from .core import get_activation, predict, process_file
+from .core import get_activation, predict, predict_voicing, process_file
diff --git a/crepe/cli.py b/crepe/cli.py
index aaa96de..e56fd06 100644
--- a/crepe/cli.py
+++ b/crepe/cli.py
@@ -10,7 +10,8 @@
 
 def run(filename, output=None, model_capacity='full', viterbi=False,
         save_activation=False, save_plot=False, plot_voicing=False,
-        no_centering=False, step_size=10, verbose=True):
+        apply_voicing=False, no_centering=False, step_size=10,
+        verbose=True):
     """
     Collect the WAV files to process and run the model
 
@@ -36,6 +37,10 @@ def run(filename, output=None, model_capacity='full', viterbi=False,
         Include a visual representation of the voicing activity detection
         in the plot of the output activation matrix. False by default, only
         relevant if save_plot is True.
+    apply_voicing : bool
+        Apply the Viterbi algorithm to predict for every frame whether it
+        was voiced or unvoiced. Zero out unvoiced frames and save the
+        resulting frequency array to a .npy file.
     no_centering : bool
         Don't pad the signal, meaning frames will begin at their timestamp
         instead of being centered around their timestamp (which is the
@@ -81,6 +86,7 @@ def run(filename, output=None, model_capacity='full', viterbi=False,
                           save_activation=save_activation,
                           save_plot=save_plot,
                           plot_voicing=plot_voicing,
+                          apply_voicing=apply_voicing,
                           step_size=step_size,
                           verbose=verbose)
 
@@ -143,6 +149,11 @@
     parser.add_argument('--plot-voicing', '-v', action='store_true',
                         help='Plot the voicing prediction on top of the '
                              'output activation matrix plot')
+    parser.add_argument('--apply-voicing', '-P', action='store_true',
+                        help='Apply the Viterbi algorithm to predict for '
+                             'every frame whether it was voiced or unvoiced. '
+                             'Zero out unvoiced frames and save the '
+                             'resulting frequency array to a .npy file.')
     parser.add_argument('--no-centering', '-n', action='store_true',
                         help="Don't pad the signal, meaning frames will begin "
                              "at their timestamp instead of being centered "
@@ -168,6 +179,7 @@
         save_activation=args.save_activation,
         save_plot=args.save_plot,
         plot_voicing=args.plot_voicing,
+        apply_voicing=args.apply_voicing,
         no_centering=args.no_centering,
         step_size=args.step_size,
         verbose=not args.quiet)
diff --git a/crepe/core.py b/crepe/core.py
index fc17c9c..b61f004 100644
--- a/crepe/core.py
+++ b/crepe/core.py
@@ -152,6 +152,45 @@ def to_viterbi_cents(salience):
                      range(len(observations))])
 
 
+def predict_voicing(confidence):
+    """
+    Find the Viterbi path for voiced versus unvoiced frames.
+
+    Parameters
+    ----------
+    confidence : np.ndarray [shape=(N,)]
+        The voicing confidence array, i.e. the confidence in the presence
+        of a pitch.
+
+    Returns
+    -------
+    voicing_states : np.ndarray [shape=(N,)]
+        HMM predictions for the state of each frame: 0 if unvoiced,
+        1 if voiced.
+    """
+    from hmmlearn import hmm
+
+    # uniform prior on the voicing state
+    starting = np.array([0.5, 0.5])
+
+    # transition probabilities favoring a continuous voicing state
+    transition = np.array([[0.99, 0.01], [0.01, 0.99]])
+
+    # mean and variance of the confidence for the unvoiced and voiced states
+    means = np.array([[0.0], [1.0]])
+    variances = np.array([[0.25], [0.25]])
+
+    # fix the model parameters because we are not optimizing the model
+    model = hmm.GaussianHMM(n_components=2)
+    model.startprob_, model.covars_, model.transmat_, model.means_, \
+        model.n_features = starting, variances, transition, means, 1
+
+    # find the Viterbi path
+    voicing_states = model.predict(confidence.reshape(-1, 1), [len(confidence)])
+
+    return np.array(voicing_states)
+
+
 def get_activation(audio, sr, model_capacity='full', center=True,
                    step_size=10, verbose=1):
     """
@@ -271,7 +310,8 @@ def predict(audio, sr, model_capacity='full',
 
 def process_file(file, output=None, model_capacity='full', viterbi=False,
                  center=True, save_activation=False, save_plot=False,
-                 plot_voicing=False, step_size=10, verbose=True):
+                 plot_voicing=False, apply_voicing=False, step_size=10,
+                 verbose=True):
     """
     Use the input model to perform pitch estimation on the input file.
 
@@ -300,6 +340,10 @@ def process_file(file, output=None, model_capacity='full', viterbi=False,
         Include a visual representation of the voicing activity detection
         in the plot of the output activation matrix. False by default, only
         relevant if save_plot is True.
+    apply_voicing : bool
+        Apply the Viterbi algorithm to predict for every frame whether it
+        was voiced or unvoiced. Zero out unvoiced frames and save the
+        resulting frequency array to a .npy file.
     step_size : int
         The step size in milliseconds for running pitch estimation.
     verbose : bool
@@ -323,6 +367,18 @@ def process_file(file, output=None, model_capacity='full', viterbi=False,
         step_size=step_size,
         verbose=1 * verbose)
 
+    # predict voiced and unvoiced states, zero out unvoiced frames, and
+    # save the resulting frequency array to a .npy file
+    if apply_voicing:
+        is_voiced = predict_voicing(confidence)
+        voiced_frequency = frequency * is_voiced
+        voiced_frequency_path = output_path(file, ".voiced_frequency.npy",
+                                            output)
+        np.save(voiced_frequency_path, voiced_frequency)
+        if verbose:
+            print("CREPE: Saved the voiced frequency array at {}".format(
+                voiced_frequency_path))
+
     # write prediction as TSV
     f0_file = output_path(file, ".f0.csv", output)
     f0_data = np.vstack([time, frequency, confidence]).transpose()
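Usage note (not part of the patch): the snippet below is a minimal sketch of how the Python API introduced by this diff could be exercised, assuming the patch is applied and `hmmlearn` is installed; the WAV path and output filename are placeholders. It mirrors what the new `--apply-voicing` CLI flag does internally in `process_file`.

```python
import numpy as np
from scipy.io import wavfile

import crepe

# run the usual CREPE pitch prediction
sr, audio = wavfile.read('/path/to/audiofile.wav')  # placeholder path
time, frequency, confidence, activation = crepe.predict(audio, sr, viterbi=True)

# decode the voicing state sequence from the confidence track and
# zero out the frames predicted as unvoiced
is_voiced = crepe.predict_voicing(confidence)
voiced_frequency = frequency * is_voiced

# save the result, analogous to the .voiced_frequency.npy file written by the CLI
np.save('voiced_frequency.npy', voiced_frequency)  # placeholder output name
```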