Apply Viterbi algorithm to predict voiced/unvoiced state of every frame based on confidence array #26

Open · wants to merge 5 commits into master
README.md: 5 additions & 0 deletions

@@ -125,6 +125,11 @@
from scipy.io import wavfile
sr, audio = wavfile.read('/path/to/audiofile.wav')
time, frequency, confidence, activation = crepe.predict(audio, sr, viterbi=True)
```
The Viterbi algorithm can also be used to predict which frames are unvoiced. The following commands will set the frequency of such frames to zero:
```python
is_voiced = crepe.predict_voicing(confidence)
frequency *= is_voiced
```

## Argmax-local Weighted Averaging

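For reference, a minimal end-to-end sketch assembled from the two README snippets above; the audio path is a placeholder:

```python
# Minimal sketch combining the README snippets above; the path is a placeholder.
import crepe
from scipy.io import wavfile

sr, audio = wavfile.read('/path/to/audiofile.wav')
time, frequency, confidence, activation = crepe.predict(audio, sr, viterbi=True)

# Decode the voiced/unvoiced state of every frame from the confidence
# curve, then zero out the frequency of frames predicted as unvoiced.
is_voiced = crepe.predict_voicing(confidence)
frequency *= is_voiced
```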
crepe/__init__.py: 1 addition & 1 deletion

@@ -1,2 +1,2 @@
 from .version import version as __version__
-from .core import get_activation, predict, process_file
+from .core import get_activation, predict, predict_voicing, process_file
crepe/cli.py: 13 additions & 1 deletion

@@ -10,7 +10,8 @@

 def run(filename, output=None, model_capacity='full', viterbi=False,
         save_activation=False, save_plot=False, plot_voicing=False,
-        no_centering=False, step_size=10, verbose=True):
+        apply_voicing=False, no_centering=False, step_size=10,
+        verbose=True):
     """
     Collect the WAV files to process and run the model

@@ -36,6 +37,10 @@ def run(filename, output=None, model_capacity='full', viterbi=False,
        Include a visual representation of the voicing activity detection in
        the plot of the output activation matrix. False by default, only
        relevant if save_plot is True.
    apply_voicing : bool
        Apply the Viterbi algorithm to predict, for every frame, whether it
        is voiced or unvoiced. Zero out unvoiced frames and save the
        resulting frequency array to a .npy file.
    no_centering : bool
        Don't pad the signal, meaning frames will begin at their timestamp
        instead of being centered around their timestamp (which is the
@@ -81,6 +86,7 @@ def run(filename, output=None, model_capacity='full', viterbi=False,
                     save_activation=save_activation,
                     save_plot=save_plot,
                     plot_voicing=plot_voicing,
                     apply_voicing=apply_voicing,
                     step_size=step_size,
                     verbose=verbose)

@@ -143,6 +149,11 @@ def main():
    parser.add_argument('--plot-voicing', '-v', action='store_true',
                        help='Plot the voicing prediction on top of the '
                             'output activation matrix plot')
    parser.add_argument('--apply-voicing', '-P', action='store_true',
                        help='Apply the Viterbi algorithm to predict for '
                             'every frame whether it was voiced or unvoiced. '
                             'Zero out unvoiced frames and save the resulting '
                             'frequency array to a .npy file.')
    parser.add_argument('--no-centering', '-n', action='store_true',
                        help="Don't pad the signal, meaning frames will begin "
                             "at their timestamp instead of being centered "
@@ -168,6 +179,7 @@ def main():
        save_activation=args.save_activation,
        save_plot=args.save_plot,
        plot_voicing=args.plot_voicing,
        apply_voicing=args.apply_voicing,
        no_centering=args.no_centering,
        step_size=args.step_size,
        verbose=not args.quiet)
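As an illustration of the new option, a hypothetical invocation through the Python API; the file name is a placeholder and this assumes the branch is installed:

```python
# Hypothetical usage of the new apply_voicing option; 'audio.wav' is a placeholder.
from crepe.cli import run

# Writes audio.f0.csv as before, plus audio.voiced_frequency.npy in which
# frames predicted as unvoiced have their frequency set to zero.
# The equivalent CLI call would use the --apply-voicing (-P) flag added above.
run('audio.wav', apply_voicing=True)
```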
crepe/core.py: 57 additions & 1 deletion

@@ -152,6 +152,45 @@ def to_viterbi_cents(salience):
                     range(len(observations))])


def predict_voicing(confidence):
    """
    Find the Viterbi path for voiced versus unvoiced frames.

    Parameters
    ----------
    confidence : np.ndarray [shape=(N,)]
        voicing confidence array, i.e. the confidence in the presence of
        a pitch

    Returns
    -------
    voicing_states : np.ndarray [shape=(N,)]
        HMM predictions for each frame's state: 0 if unvoiced, 1 if
        voiced
    """
    from hmmlearn import hmm

    # uniform prior on the voicing state
    starting = np.array([0.5, 0.5])

    # transition probabilities that induce continuity in the voicing state
    transition = np.array([[0.99, 0.01], [0.01, 0.99]])

    # mean and variance of the confidence for the unvoiced and voiced states
    means = np.array([[0.0], [1.0]])
    variances = np.array([[0.25], [0.25]])

    # fix the model parameters because we are not optimizing the model
    model = hmm.GaussianHMM(n_components=2)
    model.startprob_ = starting
    model.transmat_ = transition
    model.means_ = means
    model.covars_ = variances
    model.n_features = 1

    # find the Viterbi path
    voicing_states = model.predict(confidence.reshape(-1, 1), [len(confidence)])

    return np.array(voicing_states)


def get_activation(audio, sr, model_capacity='full', center=True, step_size=10,
                   verbose=1):
    """
@@ -271,7 +310,8 @@ def predict(audio, sr, model_capacity='full',

 def process_file(file, output=None, model_capacity='full', viterbi=False,
                  center=True, save_activation=False, save_plot=False,
-                 plot_voicing=False, step_size=10, verbose=True):
+                 plot_voicing=False, apply_voicing=False, step_size=10,
+                 verbose=True):
     """
     Use the input model to perform pitch estimation on the input file.

@@ -300,6 +340,10 @@ def process_file(file, output=None, model_capacity='full', viterbi=False,
        Include a visual representation of the voicing activity detection in
        the plot of the output activation matrix. False by default, only
        relevant if save_plot is True.
    apply_voicing : bool
        Apply the Viterbi algorithm to predict, for every frame, whether it
        is voiced or unvoiced. Zero out unvoiced frames and save the
        resulting frequency array to a .npy file.
    step_size : int
        The step size in milliseconds for running pitch estimation.
    verbose : bool
@@ -323,6 +367,18 @@ def process_file(file, output=None, model_capacity='full', viterbi=False,
        step_size=step_size,
        verbose=1 * verbose)

    # predict voiced and unvoiced states, zero out unvoiced frames, and
    # save the resulting frequency array to a .npy file
    if apply_voicing:
        is_voiced = predict_voicing(confidence)
        voiced_frequency = frequency * is_voiced
        voiced_frequency_path = output_path(file, ".voiced_frequency.npy",
                                            output)
        np.save(voiced_frequency_path, voiced_frequency)
        if verbose:
            print("CREPE: Saved the voiced frequency array at {}".format(
                voiced_frequency_path))

    # write prediction as TSV
    f0_file = output_path(file, ".f0.csv", output)
    f0_data = np.vstack([time, frequency, confidence]).transpose()
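Finally, a hypothetical sketch of consuming the saved array; the base file name is a placeholder, while the `.voiced_frequency.npy` suffix comes from the diff above:

```python
# Hypothetical: load the array written when apply_voicing is enabled.
import numpy as np

voiced_frequency = np.load('audio.voiced_frequency.npy')
# Unvoiced frames are exactly zero, so the nonzero entries are the
# pitch estimates for frames predicted as voiced.
voiced_only = voiced_frequency[voiced_frequency > 0]
```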