diff --git a/CHANGELOG.md b/CHANGELOG.md index 74091f22b..a8fba12ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,6 +36,7 @@ Clipping and speaker/source alignment issues in speech separation pipeline have - fix(separation): fix clipping issue in speech separation pipeline ([@joonaskalda](https://github.com/joonaskalda/)) - fix(separation): fix alignment between separated sources and diarization ([@Lebourdais](https://github.com/Lebourdais/) and [@clement-pages](https://github.com/clement-pages/)) +- fix(separation): prevent leakage removal collar from being applied to diarization ([@clement-pages](https://github.com/clement-pages/)) - fix(doc): fix link to pytorch ([@emmanuel-ferdman](https://github.com/emmanuel-ferdman/)) - fix(task): fix corner case with small (<9) number of validation samples ([@antoinelaurent](https://github.com/antoinelaurent/)) diff --git a/pyannote/audio/pipelines/speech_separation.py b/pyannote/audio/pipelines/speech_separation.py index 0ffe42a0d..28986e4e6 100644 --- a/pyannote/audio/pipelines/speech_separation.py +++ b/pyannote/audio/pipelines/speech_separation.py @@ -632,15 +632,16 @@ def apply( ) ) if asr_collar_frames > 0: + dilated_speaker_activations = np.zeros_like(discrete_diarization.data) for i in range(num_speakers): speaker_activation = discrete_diarization.data.T[i] non_silent = speaker_activation != 0 dilated_non_silent = binary_dilation(non_silent, [True] * (2 * asr_collar_frames)) - speaker_activation_with_context = dilated_non_silent.astype(np.int8) - discrete_diarization.data.T[i] = speaker_activation_with_context + dilated_speaker_activations.T[i] = dilated_non_silent.astype(np.int8) + dilated_speaker_activations = SlidingWindowFeature(dilated_speaker_activations, discrete_diarization.sliding_window) sources.data = ( - sources.data * discrete_diarization.align(sources).data + sources.data * dilated_speaker_activations.align(sources).data ) # separated sources might be scaled up/down due to SI-SDR loss used when training