From 36a1f0213411c2d251c2c18b2b1401171a6beebc Mon Sep 17 00:00:00 2001
From: Anupam Maurya
Date: Tue, 23 Jan 2024 22:09:05 +0530
Subject: [PATCH 1/2] Update audio.py

---
 maha_tts/utils/audio.py | 70 ++++++++---------------------------------
 1 file changed, 13 insertions(+), 57 deletions(-)

diff --git a/maha_tts/utils/audio.py b/maha_tts/utils/audio.py
index 715b6b2..7029952 100644
--- a/maha_tts/utils/audio.py
+++ b/maha_tts/utils/audio.py
@@ -1,109 +1,65 @@
 import torch
 import numpy as np
-import librosa.util as librosa_util
-
 from scipy.signal import get_window
 from scipy.io.wavfile import read
-from maha_tts.config import config
 
 TACOTRON_MEL_MAX = 2.4
 TACOTRON_MEL_MIN = -11.5130
 
-def denormalize_tacotron_mel(norm_mel):
-    return ((norm_mel+1)/2)*(TACOTRON_MEL_MAX-TACOTRON_MEL_MIN)+TACOTRON_MEL_MIN
-
-
-def normalize_tacotron_mel(mel):
-    return 2 * ((mel - TACOTRON_MEL_MIN) / (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN)) - 1
+def normalize_and_denormalize_tacotron_mel(mel, denormalize=True):
+    if denormalize:
+        return ((mel + 1) / 2) * (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN) + TACOTRON_MEL_MIN
+    else:
+        return 2 * ((mel - TACOTRON_MEL_MIN) / (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN)) - 1
 
 def get_mask_from_lengths(lengths, max_len=None):
-    if not max_len:
-        max_len = torch.max(lengths).item()
+    max_len = max_len or torch.max(lengths).item()
     ids = torch.arange(0, max_len, device=lengths.device, dtype=torch.long)
     mask = (ids < lengths.unsqueeze(1)).bool()
     return mask
 
 def get_mask(lengths, max_len=None):
-    if not max_len:
-        max_len = torch.max(lengths).item()
-    lens = torch.arange(max_len,)
+    max_len = max_len or torch.max(lengths).item()
+    lens = torch.arange(max_len)
     mask = lens[:max_len].unsqueeze(0) < lengths.unsqueeze(1)
     return mask
 
-
 def dynamic_range_compression(x, C=1, clip_val=1e-5):
-    """
-    PARAMS
-    ------
-    C: compression factor
-    """
     return torch.log(torch.clamp(x, min=clip_val) * C)
 
 def dynamic_range_decompression(x, C=1):
-    """
-    PARAMS
-    ------
-    C: compression factor used to compress
-    """
     return torch.exp(x) / C
 
 def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
                      n_fft=800, dtype=np.float32, norm=None):
-    """
-    # from librosa 0.6
-    Compute the sum-square envelope of a window function at a given hop length.
-    This is used to estimate modulation effects induced by windowing
-    observations in short-time fourier transforms.
-    Parameters
-    ----------
-    window : string, tuple, number, callable, or list-like
-        Window specification, as in `get_window`
-    n_frames : int > 0
-        The number of analysis frames
-    hop_length : int > 0
-        The number of samples to advance between frames
-    win_length : [optional]
-        The length of the window function. By default, this matches `n_fft`.
-    n_fft : int > 0
-        The length of each analysis frame.
-    dtype : np.dtype
-        The data type of the output
-    Returns
-    -------
-    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
-        The sum-squared envelope of the window function
-    """
     if win_length is None:
         win_length = n_fft
 
     n = n_fft + hop_length * (n_frames - 1)
     x = np.zeros(n, dtype=dtype)
 
-    # Compute the squared window at the desired length
     win_sq = get_window(window, win_length, fftbins=True)
-    win_sq = librosa_util.normalize(win_sq, norm=norm)**2
-    win_sq = librosa_util.pad_center(win_sq, size=n_fft)
+    win_sq = np.square(np.pad(win_sq, (0, n_fft - len(win_sq))))
 
-    # Fill the envelope
     for i in range(n_frames):
         sample = i * hop_length
         x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
     return x
 
+
 def load_wav_to_torch(full_path):
-    sampling_rate, data = read(full_path,)
+    sampling_rate, data = read(full_path)
     return torch.FloatTensor(data), sampling_rate
 
-
 if __name__ == "__main__":
     lens = torch.tensor([2, 3, 7, 5, 4])
-    mask = get_mask(lens)
+    mask = get_mask(lens)
     print(mask)
-    print(mask.shape)
\ No newline at end of file
+    print(mask.shape)

From 13bf1e496bab4f3a50228d41b53d53570bf51b95 Mon Sep 17 00:00:00 2001
From: Anupam Maurya
Date: Tue, 6 Feb 2024 09:31:47 +0530
Subject: [PATCH 2/2] Update audio.py

---
 maha_tts/utils/audio.py | 81 ++++++++++++++++++++++++++++++++---------
 1 file changed, 64 insertions(+), 17 deletions(-)

diff --git a/maha_tts/utils/audio.py b/maha_tts/utils/audio.py
index 7029952..000c4ad 100644
--- a/maha_tts/utils/audio.py
+++ b/maha_tts/utils/audio.py
@@ -6,58 +6,105 @@
 TACOTRON_MEL_MAX = 2.4
 TACOTRON_MEL_MIN = -11.5130
 
+def denormalize_tacotron_mel(norm_mel):
+    return ((norm_mel+1)/2)*(TACOTRON_MEL_MAX-TACOTRON_MEL_MIN)+TACOTRON_MEL_MIN
 
-def normalize_and_denormalize_tacotron_mel(mel, denormalize=True):
-    if denormalize:
-        return ((mel + 1) / 2) * (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN) + TACOTRON_MEL_MIN
-    else:
-        return 2 * ((mel - TACOTRON_MEL_MIN) / (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN)) - 1
-
-def get_mask_from_lengths(lengths, max_len=None):
-    max_len = max_len or torch.max(lengths).item()
-    ids = torch.arange(0, max_len, device=lengths.device, dtype=torch.long)
-    mask = (ids < lengths.unsqueeze(1)).bool()
-    return mask
+def normalize_tacotron_mel(mel):
+    return 2 * ((mel - TACOTRON_MEL_MIN) / (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN)) - 1
 
 def get_mask(lengths, max_len=None):
+    """
+    Generate a mask for sequences based on lengths.
+
+    Parameters:
+    - lengths: Torch tensor, lengths of sequences
+    - max_len: Optional, maximum length for padding
+
+    Returns:
+    - Torch tensor, mask for sequences
+    """
     max_len = max_len or torch.max(lengths).item()
     lens = torch.arange(max_len)
     mask = lens[:max_len].unsqueeze(0) < lengths.unsqueeze(1)
     return mask
-
 def dynamic_range_compression(x, C=1, clip_val=1e-5):
-    return torch.log(torch.clamp(x, min=clip_val) * C)
+    """
+    Perform dynamic range compression on input tensor.
+
+    Parameters:
+    - x: Torch tensor, input tensor
+    - C: Compression factor
+    - clip_val: Minimum value to clamp input tensor
+
+    Returns:
+    - Torch tensor, compressed tensor
+    """
+    return torch.log(torch.clamp(x, min=clip_val) * C)
 
 def dynamic_range_decompression(x, C=1):
-    return torch.exp(x) / C
+    """
+    Perform dynamic range decompression on input tensor.
+
+    Parameters:
+    - x: Torch tensor, input tensor
+    - C: Compression factor used for compression
+
+    Returns:
+    - Torch tensor, decompressed tensor
+    """
+    return torch.exp(x) / C
 
 def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
                      n_fft=800, dtype=np.float32, norm=None):
+    """
+    Compute the sum-square envelope of a window function at a given hop length.
+
+    Parameters:
+    - window: String, tuple, number, callable, or list-like; window specification
+    - n_frames: Int, number of analysis frames
+    - hop_length: Int, number of samples to advance between frames
+    - win_length: Int, length of the window function
+    - n_fft: Int, length of each analysis frame
+    - dtype: Numpy data type of the output
+    - norm: Normalization type for the window function
+
+    Returns:
+    - Numpy array, sum-squared envelope of the window function
+    """
     if win_length is None:
         win_length = n_fft
 
     n = n_fft + hop_length * (n_frames - 1)
     x = np.zeros(n, dtype=dtype)
 
+    # Compute the squared window at the desired length
     win_sq = get_window(window, win_length, fftbins=True)
-    win_sq = np.square(np.pad(win_sq, (0, n_fft - len(win_sq))))
+    win_sq = np.square(librosa.util.normalize(win_sq, norm=norm))
+    win_sq = librosa.util.pad_center(win_sq, size=n_fft)
 
+    # Fill the envelope
    for i in range(n_frames):
         sample = i * hop_length
         x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
     return x
 
-
 def load_wav_to_torch(full_path):
+    """
+    Load WAV file into Torch tensor.
+
+    Parameters:
+    - full_path: String, path to the WAV file
+
+    Returns:
+    - Torch tensor, audio data
+    - Int, sampling rate
+    """
     sampling_rate, data = read(full_path)
     return torch.FloatTensor(data), sampling_rate
-
 if __name__ == "__main__":
     lens = torch.tensor([2, 3, 7, 5, 4])
     mask = get_mask(lens)
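
A quick way for reviewers to exercise the refactored helpers is sketched below. It is a minimal sketch, assuming the patched module is importable as maha_tts.utils.audio and that librosa is installed (the final window_sumsquare calls librosa.util.normalize and librosa.util.pad_center); the tensor shapes and the "hann" window are illustrative only, not part of the patch.

# Sanity-check sketch for the patched maha_tts/utils/audio.py (assumes the
# package is importable and librosa is available for window_sumsquare).
import torch
from maha_tts.utils.audio import (denormalize_tacotron_mel, get_mask,
                                  normalize_tacotron_mel, window_sumsquare)

# Round-trip an illustrative mel spectrogram through normalize/denormalize.
mel = torch.rand(1, 80, 100) * (2.4 - (-11.5130)) + (-11.5130)
assert torch.allclose(denormalize_tacotron_mel(normalize_tacotron_mel(mel)), mel, atol=1e-5)

# Padding mask for a batch of variable-length sequences.
lengths = torch.tensor([2, 3, 7, 5, 4])
print(get_mask(lengths).shape)  # torch.Size([5, 7]); True where index < length

# Sum-squared Hann-window envelope, e.g. for inverse-STFT normalization.
envelope = window_sumsquare("hann", n_frames=10, hop_length=200, win_length=800, n_fft=800)
print(envelope.shape)  # (n_fft + hop_length * (n_frames - 1),) == (2600,)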