From 92cd38448682120fe1b9650acf7f33dcaf10bbfd Mon Sep 17 00:00:00 2001 From: nadare <1na2da0re3@gmail.com> Date: Sat, 1 Jul 2023 12:06:14 +0900 Subject: [PATCH 1/3] draft commit --- server/voice_changer/RVC/RVC.py | 7 +- .../RVC/inferencer/RVCInferencerv2.py | 4 +- server/voice_changer/RVC/inferencer/models.py | 8 +- .../rvc_models/infer_pack/models.py | 124 ++++++++++++++++-- server/voice_changer/RVC/pipeline/Pipeline.py | 3 +- 5 files changed, 130 insertions(+), 16 deletions(-) diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py index 884663d5a..f520e26f1 100644 --- a/server/voice_changer/RVC/RVC.py +++ b/server/voice_changer/RVC/RVC.py @@ -109,6 +109,7 @@ def generate_input( if convertSize % 128 != 0: # モデルの出力のホップサイズで切り捨てが発生するので補う。 convertSize = convertSize + (128 - (convertSize % 128)) + outSize = convertSize - self.settings.extraConvertSize # バッファがたまっていない場合はzeroで補う if self.audio_buffer.shape[0] < convertSize: @@ -132,15 +133,16 @@ def generate_input( vol = max(vol, self.prevVol * 0.0) self.prevVol = vol - return (audio_buffer, convertSize, vol) + return (audio_buffer, convertSize, vol, outSize) def inference(self, data): audio = data[0] convertSize = data[1] vol = data[2] + outSize = data[3] if vol < self.settings.silentThreshold: - return np.zeros(convertSize).astype(np.int16) + return np.zeros(convertSize).astype(np.int16) * np.sqrt(vol) audio = torchaudio.functional.resample(audio, self.slotInfo.samplingRate, 16000, rolloff=0.99) repeat = 1 if self.settings.rvcQuality else 0 @@ -165,6 +167,7 @@ def inference(self, data): useFinalProj, repeat, protect, + outSize ) result = audio_out.detach().cpu().numpy() * np.sqrt(vol) diff --git a/server/voice_changer/RVC/inferencer/RVCInferencerv2.py b/server/voice_changer/RVC/inferencer/RVCInferencerv2.py index 23007a688..d9fa7e77e 100644 --- a/server/voice_changer/RVC/inferencer/RVCInferencerv2.py +++ b/server/voice_changer/RVC/inferencer/RVCInferencerv2.py @@ -3,6 +3,7 @@ from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager from voice_changer.RVC.inferencer.Inferencer import Inferencer from .rvc_models.infer_pack.models import SynthesizerTrnMs768NSFsid +from typing import Optional class RVCInferencerv2(Inferencer): @@ -32,5 +33,6 @@ def infer( pitch: torch.Tensor, pitchf: torch.Tensor, sid: torch.Tensor, + out_length: Optional[int] = None, ) -> torch.Tensor: - return self.model.infer(feats, pitch_length, pitch, pitchf, sid) + return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=out_length) diff --git a/server/voice_changer/RVC/inferencer/models.py b/server/voice_changer/RVC/inferencer/models.py index 400fe70ba..7b80bb74f 100644 --- a/server/voice_changer/RVC/inferencer/models.py +++ b/server/voice_changer/RVC/inferencer/models.py @@ -129,12 +129,12 @@ def forward(self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds): # 这 o = self.dec(z_slice, pitchf, g=g) return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): + def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None, convert_length=None): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], nsff0, g=g, convert_length=convert_length) 
return o, x_mask, (z, z_p, m_p, logs_p) @@ -208,10 +208,10 @@ def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[b o = self.dec(z_slice, g=g) return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - def infer(self, phone, phone_lengths, sid, max_len=None): + def infer(self, phone, phone_lengths, sid, max_len=None, out_length=None): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], g=g) + o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=out_length) return o, x_mask, (z, z_p, m_p, logs_p) diff --git a/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/models.py b/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/models.py index de38ae4c5..fc7814d3f 100644 --- a/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/models.py +++ b/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/models.py @@ -232,6 +232,25 @@ def __init__( if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + # Compute the minimum size required for estimating real-time speech conversion. + self.realtime = False + if resblock != "1": + self.realtime = True + self.ups_size = [0 for _ in range(len(self.ups))] + # conv_post + self.ups_size[-1] += 3 + + for i in range(len(self.ups)-1, -1, -1): + for k, d in zip(resblock_kernel_sizes[::-1], resblock_dilation_sizes[::-1]): + # conv2 + self.ups_size[i] += (k - 1)//2 + # conv1 + self.ups_size[i] += d * (k - 1)//2 + # upsampling + self.ups_size[i] = -(-self.ups_size[i] // upsample_rates[i]) + (upsample_kernel_sizes[i] - upsample_rates[i]) // 2 + if i: + self.ups_size[i-1] = self.ups_size[i] + 0 + def forward(self, x, g=None): x = self.conv_pre(x) if g is not None: @@ -253,6 +272,35 @@ def forward(self, x, g=None): return x + def infer_realtime(self, x, g=None, convert_length=None): + out_length = x.shape[2] * np.prod(self.upsample_rates) + if convert_length is None: + convert_length = x.shape[2] * np.prod(self.upsample_rates) + + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + if self.realtime: + x = x[:, :, -self.ups_size[i] + (-convert_length // np.prod(self.upsample_rates[i:])):] + x = F.leaky_relu(x, LRELU_SLOPE) + x = self.ups[i](x) + + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + out = torch.zeros([x.shape[0], 1, x.shape[0] * np.prod(self.upsample_rates)], device=x.device, dtype=x.dtype) + out[:, :, -x.shape[2]:] = x[:, :, -out.shape[2]:] + return out + def remove_weight_norm(self): for l in self.ups: remove_weight_norm(l) @@ -404,6 +452,7 @@ def __init__( super(GeneratorNSF, self).__init__() self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) + self.upsample_rates = upsample_rates self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) self.m_source = SourceModuleHnNSF(sampling_rate=sr, harmonic_num=0, is_half=is_half) @@ -453,6 +502,29 @@ def __init__( self.upp = np.prod(upsample_rates) + # Compute the minimum size required for estimating real-time speech conversion. 
+ self.realtime = False + if resblock != "1": + self.realtime = True + self.ups_size = [0 for _ in range(len(self.ups))] + self.noise_conv_size = [0 for _ in range(len(self.ups))] + # conv_post + self.ups_size[-1] += 3 + + for i in range(len(self.ups)-1, -1, -1): + for k, d in zip(resblock_kernel_sizes[::-1], resblock_dilation_sizes[::-1]): + # conv2 + self.ups_size[i] += (k - 1)//2 + # conv1 + self.ups_size[i] += d[-1] * (k - 1)//2 + # noise_conv + self.noise_conv_size[i] = self.ups_size[i] * np.prod(upsample_rates[i:]) + # upsampling + + self.ups_size[i] = -(-self.ups_size[i] // upsample_rates[i]) + (upsample_kernel_sizes[i] - upsample_rates[i]) // 2 + if i: + self.ups_size[i-1] = self.ups_size[i] + 0 + def forward(self, x, f0, g=None): har_source, noi_source, uv = self.m_source(f0, self.upp) har_source = har_source.transpose(1, 2) @@ -477,6 +549,42 @@ def forward(self, x, f0, g=None): x = torch.tanh(x) return x + def infer_realtime(self, x, f0, g=None, convert_length=None): + out_length = x.shape[2] * np.prod(self.upsample_rates) + if convert_length is None: + convert_length = x.shape[2] * np.prod(self.upsample_rates) + + har_source, noi_source, uv = self.m_source(f0, self.upp) + har_source = har_source.transpose(1, 2) + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + if self.realtime: + x = x[:, :, -self.ups_size[i] + (-convert_length // np.prod(self.upsample_rates[i:])):] + x = F.leaky_relu(x, LRELU_SLOPE) + x_ = self.ups[i](x) + x_source = self.noise_convs[i](har_source[:, :, -convert_length - self.noise_conv_size[i]:]) + x = torch.zeros([x_.shape[0], x_.shape[1], max(x_.shape[2], x_source.shape[2])], device=x.device, dtype=x.dtype) + x[:, :, -x_.shape[2]:] += x_ + x[:, :, -x_source.shape[2]:] += x_source + + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + out = torch.zeros([x.shape[0], 1, out_length], device=x.device, dtype=x.dtype) + out[:, :, -x.shape[2]:] = x[:, :, -out.shape[2]:] + + return x #out + def remove_weight_norm(self): for l in self.ups: remove_weight_norm(l) @@ -566,12 +674,12 @@ def forward(self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds): # 这 o = self.dec(z_slice, pitchf, g=g) return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): + def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None, convert_length=None): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], nsff0, g=g, convert_length=convert_length) return o, x_mask, (z, z_p, m_p, logs_p) @@ -650,12 +758,12 @@ def forward(self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds): # 这 o = self.dec(z_slice, pitchf, g=g) return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): + def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None, convert_length=None): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, 
phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], nsff0, g=g, convert_length=convert_length) return o, x_mask, (z, z_p, m_p, logs_p) @@ -727,12 +835,12 @@ def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[b o = self.dec(z_slice, g=g) return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - def infer(self, phone, phone_lengths, sid, max_len=None): + def infer(self, phone, phone_lengths, sid, max_len=None, convert_length=None): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], g=g) + o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=convert_length) return o, x_mask, (z, z_p, m_p, logs_p) @@ -804,12 +912,12 @@ def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[b o = self.dec(z_slice, g=g) return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - def infer(self, phone, phone_lengths, sid, max_len=None): + def infer(self, phone, phone_lengths, sid, max_len=None, convert_length=None): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], g=g) + o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=convert_length) return o, x_mask, (z, z_p, m_p, logs_p) diff --git a/server/voice_changer/RVC/pipeline/Pipeline.py b/server/voice_changer/RVC/pipeline/Pipeline.py index 4e24acb10..6bf789157 100644 --- a/server/voice_changer/RVC/pipeline/Pipeline.py +++ b/server/voice_changer/RVC/pipeline/Pipeline.py @@ -79,6 +79,7 @@ def exec( useFinalProj, repeat, protect=0.5, + out_size=None, ): # 16000のサンプリングレートで入ってきている。以降この世界は16000で処理。 @@ -206,7 +207,7 @@ def exec( with autocast(enabled=self.isHalf): audio1 = ( torch.clip( - self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0].to(dtype=torch.float32), + self.inferencer.infer(feats, p_len, pitch, pitchf, sid, out_size)[0][0, 0].to(dtype=torch.float32), -1.0, 1.0, ) From 5a5f7feefd0f62ef94166e6bd00ddab3a7b655b0 Mon Sep 17 00:00:00 2001 From: nadare <1na2da0re3@gmail.com> Date: Sat, 1 Jul 2023 16:45:25 +0900 Subject: [PATCH 2/3] =?UTF-8?q?inference=E3=81=AE=E9=AB=98=E5=93=81?= =?UTF-8?q?=E8=B3=AA=E5=8C=96+=E9=AB=98=E9=80=9F=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- server/voice_changer/RVC/RVC.py | 52 +++++++++++++------ .../RVC/inferencer/OnnxRVCInferencer.py | 5 +- .../RVC/inferencer/OnnxRVCInferencerNono.py | 2 +- .../RVC/inferencer/RVCInferencer.py | 3 +- .../RVC/inferencer/RVCInferencerNono.py | 3 +- .../RVC/inferencer/RVCInferencerv2.py | 5 +- .../RVC/inferencer/RVCInferencerv2Nono.py | 3 +- .../RVC/inferencer/VorasInferencebeta.py | 1 + .../RVC/inferencer/WebUIInferencer.py | 3 +- .../RVC/inferencer/WebUIInferencerNono.py | 3 +- server/voice_changer/RVC/inferencer/models.py | 5 +- .../rvc_models/infer_pack/models.py | 5 +- .../SynthesizerTrnMs256NSFsid_ONNX.py | 6 ++- .../SynthesizerTrnMs256NSFsid_nono_ONNX.py | 6 +-- .../SynthesizerTrnMs768NSFsid_ONNX.py | 
4 +- .../SynthesizerTrnMs768NSFsid_nono_ONNX.py | 4 +- .../SynthesizerTrnMsNSFsidNono_webui_ONNX.py | 4 +- .../SynthesizerTrnMsNSFsid_webui_ONNX.py | 5 +- server/voice_changer/RVC/pipeline/Pipeline.py | 43 ++++++++++----- .../RVC/pitchExtractor/CrepePitchExtractor.py | 13 ++--- .../RVC/pitchExtractor/DioPitchExtractor.py | 22 ++++---- .../pitchExtractor/HarvestPitchExtractor.py | 19 +++---- .../voice_changer/utils/VoiceChangerModel.py | 3 ++ 23 files changed, 134 insertions(+), 85 deletions(-) diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py index f520e26f1..c46bae297 100644 --- a/server/voice_changer/RVC/RVC.py +++ b/server/voice_changer/RVC/RVC.py @@ -23,7 +23,7 @@ from voice_changer.RVC.RVCSettings import RVCSettings from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager -from voice_changer.utils.VoiceChangerModel import AudioInOut, VoiceChangerModel +from voice_changer.utils.VoiceChangerModel import AudioInOut, PitchfInOut, FeatureInOut, VoiceChangerModel from voice_changer.utils.VoiceChangerParams import VoiceChangerParams from voice_changer.RVC.onnxExporter.export2onnx import export2onnx from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager @@ -46,6 +46,8 @@ def __init__(self, params: VoiceChangerParams, slotInfo: RVCModelSlot): self.pipeline: Pipeline | None = None self.audio_buffer: AudioInOut | None = None + self.pitchf_buffer: PitchfInOut | None = None + self.feature_buffer: FeatureInOut | None = None self.prevVol = 0.0 self.slotInfo = slotInfo self.initialize() @@ -99,11 +101,18 @@ def generate_input( ): newData = newData.astype(np.float32) / 32768.0 # RVCのモデルのサンプリングレートで入ってきている。(extraDataLength, Crossfade等も同じSRで処理)(★1) + new_feature_length = newData.shape[0] * 100 // self.slotInfo.samplingRate if self.audio_buffer is not None: # 過去のデータに連結 self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0) + if self.slotInfo.f0: + self.pitchf_buffer = np.concatenate([self.pitchf_buffer, np.zeros(new_feature_length)], 0) + self.feature_buffer = np.concatenate([self.feature_buffer, np.zeros([new_feature_length, self.slotInfo.embChannels])], 0) else: self.audio_buffer = newData + if self.slotInfo.f0: + self.pitchf_buffer = np.zeros(new_feature_length) + self.feature_buffer = np.zeros([new_feature_length, self.slotInfo.embChannels]) convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize @@ -114,36 +123,43 @@ def generate_input( # バッファがたまっていない場合はzeroで補う if self.audio_buffer.shape[0] < convertSize: self.audio_buffer = np.concatenate([np.zeros([convertSize]), self.audio_buffer]) + if self.slotInfo.f0: + self.pitchf_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate]), self.pitchf_buffer]) + self.feature_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate, self.slotInfo.embChannels]), self.feature_buffer]) convertOffset = -1 * convertSize + featureOffset = -convertSize * 100 // self.slotInfo.samplingRate self.audio_buffer = self.audio_buffer[convertOffset:] # 変換対象の部分だけ抽出 - - if self.pipeline is not None: - device = self.pipeline.device - else: - device = torch.device("cpu") - - audio_buffer = torch.from_numpy(self.audio_buffer).to(device=device, dtype=torch.float32) - + if self.slotInfo.f0: + self.pitchf_buffer = self.pitchf_buffer[featureOffset:] + self.feature_buffer = self.feature_buffer[featureOffset:] + # 出力部分だけ切り出して音量を確認。(TODO:段階的消音にする) cropOffset = -1 * (inputSize + crossfadeSize) cropEnd = -1 * 
(crossfadeSize) - crop = audio_buffer[cropOffset:cropEnd] - vol = torch.sqrt(torch.square(crop).mean()).detach().cpu().numpy() + crop = self.audio_buffer[cropOffset:cropEnd] + vol = np.sqrt(np.square(crop).mean()) vol = max(vol, self.prevVol * 0.0) self.prevVol = vol - return (audio_buffer, convertSize, vol, outSize) + return (self.audio_buffer, self.pitchf_buffer, self.feature_buffer, convertSize, vol, outSize) def inference(self, data): audio = data[0] - convertSize = data[1] - vol = data[2] - outSize = data[3] + pitchf = data[1] + feature = data[2] + convertSize = data[3] + vol = data[4] + outSize = data[5] if vol < self.settings.silentThreshold: return np.zeros(convertSize).astype(np.int16) * np.sqrt(vol) + if self.pipeline is not None: + device = self.pipeline.device + else: + device = torch.device("cpu") + audio = torch.from_numpy(audio).to(device=device, dtype=torch.float32) audio = torchaudio.functional.resample(audio, self.slotInfo.samplingRate, 16000, rolloff=0.99) repeat = 1 if self.settings.rvcQuality else 0 sid = 0 @@ -156,13 +172,15 @@ def inference(self, data): useFinalProj = self.slotInfo.useFinalProj try: - audio_out = self.pipeline.exec( + audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec( sid, audio, + pitchf, + feature, f0_up_key, index_rate, if_f0, - self.settings.extraConvertSize / self.slotInfo.samplingRate, # extaraDataSizeの秒数。RVCのモデルのサンプリングレートで処理(★1)。 + self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0., # extaraDataSizeの秒数。RVCのモデルのサンプリングレートで処理(★1)。 embOutputLayer, useFinalProj, repeat, diff --git a/server/voice_changer/RVC/inferencer/OnnxRVCInferencer.py b/server/voice_changer/RVC/inferencer/OnnxRVCInferencer.py index e9bfeec75..9d1c62a11 100644 --- a/server/voice_changer/RVC/inferencer/OnnxRVCInferencer.py +++ b/server/voice_changer/RVC/inferencer/OnnxRVCInferencer.py @@ -35,6 +35,7 @@ def infer( pitch: torch.Tensor, pitchf: torch.Tensor, sid: torch.Tensor, + convert_length: int | None, ) -> torch.Tensor: if pitch is None or pitchf is None: raise RuntimeError("[Voice Changer] Pitch or Pitchf is not found.") @@ -50,7 +51,7 @@ def infer( "p_len": pitch_length.cpu().numpy().astype(np.int64), "pitch": pitch.cpu().numpy().astype(np.int64), "pitchf": pitchf.cpu().numpy().astype(np.float32), - "sid": sid.cpu().numpy().astype(np.int64), + "sid": sid.cpu().numpy().astype(np.int64) }, ) else: @@ -61,7 +62,7 @@ def infer( "p_len": pitch_length.cpu().numpy().astype(np.int64), "pitch": pitch.cpu().numpy().astype(np.int64), "pitchf": pitchf.cpu().numpy().astype(np.float32), - "sid": sid.cpu().numpy().astype(np.int64), + "sid": sid.cpu().numpy().astype(np.int64) }, ) diff --git a/server/voice_changer/RVC/inferencer/OnnxRVCInferencerNono.py b/server/voice_changer/RVC/inferencer/OnnxRVCInferencerNono.py index 7c6f81366..d85d42923 100644 --- a/server/voice_changer/RVC/inferencer/OnnxRVCInferencerNono.py +++ b/server/voice_changer/RVC/inferencer/OnnxRVCInferencerNono.py @@ -4,7 +4,6 @@ from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInferencer - class OnnxRVCInferencerNono(OnnxRVCInferencer): def loadModel(self, file: str, gpu: int): super().loadModel(file, gpu) @@ -18,6 +17,7 @@ def infer( pitch: torch.Tensor | None, pitchf: torch.Tensor | None, sid: torch.Tensor, + convert_length: int | None, ) -> torch.Tensor: if self.isHalf: audio1 = self.model.run( diff --git a/server/voice_changer/RVC/inferencer/RVCInferencer.py b/server/voice_changer/RVC/inferencer/RVCInferencer.py index 
fb6368fed..9539d77cc 100644 --- a/server/voice_changer/RVC/inferencer/RVCInferencer.py +++ b/server/voice_changer/RVC/inferencer/RVCInferencer.py @@ -33,5 +33,6 @@ def infer( pitch: torch.Tensor, pitchf: torch.Tensor, sid: torch.Tensor, + convert_length: int | None, ) -> torch.Tensor: - return self.model.infer(feats, pitch_length, pitch, pitchf, sid) + return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length) diff --git a/server/voice_changer/RVC/inferencer/RVCInferencerNono.py b/server/voice_changer/RVC/inferencer/RVCInferencerNono.py index a796fcbd4..30355b9ab 100644 --- a/server/voice_changer/RVC/inferencer/RVCInferencerNono.py +++ b/server/voice_changer/RVC/inferencer/RVCInferencerNono.py @@ -33,5 +33,6 @@ def infer( pitch: torch.Tensor | None, pitchf: torch.Tensor | None, sid: torch.Tensor, + convert_length: int | None, ) -> torch.Tensor: - return self.model.infer(feats, pitch_length, sid) + return self.model.infer(feats, pitch_length, sid, convert_length=convert_length) diff --git a/server/voice_changer/RVC/inferencer/RVCInferencerv2.py b/server/voice_changer/RVC/inferencer/RVCInferencerv2.py index d9fa7e77e..31fbe4844 100644 --- a/server/voice_changer/RVC/inferencer/RVCInferencerv2.py +++ b/server/voice_changer/RVC/inferencer/RVCInferencerv2.py @@ -3,7 +3,6 @@ from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager from voice_changer.RVC.inferencer.Inferencer import Inferencer from .rvc_models.infer_pack.models import SynthesizerTrnMs768NSFsid -from typing import Optional class RVCInferencerv2(Inferencer): @@ -33,6 +32,6 @@ def infer( pitch: torch.Tensor, pitchf: torch.Tensor, sid: torch.Tensor, - out_length: Optional[int] = None, + convert_length: int | None, ) -> torch.Tensor: - return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=out_length) + return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length) diff --git a/server/voice_changer/RVC/inferencer/RVCInferencerv2Nono.py b/server/voice_changer/RVC/inferencer/RVCInferencerv2Nono.py index 9aac3260d..7b85dc969 100644 --- a/server/voice_changer/RVC/inferencer/RVCInferencerv2Nono.py +++ b/server/voice_changer/RVC/inferencer/RVCInferencerv2Nono.py @@ -33,5 +33,6 @@ def infer( pitch: torch.Tensor | None, pitchf: torch.Tensor | None, sid: torch.Tensor, + convert_length: int | None, ) -> torch.Tensor: - return self.model.infer(feats, pitch_length, sid) + return self.model.infer(feats, pitch_length, sid, convert_length=convert_length) diff --git a/server/voice_changer/RVC/inferencer/VorasInferencebeta.py b/server/voice_changer/RVC/inferencer/VorasInferencebeta.py index a5b02f40f..e7b77cceb 100644 --- a/server/voice_changer/RVC/inferencer/VorasInferencebeta.py +++ b/server/voice_changer/RVC/inferencer/VorasInferencebeta.py @@ -35,5 +35,6 @@ def infer( pitch: torch.Tensor, pitchf: torch.Tensor, sid: torch.Tensor, + convert_length: int | None, ) -> torch.Tensor: return self.model.infer(feats, pitch_length, pitch, pitchf, sid) diff --git a/server/voice_changer/RVC/inferencer/WebUIInferencer.py b/server/voice_changer/RVC/inferencer/WebUIInferencer.py index eb3d442a7..d884f4c70 100644 --- a/server/voice_changer/RVC/inferencer/WebUIInferencer.py +++ b/server/voice_changer/RVC/inferencer/WebUIInferencer.py @@ -33,5 +33,6 @@ def infer( pitch: torch.Tensor, pitchf: torch.Tensor, sid: torch.Tensor, + convert_length: int | None, ) -> torch.Tensor: - return self.model.infer(feats, pitch_length, pitch, pitchf, sid) + return 
self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length) diff --git a/server/voice_changer/RVC/inferencer/WebUIInferencerNono.py b/server/voice_changer/RVC/inferencer/WebUIInferencerNono.py index f64484438..7bc54f140 100644 --- a/server/voice_changer/RVC/inferencer/WebUIInferencerNono.py +++ b/server/voice_changer/RVC/inferencer/WebUIInferencerNono.py @@ -33,5 +33,6 @@ def infer( pitch: torch.Tensor | None, pitchf: torch.Tensor | None, sid: torch.Tensor, + convert_length: int | None, ) -> torch.Tensor: - return self.model.infer(feats, pitch_length, sid) + return self.model.infer(feats, pitch_length, sid, convert_length=convert_length) diff --git a/server/voice_changer/RVC/inferencer/models.py b/server/voice_changer/RVC/inferencer/models.py index 7b80bb74f..ac14a6711 100644 --- a/server/voice_changer/RVC/inferencer/models.py +++ b/server/voice_changer/RVC/inferencer/models.py @@ -138,6 +138,7 @@ def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None, convert_l return o, x_mask, (z, z_p, m_p, logs_p) + class SynthesizerTrnMsNSFsidNono(nn.Module): def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, spk_embed_dim, gin_channels, emb_channels, sr=None, **kwargs): super().__init__() @@ -208,10 +209,10 @@ def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[b o = self.dec(z_slice, g=g) return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - def infer(self, phone, phone_lengths, sid, max_len=None, out_length=None): + def infer(self, phone, phone_lengths, sid, max_len=None, convert_length=None): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=out_length) + o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=convert_length) return o, x_mask, (z, z_p, m_p, logs_p) diff --git a/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/models.py b/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/models.py index fc7814d3f..abd2dbb84 100644 --- a/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/models.py +++ b/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/models.py @@ -203,6 +203,7 @@ def __init__( super(Generator, self).__init__() self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) + self.upsample_rates = upsample_rates self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) resblock = ResBlock1 if resblock == "1" else ResBlock2 @@ -245,7 +246,7 @@ def __init__( # conv2 self.ups_size[i] += (k - 1)//2 # conv1 - self.ups_size[i] += d * (k - 1)//2 + self.ups_size[i] += d[-1] * (k - 1)//2 # upsampling self.ups_size[i] = -(-self.ups_size[i] // upsample_rates[i]) + (upsample_kernel_sizes[i] - upsample_rates[i]) // 2 if i: @@ -297,7 +298,7 @@ def infer_realtime(self, x, g=None, convert_length=None): x = F.leaky_relu(x) x = self.conv_post(x) x = torch.tanh(x) - out = torch.zeros([x.shape[0], 1, x.shape[0] * np.prod(self.upsample_rates)], device=x.device, dtype=x.dtype) + out = torch.zeros([x.shape[0], 1, out_length], device=x.device, dtype=x.dtype) out[:, :, 
-x.shape[2]:] = x[:, :, -out.shape[2]:] return out diff --git a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs256NSFsid_ONNX.py b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs256NSFsid_ONNX.py index c3c6121f7..827a955c6 100644 --- a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs256NSFsid_ONNX.py +++ b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs256NSFsid_ONNX.py @@ -58,10 +58,12 @@ def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) - def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): + def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None, convert_length=None): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], nsff0, g=g, convert_length=convert_length) return o, x_mask, (z, z_p, m_p, logs_p) + + diff --git a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs256NSFsid_nono_ONNX.py b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs256NSFsid_nono_ONNX.py index d200478f7..db216d456 100644 --- a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs256NSFsid_nono_ONNX.py +++ b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs256NSFsid_nono_ONNX.py @@ -57,10 +57,10 @@ def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) - def forward(self, phone, phone_lengths, sid, max_len=None): + def forward(self, phone, phone_lengths, sid, max_len=None, convert_length=None): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], g=g) - return o, x_mask, (z, z_p, m_p, logs_p) + o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=convert_length) + return o, x_mask, (z, z_p, m_p, logs_p) \ No newline at end of file diff --git a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs768NSFsid_ONNX.py b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs768NSFsid_ONNX.py index ca1d30966..a7193febe 100644 --- a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs768NSFsid_ONNX.py +++ b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs768NSFsid_ONNX.py @@ -59,10 +59,10 @@ def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) - def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): + def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None, convert_length=None): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], nsff0, g=g, convert_length=convert_length) return o, 
x_mask, (z, z_p, m_p, logs_p) diff --git a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs768NSFsid_nono_ONNX.py b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs768NSFsid_nono_ONNX.py index 1971c3a3d..d6d8365ee 100644 --- a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs768NSFsid_nono_ONNX.py +++ b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs768NSFsid_nono_ONNX.py @@ -81,10 +81,10 @@ def __init__( print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) - def forward(self, phone, phone_lengths, sid, max_len=None): + def forward(self, phone, phone_lengths, sid, max_len=None, convert_length=None): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], g=g) + o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=convert_length) return o, x_mask, (z, z_p, m_p, logs_p) diff --git a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMsNSFsidNono_webui_ONNX.py b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMsNSFsidNono_webui_ONNX.py index 2be4d28b8..36ad2ebe4 100644 --- a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMsNSFsidNono_webui_ONNX.py +++ b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMsNSFsidNono_webui_ONNX.py @@ -60,10 +60,10 @@ def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) - def forward(self, phone, phone_lengths, sid, max_len=None): + def forward(self, phone, phone_lengths, sid, max_len=None, convert_length=None): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], g=g) + o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=convert_length) return o, x_mask, (z, z_p, m_p, logs_p) diff --git a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMsNSFsid_webui_ONNX.py b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMsNSFsid_webui_ONNX.py index 61b719013..421f3ddd1 100644 --- a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMsNSFsid_webui_ONNX.py +++ b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMsNSFsid_webui_ONNX.py @@ -61,10 +61,11 @@ def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) - def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): + def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None, convert_length=None): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], nsff0, g=g, convert_length=convert_length) return o, x_mask, (z, z_p, m_p, logs_p) + diff --git a/server/voice_changer/RVC/pipeline/Pipeline.py b/server/voice_changer/RVC/pipeline/Pipeline.py index 6bf789157..4c321f268 
100644 --- a/server/voice_changer/RVC/pipeline/Pipeline.py +++ b/server/voice_changer/RVC/pipeline/Pipeline.py @@ -13,6 +13,9 @@ from voice_changer.RVC.embedder.Embedder import Embedder from voice_changer.RVC.inferencer.Inferencer import Inferencer +from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInferencer +from voice_changer.RVC.inferencer.OnnxRVCInferencerNono import OnnxRVCInferencerNono + from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor @@ -70,7 +73,9 @@ def setPitchExtractor(self, pitchExtractor: PitchExtractor): def exec( self, sid, - audio, + audio, # torch.tensor [n] + pitchf, # np.array [m] + feature, # np.array [m, feat] f0_up_key, index_rate, if_f0, @@ -98,13 +103,14 @@ def exec( # RVC QualityがOnのときにはsilence_frontをオフに。 silence_front = silence_front if repeat == 0 else 0 + pitchf = pitchf if repeat == 0 else torch.zeros([pitchf.shape[0], pitchf.shape[1] * 2]) # ピッチ検出 - pitch, pitchf = None, None try: if if_f0 == 1: pitch, pitchf = self.pitchExtractor.extract( audio_pad, + pitchf, f0_up_key, self.sr, self.window, @@ -114,6 +120,9 @@ def exec( pitchf = pitchf[:p_len] pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() pitchf = torch.tensor(pitchf, device=self.device, dtype=torch.float).unsqueeze(0) + else: + pitch = None + pitchf = None except IndexError: # print(e) raise NotEnoughDataExtimateF0() @@ -165,9 +174,8 @@ def exec( npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) # recover silient font - npy = np.concatenate([np.zeros([npyOffset, npy.shape[1]]).astype("float32"), npy]) + npy = np.concatenate([np.zeros([npyOffset, npy.shape[1]], dtype=np.float32), feature[:npyOffset:2].astype("float32"), npy])[-feats.shape[1]:] feats = torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats - feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) if protect < 0.5 and search_index: feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) @@ -192,14 +200,21 @@ def exec( feats = feats.to(feats0.dtype) p_len = torch.tensor([p_len], device=self.device).long() + feats_buffer = feats.squeeze(0).detach().cpu() + if pitchf is not None: + pitchf_buffer = pitchf.squeeze(0).detach().cpu() + else: + pitchf_buffer = None # apply silent front for inference - npyOffset = math.floor(silence_front * 16000) // 360 - feats = feats[:, npyOffset * 2 :, :] - feats_len = feats.shape[1] - if pitch is not None and pitchf is not None: - pitch = pitch[:, -feats_len:] - pitchf = pitchf[:, -feats_len:] - p_len = torch.tensor([feats_len], device=self.device).long() + if type(self.inferencer) in [OnnxRVCInferencer, OnnxRVCInferencerNono]: + npyOffset = math.floor(silence_front * 16000) // 360 + feats = feats[:, npyOffset * 2 :, :] + feats_len = feats.shape[1] + if pitch is not None and pitchf is not None: + pitch = pitch[:, -feats_len:] + pitchf = pitchf[:, -feats_len:] + p_len = torch.tensor([feats_len], device=self.device).long() + # 推論実行 try: @@ -220,7 +235,7 @@ def exec( else: raise e - del feats, p_len, padding_mask + del p_len, padding_mask, pitch, pitchf, feats torch.cuda.empty_cache() # inferで出力されるサンプリングレートはモデルのサンプリングレートになる。 @@ -230,6 +245,6 @@ def exec( end = -1 * self.t_pad_tgt audio1 = audio1[offset:end] - del pitch, pitchf, sid + del sid torch.cuda.empty_cache() - return audio1 + return audio1, pitchf_buffer, feats_buffer diff --git a/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py 
b/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py index 2f5104111..39a30e3b4 100644 --- a/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py +++ b/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py @@ -16,7 +16,7 @@ def __init__(self): else: self.device = torch.device("cpu") - def extract(self, audio, f0_up_key, sr, window, silence_front=0): + def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0): n_frames = int(len(audio) // window) + 1 start_frame = int(silence_front * sr / window) real_silence_front = start_frame * window / sr @@ -52,11 +52,12 @@ def extract(self, audio, f0_up_key, sr, window, silence_front=0): ) f0 *= pow(2, f0_up_key / 12) - f0bak = f0.detach().cpu().numpy() - f0_mel = 1127.0 * torch.log(1.0 + f0 / 700.0) - f0_mel = torch.clip( + pitchf[-f0.shape[0]:] = f0.detach().cpu().numpy()[:pitchf.shape[0]] + f0bak = pitchf.copy() + f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0) + f0_mel = np.clip( (f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0 ) - f0_coarse = f0_mel.round().detach().cpu().numpy().astype(int) + pitch_coarse = f0_mel.astype(int) - return f0_coarse, f0bak + return pitch_coarse, pitchf diff --git a/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py index 4ef62203c..b6a520e34 100644 --- a/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py +++ b/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py @@ -8,7 +8,7 @@ class DioPitchExtractor(PitchExtractor): pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.dio - def extract(self, audio, f0_up_key, sr, window, silence_front=0): + def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0): audio = audio.detach().cpu().numpy() n_frames = int(len(audio) // window) + 1 start_frame = int(silence_front * sr / window) @@ -34,13 +34,13 @@ def extract(self, audio, f0_up_key, sr, window, silence_front=0): f0 = np.pad(f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame)) f0 *= pow(2, f0_up_key / 12) - f0bak = f0.copy() - f0_mel = 1127 * np.log(1 + f0 / 700) - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( - f0_mel_max - f0_mel_min - ) + 1 - f0_mel[f0_mel <= 1] = 1 - f0_mel[f0_mel > 255] = 255 - f0_coarse = np.rint(f0_mel).astype(int) - - return f0_coarse, f0bak + pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]] + f0bak = pitchf.copy() + f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0) + f0_mel = np.clip( + (f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0 + ) + pitch_coarse = f0_mel.astype(int) + + return pitch_coarse, pitchf + diff --git a/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py index 9e166b7ce..11c27b168 100644 --- a/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py +++ b/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py @@ -9,7 +9,7 @@ class HarvestPitchExtractor(PitchExtractor): pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.harvest - def extract(self, audio, f0_up_key, sr, window, silence_front=0): + def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0): audio = audio.detach().cpu().numpy() n_frames = int(len(audio) // window) + 1 start_frame = int(silence_front * sr / window) @@ -35,13 +35,14 @@ def extract(self, audio, f0_up_key, sr, window, silence_front=0): f0 = np.pad(f0.astype("float"), (start_frame, 
n_frames - len(f0) - start_frame)) f0 *= pow(2, f0_up_key / 12) - f0bak = f0.copy() - f0_mel = 1127 * np.log(1 + f0 / 700) - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( - f0_mel_max - f0_mel_min - ) + 1 - f0_mel[f0_mel <= 1] = 1 - f0_mel[f0_mel > 255] = 255 - f0_coarse = np.rint(f0_mel).astype(int) + pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]] + f0bak = pitchf.copy() + f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0) + f0_mel = np.clip( + (f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0 + ) + pitch_coarse = f0_mel.astype(int) + + return pitch_coarse, pitchf return f0_coarse, f0bak diff --git a/server/voice_changer/utils/VoiceChangerModel.py b/server/voice_changer/utils/VoiceChangerModel.py index 7dd4fda98..e28690ac5 100644 --- a/server/voice_changer/utils/VoiceChangerModel.py +++ b/server/voice_changer/utils/VoiceChangerModel.py @@ -5,6 +5,9 @@ AudioInOut: TypeAlias = np.ndarray[Any, np.dtype[np.int16]] +PitchfInOut: TypeAlias = np.ndarray[Any, np.dtype[np.int16]] +FeatureInOut: TypeAlias = np.ndarray[Any, np.dtype[np.int16]] + class VoiceChangerModel(Protocol): From 7b251339f173cd0d47d007a4074f45e5fed9793a Mon Sep 17 00:00:00 2001 From: nadare <1na2da0re3@gmail.com> Date: Sat, 1 Jul 2023 16:56:06 +0900 Subject: [PATCH 3/3] fix pitch --- .../RVC/pitchExtractor/DioPitchExtractor.py | 14 ++++++++------ .../RVC/pitchExtractor/HarvestPitchExtractor.py | 16 ++++++++-------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py index b6a520e34..65aa2ef70 100644 --- a/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py +++ b/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py @@ -31,16 +31,18 @@ def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0): frame_period=10, ) f0 = pyworld.stonemask(audio.astype(np.double), _f0, t, sr) - f0 = np.pad(f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame)) + # f0 = np.pad(f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame)) f0 *= pow(2, f0_up_key / 12) pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]] f0bak = pitchf.copy() - f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0) - f0_mel = np.clip( - (f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0 - ) - pitch_coarse = f0_mel.astype(int) + f0_mel = 1127 * np.log(1 + f0bak / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( + f0_mel_max - f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + pitch_coarse = np.rint(f0_mel).astype(int) return pitch_coarse, pitchf diff --git a/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py index 11c27b168..53c3f8933 100644 --- a/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py +++ b/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py @@ -32,17 +32,17 @@ def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0): f0 = pyworld.stonemask(audio.astype(np.double), f0, t, sr) f0 = signal.medfilt(f0, 3) - f0 = np.pad(f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame)) + # f0 = np.pad(f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame)) f0 *= pow(2, f0_up_key / 12) pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]] f0bak = pitchf.copy() - f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0) - f0_mel = np.clip( - (f0_mel - f0_mel_min) * 254.0 / 
(f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0 - ) - pitch_coarse = f0_mel.astype(int) + f0_mel = 1127 * np.log(1 + f0bak / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( + f0_mel_max - f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + pitch_coarse = np.rint(f0_mel).astype(int) return pitch_coarse, pitchf - - return f0_coarse, f0bak
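Taken together, the three patches above teach the RVC real-time path to decode only the audio that will actually be played back and to carry pitch/feature context between streaming chunks. The sketches below restate the key pieces in isolation. First, the conversion-window arithmetic from PATCH 1 (RVC.generate_input): the window is rounded up to a multiple of the decoder hop size so nothing is truncated at the model output, and outSize is the part of the window that excludes the extra left context. The function name and the example numbers are illustrative, not the project's API.

def plan_conversion(input_size, crossfade_size, sola_search_frame,
                    extra_convert_size, hop_size=128):
    # total samples handed to the model for this chunk
    convert_size = input_size + crossfade_size + sola_search_frame + extra_convert_size
    if convert_size % hop_size != 0:
        # pad up to the next hop boundary so the decoder does not drop the tail
        convert_size += hop_size - (convert_size % hop_size)
    # samples that actually need to be synthesized (extra context is discarded)
    out_size = convert_size - extra_convert_size
    return convert_size, out_size

# 15000 samples would be cut at a 128-sample hop, so the window is padded to 15104
print(plan_conversion(4000, 2000, 1000, 8000))  # -> (15104, 7104)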
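PATCH 1 also precomputes, per upsampling stage of the HiFi-GAN-style generator, how many extra past frames must be kept so that trimming the latent does not change the tail of the waveform. The stand-alone function below mirrors that recurrence (with the d[-1] correction PATCH 2 applies to Generator): each stage's margin is the receptive-field half-width of its residual blocks plus the conv_post lookback, carried backwards through a ceil division by the stage's upsample rate. The 40 kHz v2 configuration passed at the bottom is an assumption used only to make the sketch runnable.

def minimum_context_per_stage(upsample_rates, upsample_kernel_sizes,
                              resblock_kernel_sizes, resblock_dilation_sizes):
    ups_size = [0] * len(upsample_rates)
    ups_size[-1] += 3  # conv_post (kernel 7, padding 3) needs 3 frames of lookback
    for i in range(len(upsample_rates) - 1, -1, -1):
        for k, d in zip(resblock_kernel_sizes[::-1], resblock_dilation_sizes[::-1]):
            ups_size[i] += (k - 1) // 2           # second conv of the resblock
            ups_size[i] += d[-1] * (k - 1) // 2   # first, dilated conv
        # carry the margin through this stage's transposed convolution (ceil division)
        ups_size[i] = -(-ups_size[i] // upsample_rates[i]) \
            + (upsample_kernel_sizes[i] - upsample_rates[i]) // 2
        if i:
            ups_size[i - 1] = ups_size[i]
    return ups_size

print(minimum_context_per_stage(
    upsample_rates=[10, 10, 2, 2],
    upsample_kernel_sizes=[16, 16, 4, 4],
    resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
))  # -> [10, 13, 43, 30]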
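With those margins, infer_realtime can drop everything that no longer influences the requested output: before stage i it keeps only the last ceil(convert_length / prod(upsample_rates[i:])) frames plus ups_size[i], and at the end it zero-pads the result to the full output length and overwrites the tail. The helper below reproduces just the slice arithmetic (the real method also runs the convolutions and the final padding); the numbers are illustrative.

import numpy as np

def frames_kept_per_stage(convert_length, upsample_rates, ups_size):
    kept = []
    for i, margin in enumerate(ups_size):
        remaining = int(np.prod(upsample_rates[i:]))           # samples produced per frame from stage i onward
        kept.append(-(-convert_length // remaining) + margin)  # ceil division plus the stage margin
    return kept

# 14000 output samples at 40 kHz, with the stage margins computed above
print(frames_kept_per_stage(14000, [10, 10, 2, 2], [10, 13, 43, 30]))
# -> [45, 363, 3543, 7030] frames kept before stages 0..3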
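PATCH 2 additionally keeps pitchf and feature buffers that advance in lockstep with the audio buffer, at 100 frames per second of model-rate audio, so values already estimated for the extra-context region can be reused instead of recomputed. The class below is a reduced sketch of that bookkeeping, not the project's class; the 48 kHz rate and 768 channels are placeholders and the cold-start padding is simplified.

import numpy as np

class RollingBuffers:
    """Keep audio, per-frame pitch and features aligned across streaming chunks."""

    def __init__(self, sampling_rate=48000, emb_channels=768):
        self.sr = sampling_rate
        self.emb_channels = emb_channels
        self.audio = np.zeros(0, dtype=np.float32)
        self.pitchf = np.zeros(0, dtype=np.float32)
        self.feature = np.zeros((0, emb_channels), dtype=np.float32)

    def push(self, new_audio, convert_size):
        new_frames = new_audio.shape[0] * 100 // self.sr   # 100 feature frames per second
        self.audio = np.concatenate([self.audio, new_audio])
        self.pitchf = np.concatenate([self.pitchf, np.zeros(new_frames, dtype=np.float32)])
        self.feature = np.concatenate(
            [self.feature, np.zeros((new_frames, self.emb_channels), dtype=np.float32)])

        feature_size = convert_size * 100 // self.sr
        # cold start: left-pad with silence so the conversion window is always full
        pad = max(0, convert_size - self.audio.shape[0])
        fpad = max(0, feature_size - self.pitchf.shape[0])
        self.audio = np.concatenate([np.zeros(pad, dtype=np.float32), self.audio])[-convert_size:]
        self.pitchf = np.concatenate([np.zeros(fpad, dtype=np.float32), self.pitchf])[-feature_size:]
        self.feature = np.concatenate(
            [np.zeros((fpad, self.emb_channels), dtype=np.float32), self.feature])[-feature_size:]
        return self.audio, self.pitchf, self.feature

buf = RollingBuffers()
for _ in range(3):
    audio, pitchf, feature = buf.push(np.zeros(4800, dtype=np.float32), convert_size=48128)
print(audio.shape, pitchf.shape, feature.shape)  # (48128,) (100,) (100, 768)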
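Finally, the pitch extractors now write their Hz-valued f0 into the shared pitchf buffer and derive the coarse 1-255 labels from that buffer. PATCH 3 restores the masked mel mapping: only voiced frames are rescaled, unvoiced frames (f0 == 0) stay at bin 1, and np.rint is used again instead of truncation. The stand-alone function below reproduces that mapping; the f0_min/f0_max defaults are the usual RVC values and are an assumption here.

import numpy as np

def coarse_pitch(f0_hz, f0_min=50.0, f0_max=1100.0):
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)
    f0_mel = 1127 * np.log(1 + np.asarray(f0_hz, dtype=float) / 700)
    voiced = f0_mel > 0                       # unvoiced frames (f0 == 0) are left untouched
    f0_mel[voiced] = (f0_mel[voiced] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
    f0_mel[f0_mel <= 1] = 1                   # floor for unvoiced and out-of-range low values
    f0_mel[f0_mel > 255] = 255                # ceiling for out-of-range high values
    return np.rint(f0_mel).astype(int)

print(coarse_pitch([0.0, 110.0, 220.0, 440.0]))  # -> [  1  23  60 122]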