Commit fa07de8

Merge pull request w-okada#371 from nadare881/RVC_improve

Add inference specialized for realtime use (リアルタイム特化推論の追加)

w-okada authored Jul 4, 2023
2 parents 3742642 + 7b25133 commit fa07de8
Showing 23 changed files with 249 additions and 84 deletions.
55 changes: 38 additions & 17 deletions server/voice_changer/RVC/RVC.py
@@ -23,7 +23,7 @@

from voice_changer.RVC.RVCSettings import RVCSettings
from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
-from voice_changer.utils.VoiceChangerModel import AudioInOut, VoiceChangerModel
+from voice_changer.utils.VoiceChangerModel import AudioInOut, PitchfInOut, FeatureInOut, VoiceChangerModel
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
from voice_changer.RVC.onnxExporter.export2onnx import export2onnx
from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
@@ -46,6 +46,8 @@ def __init__(self, params: VoiceChangerParams, slotInfo: RVCModelSlot):
self.pipeline: Pipeline | None = None

self.audio_buffer: AudioInOut | None = None
+self.pitchf_buffer: PitchfInOut | None = None
+self.feature_buffer: FeatureInOut | None = None
self.prevVol = 0.0
self.slotInfo = slotInfo
self.initialize()
@@ -99,49 +101,65 @@ def generate_input(
):
newData = newData.astype(np.float32) / 32768.0  # input arrives at the RVC model's sampling rate (extraDataLength, crossfade, etc. are handled at the same SR) (★1)

+new_feature_length = newData.shape[0] * 100 // self.slotInfo.samplingRate
if self.audio_buffer is not None:
# concatenate onto the past data
self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)
+if self.slotInfo.f0:
+self.pitchf_buffer = np.concatenate([self.pitchf_buffer, np.zeros(new_feature_length)], 0)
+self.feature_buffer = np.concatenate([self.feature_buffer, np.zeros([new_feature_length, self.slotInfo.embChannels])], 0)
else:
self.audio_buffer = newData
+if self.slotInfo.f0:
+self.pitchf_buffer = np.zeros(new_feature_length)
+self.feature_buffer = np.zeros([new_feature_length, self.slotInfo.embChannels])

convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize

if convertSize % 128 != 0:  # pad, since truncation occurs at the model's output hop size
convertSize = convertSize + (128 - (convertSize % 128))
+outSize = convertSize - self.settings.extraConvertSize

# pad with zeros when the buffer has not filled up yet
if self.audio_buffer.shape[0] < convertSize:
self.audio_buffer = np.concatenate([np.zeros([convertSize]), self.audio_buffer])
+if self.slotInfo.f0:
+self.pitchf_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate]), self.pitchf_buffer])
+self.feature_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate, self.slotInfo.embChannels]), self.feature_buffer])

convertOffset = -1 * convertSize
+featureOffset = -convertSize * 100 // self.slotInfo.samplingRate
self.audio_buffer = self.audio_buffer[convertOffset:]  # extract only the portion to be converted

-if self.pipeline is not None:
-device = self.pipeline.device
-else:
-device = torch.device("cpu")

-audio_buffer = torch.from_numpy(self.audio_buffer).to(device=device, dtype=torch.float32)

+if self.slotInfo.f0:
+self.pitchf_buffer = self.pitchf_buffer[featureOffset:]
+self.feature_buffer = self.feature_buffer[featureOffset:]

# crop only the output portion and check its volume (TODO: make the muting gradual)
cropOffset = -1 * (inputSize + crossfadeSize)
cropEnd = -1 * (crossfadeSize)
-crop = audio_buffer[cropOffset:cropEnd]
-vol = torch.sqrt(torch.square(crop).mean()).detach().cpu().numpy()
+crop = self.audio_buffer[cropOffset:cropEnd]
+vol = np.sqrt(np.square(crop).mean())
vol = max(vol, self.prevVol * 0.0)
self.prevVol = vol

-return (audio_buffer, convertSize, vol)
+return (self.audio_buffer, self.pitchf_buffer, self.feature_buffer, convertSize, vol, outSize)

def inference(self, data):
audio = data[0]
-convertSize = data[1]
-vol = data[2]
+pitchf = data[1]
+feature = data[2]
+convertSize = data[3]
+vol = data[4]
+outSize = data[5]

if vol < self.settings.silentThreshold:
-return np.zeros(convertSize).astype(np.int16)
+return np.zeros(convertSize).astype(np.int16) * np.sqrt(vol)

+if self.pipeline is not None:
+device = self.pipeline.device
+else:
+device = torch.device("cpu")
+audio = torch.from_numpy(audio).to(device=device, dtype=torch.float32)
audio = torchaudio.functional.resample(audio, self.slotInfo.samplingRate, 16000, rolloff=0.99)
repeat = 1 if self.settings.rvcQuality else 0
sid = 0
@@ -154,17 +172,20 @@ def inference(self, data):
useFinalProj = self.slotInfo.useFinalProj

try:
-audio_out = self.pipeline.exec(
+audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec(
sid,
audio,
+pitchf,
+feature,
f0_up_key,
index_rate,
if_f0,
-self.settings.extraConvertSize / self.slotInfo.samplingRate,  # duration of extraDataSize in seconds; processed at the RVC model's sampling rate (★1)
+self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0.,  # duration of extraDataSize in seconds; processed at the RVC model's sampling rate (★1)
embOutputLayer,
useFinalProj,
repeat,
protect,
+outSize
)
result = audio_out.detach().cpu().numpy() * np.sqrt(vol)

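An editorial aside, not part of the commit: the size bookkeeping in generate_input above can be checked in isolation. A minimal sketch with illustrative values (the 40000 Hz rate and buffer sizes are assumptions, not taken from the diff) showing convertSize rounded up to a multiple of the 128-sample output hop, outSize excluding the extra context, and the 100-frames-per-second feature-length arithmetic:

# Sketch: sanity-checking the buffer arithmetic (all values hypothetical)
samplingRate = 40000
inputSize, crossfadeSize, solaSearchFrame, extraConvertSize = 4096, 2048, 300, 8192

convertSize = inputSize + crossfadeSize + solaSearchFrame + extraConvertSize
if convertSize % 128 != 0:  # round up to the next multiple of the 128-sample hop
    convertSize += 128 - (convertSize % 128)
outSize = convertSize - extraConvertSize

# pitchf/feature buffers run at 100 frames per second of audio
feature_length = convertSize * 100 // samplingRate

print(convertSize, outSize, feature_length)  # 14720 6528 36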
5 changes: 3 additions & 2 deletions server/voice_changer/RVC/inferencer/OnnxRVCInferencer.py
@@ -35,6 +35,7 @@ def infer(
pitch: torch.Tensor,
pitchf: torch.Tensor,
sid: torch.Tensor,
+convert_length: int | None,
) -> torch.Tensor:
if pitch is None or pitchf is None:
raise RuntimeError("[Voice Changer] Pitch or Pitchf is not found.")
@@ -50,7 +51,7 @@
"p_len": pitch_length.cpu().numpy().astype(np.int64),
"pitch": pitch.cpu().numpy().astype(np.int64),
"pitchf": pitchf.cpu().numpy().astype(np.float32),
"sid": sid.cpu().numpy().astype(np.int64),
"sid": sid.cpu().numpy().astype(np.int64)
},
)
else:
@@ -61,7 +62,7 @@
"p_len": pitch_length.cpu().numpy().astype(np.int64),
"pitch": pitch.cpu().numpy().astype(np.int64),
"pitchf": pitchf.cpu().numpy().astype(np.float32),
"sid": sid.cpu().numpy().astype(np.int64),
"sid": sid.cpu().numpy().astype(np.int64)
},
)

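Before the remaining inferencer diffs: every infer implementation touched by this PR gains the same trailing convert_length parameter, as the following file diffs show. A minimal sketch of that shared interface as a typing.Protocol (the Protocol itself is illustrative; the repo's actual Inferencer base class is not part of this diff):

from typing import Protocol

import torch

class InferencerLike(Protocol):
    # Common infer signature after this PR; the Nono (no-f0) variants
    # receive pitch/pitchf as None and ignore them.
    def infer(
        self,
        feats: torch.Tensor,
        pitch_length: torch.Tensor,
        pitch: torch.Tensor | None,
        pitchf: torch.Tensor | None,
        sid: torch.Tensor,
        convert_length: int | None,
    ) -> torch.Tensor:
        ...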
server/voice_changer/RVC/inferencer/OnnxRVCInferencerNono.py
@@ -4,7 +4,6 @@

from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInferencer


class OnnxRVCInferencerNono(OnnxRVCInferencer):
def loadModel(self, file: str, gpu: int):
super().loadModel(file, gpu)
@@ -18,6 +17,7 @@ def infer(
pitch: torch.Tensor | None,
pitchf: torch.Tensor | None,
sid: torch.Tensor,
+convert_length: int | None,
) -> torch.Tensor:
if self.isHalf:
audio1 = self.model.run(
Expand Down
3 changes: 2 additions & 1 deletion server/voice_changer/RVC/inferencer/RVCInferencer.py
@@ -33,5 +33,6 @@ def infer(
pitch: torch.Tensor,
pitchf: torch.Tensor,
sid: torch.Tensor,
+convert_length: int | None,
) -> torch.Tensor:
-return self.model.infer(feats, pitch_length, pitch, pitchf, sid)
+return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length)
3 changes: 2 additions & 1 deletion server/voice_changer/RVC/inferencer/RVCInferencerNono.py
@@ -33,5 +33,6 @@ def infer(
pitch: torch.Tensor | None,
pitchf: torch.Tensor | None,
sid: torch.Tensor,
+convert_length: int | None,
) -> torch.Tensor:
-return self.model.infer(feats, pitch_length, sid)
+return self.model.infer(feats, pitch_length, sid, convert_length=convert_length)
3 changes: 2 additions & 1 deletion server/voice_changer/RVC/inferencer/RVCInferencerv2.py
@@ -32,5 +32,6 @@ def infer(
pitch: torch.Tensor,
pitchf: torch.Tensor,
sid: torch.Tensor,
+convert_length: int | None,
) -> torch.Tensor:
-return self.model.infer(feats, pitch_length, pitch, pitchf, sid)
+return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length)
3 changes: 2 additions & 1 deletion server/voice_changer/RVC/inferencer/RVCInferencerv2Nono.py
@@ -33,5 +33,6 @@ def infer(
pitch: torch.Tensor | None,
pitchf: torch.Tensor | None,
sid: torch.Tensor,
+convert_length: int | None,
) -> torch.Tensor:
-return self.model.infer(feats, pitch_length, sid)
+return self.model.infer(feats, pitch_length, sid, convert_length=convert_length)
1 change: 1 addition & 0 deletions server/voice_changer/RVC/inferencer/VorasInferencebeta.py
@@ -35,5 +35,6 @@ def infer(
pitch: torch.Tensor,
pitchf: torch.Tensor,
sid: torch.Tensor,
+convert_length: int | None,
) -> torch.Tensor:
return self.model.infer(feats, pitch_length, pitch, pitchf, sid)
3 changes: 2 additions & 1 deletion server/voice_changer/RVC/inferencer/WebUIInferencer.py
@@ -33,5 +33,6 @@ def infer(
pitch: torch.Tensor,
pitchf: torch.Tensor,
sid: torch.Tensor,
+convert_length: int | None,
) -> torch.Tensor:
-return self.model.infer(feats, pitch_length, pitch, pitchf, sid)
+return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length)
3 changes: 2 additions & 1 deletion server/voice_changer/RVC/inferencer/WebUIInferencerNono.py
@@ -33,5 +33,6 @@ def infer(
pitch: torch.Tensor | None,
pitchf: torch.Tensor | None,
sid: torch.Tensor,
+convert_length: int | None,
) -> torch.Tensor:
-return self.model.infer(feats, pitch_length, sid)
+return self.model.infer(feats, pitch_length, sid, convert_length=convert_length)
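The inferencer changes above are mechanical: each PyTorch-backed infer forwards convert_length to its model as a keyword argument, and the Nono variants additionally drop pitch/pitchf. A hypothetical call site under assumed shapes (nothing below comes from the diff):

import torch

# Assumed shapes: batch of 1, 36 feature frames, 256-dim embedder features
feats = torch.zeros(1, 36, 256)
pitch_length = torch.LongTensor([36])
pitch = torch.zeros(1, 36, dtype=torch.long)
pitchf = torch.zeros(1, 36)
sid = torch.LongTensor([0])
outSize = 6528  # samples the crossfade/output stage actually needs

# f0 model:   inferencer.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=outSize)
# Nono model: inferencer.infer(feats, pitch_length, None, None, sid, convert_length=outSize)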
9 changes: 5 additions & 4 deletions server/voice_changer/RVC/inferencer/models.py
@@ -129,15 +129,16 @@ def forward(self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds):
o = self.dec(z_slice, pitchf, g=g)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

-def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
+def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None, convert_length=None):
g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
-o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
+o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], nsff0, g=g, convert_length=convert_length)
return o, x_mask, (z, z_p, m_p, logs_p)



class SynthesizerTrnMsNSFsidNono(nn.Module):
def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, spk_embed_dim, gin_channels, emb_channels, sr=None, **kwargs):
super().__init__()
@@ -208,10 +209,10 @@ def forward(self, phone, phone_lengths, y, y_lengths, ds):
o = self.dec(z_slice, g=g)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

-def infer(self, phone, phone_lengths, sid, max_len=None):
+def infer(self, phone, phone_lengths, sid, max_len=None, convert_length=None):
g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
-o = self.dec((z * x_mask)[:, :, :max_len], g=g)
+o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=convert_length)
return o, x_mask, (z, z_p, m_p, logs_p)
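infer_realtime itself is defined on the decoder, which changes elsewhere in this commit and is not shown on this page. A sketch of the presumed idea, under the assumption that the decoder upsamples each latent frame to a fixed number of waveform samples: when convert_length is given, only the trailing frames needed for that many output samples are vocoded. Everything below, including the helper name and the 128-sample hop, is illustrative:

import torch

def realtime_decode(dec, z_masked: torch.Tensor, convert_length: int | None,
                    upsample: int = 128) -> torch.Tensor:
    # z_masked: (batch, channels, frames) latent from the flow, already masked
    if convert_length is None:
        return dec(z_masked)  # decode the whole buffer
    n_frames = -(-convert_length // upsample)  # ceil division
    return dec(z_masked[:, :, -n_frames:])  # decode only the tail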
(The remaining changed files in this commit are not shown here.)