Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
wataru committed Oct 28, 2022
1 parent fca3e10 commit 56235cf
Show file tree
Hide file tree
Showing 6 changed files with 4,931 additions and 21 deletions.
55 changes: 48 additions & 7 deletions demo/serverFastAPI.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,19 @@
import numpy as np
from scipy.io.wavfile import write, read

sys.path.append("mod")
sys.path.append("mod/text")
# sys.path.append("mod")
# sys.path.append("mod/text")

sys.path.append("/MMVC_Trainer")
sys.path.append("/MMVC_Trainer/text")

import utils
import commons
from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols

from mel_processing import spectrogram_torch
from text import text_to_sequence, cleaned_text_to_sequence

class VoiceChanger():
def __init__(self, config, model):
Expand Down Expand Up @@ -48,16 +53,52 @@ def on_request(self, gpu, srcId, dstId, timestamp, wav):
try:
if gpu<0 or self.gpu_num==0 :
with torch.no_grad():
dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"])
# dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
# data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"])

text_norm = text_to_sequence("a", self.hps.data.text_cleaners)
text_norm = commons.intersperse(text_norm, 0)
text_norm = torch.LongTensor(text_norm)

audio = torch.FloatTensor(unpackedData.astype(np.float32))
audio_norm = audio /self.hps.data.max_wav_value
audio_norm = audio_norm.unsqueeze(0)


spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
center=False)
spec = torch.squeeze(spec, 0)
sid = torch.LongTensor([int(srcId)])

data = (text_norm, spec, audio_norm, sid)

data = TextAudioSpeakerCollate()([data])
x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cpu() for x in data]
sid_tgt1 = torch.LongTensor([dstId]).cpu()
audio1 = (self.net_g.cpu().voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy()
else:
with torch.no_grad():
dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"])
# dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
# data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"])

text_norm = text_to_sequence("a", self.hps.data.text_cleaners)
text_norm = commons.intersperse(text_norm, 0)
text_norm = torch.LongTensor(text_norm)

audio = torch.FloatTensor(unpackedData.astype(np.float32))
audio_norm = audio /self.hps.data.max_wav_value
audio_norm = audio_norm.unsqueeze(0)


spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
center=False)
spec = torch.squeeze(spec, 0)
sid = torch.LongTensor([int(srcId)])

data = (text_norm, spec, audio_norm, sid)

data = TextAudioSpeakerCollate()([data])
x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda(gpu) for x in data]
sid_tgt1 = torch.LongTensor([dstId]).cuda(gpu)
Expand Down
49 changes: 43 additions & 6 deletions demo/serverSIO.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,17 @@
import numpy as np
from scipy.io.wavfile import write

sys.path.append("mod")
sys.path.append("mod/text")
sys.path.append("/MMVC_Trainer")
sys.path.append("/MMVC_Trainer/text")


import utils
import commons
from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
from mel_processing import spectrogram_torch
from text import text_to_sequence, cleaned_text_to_sequence

class MyCustomNamespace(socketio.Namespace):
def __init__(self, namespace, config, model):
Expand Down Expand Up @@ -50,17 +55,49 @@ def on_request_message(self, sid, msg):

if gpu<0 or self.gpu_num==0 :
with torch.no_grad():
dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"])

text_norm = text_to_sequence("a", self.hps.data.text_cleaners)
text_norm = commons.intersperse(text_norm, 0)
text_norm = torch.LongTensor(text_norm)

audio = torch.FloatTensor(unpackedData.astype(np.float32))
audio_norm = audio /self.hps.data.max_wav_value
audio_norm = audio_norm.unsqueeze(0)


spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
center=False)
spec = torch.squeeze(spec, 0)
sid = torch.LongTensor([int(srcId)])

data = (text_norm, spec, audio_norm, sid)

data = TextAudioSpeakerCollate()([data])
x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cpu() for x in data]
sid_tgt1 = torch.LongTensor([dstId]).cpu()
audio1 = (self.net_g.cpu().voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy()
else:
with torch.no_grad():
dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"])

text_norm = text_to_sequence("a", self.hps.data.text_cleaners)
text_norm = commons.intersperse(text_norm, 0)
text_norm = torch.LongTensor(text_norm)

audio = torch.FloatTensor(unpackedData.astype(np.float32))
audio_norm = audio /self.hps.data.max_wav_value
audio_norm = audio_norm.unsqueeze(0)


spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
center=False)
spec = torch.squeeze(spec, 0)
sid = torch.LongTensor([int(srcId)])

data = (text_norm, spec, audio_norm, sid)
data = TextAudioSpeakerCollate()([data])

x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda(gpu) for x in data]
sid_tgt1 = torch.LongTensor([dstId]).cuda(gpu)
audio1 = (self.net_g.cuda(gpu).voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy()
Expand Down
14 changes: 13 additions & 1 deletion frontend/dist/index.html
Original file line number Diff line number Diff line change
@@ -1 +1,13 @@
<!doctype html><html lang="ja" style="width:100%;height:100%;overflow:hidden"><head><meta charset="utf-8"/><title>voice recorder</title><script defer="defer" src="index.js"></script></head><body style="width:100%;height:100%;margin:0"><div id="app" style="width:100%;height:100%"></div><noscript><strong>javascriptを有効にしてください</strong></noscript></body></html>
<!-- Minimal shell page for the voice-recorder SPA: all UI is rendered by index.js into #app. -->
<!DOCTYPE html>
<html lang="ja" style="width: 100%; height: 100%; overflow: hidden">
<head>
<meta charset="utf-8" />
<title>voice recorder</title>
<!-- defer: run the bundle only after the document is parsed, so #app exists when it executes -->
<script defer src="index.js"></script></head>
<body style="width: 100%; height: 100%; margin: 0px">
<!-- Mount point the application bundle attaches to; sized to fill the viewport -->
<div id="app" style="width: 100%; height: 100%"></div>
<!-- Fallback shown when JavaScript is disabled (message is in Japanese: "please enable JavaScript") -->
<noscript>
<strong>javascriptを有効にしてください</strong>
</noscript>
</body>
</html>
4,820 changes: 4,818 additions & 2 deletions frontend/dist/index.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion start2.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash
set -eu

DOCKER_IMAGE=dannadori/voice-changer:20221003_002318
DOCKER_IMAGE=dannadori/voice-changer:20221028_191234
#DOCKER_IMAGE=voice-changer


Expand Down
12 changes: 8 additions & 4 deletions trainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM dannadori/voice-changer-internal:20221002_193031 as front
FROM dannadori/voice-changer-internal:20221028_190940 as front
FROM debian:bullseye-slim as base

ARG DEBIAN_FRONTEND=noninteractive
Expand All @@ -8,7 +8,7 @@ RUN apt-get install -y python3-pip git
RUN apt-get install -y espeak
RUN apt-get install -y cmake

RUN git clone --depth 1 https://github.com/isletennos/MMVC_Trainer.git -b v1.3.1.0
RUN git clone --depth 1 https://github.com/isletennos/MMVC_Trainer.git -b v1.3.1.3

RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113

Expand Down Expand Up @@ -49,18 +49,22 @@ COPY fine_model/D_180000.pth /MMVC_Trainer/fine_model/D_180000.pth

### Copy from base
COPY --from=base --chmod=777 /usr/local/lib/python3.9/dist-packages /usr/local/lib/python3.9/dist-packages
COPY --from=base --chmod=777 /MMVC_Trainer /MMVC_Trainer

### Copy from frontend
##### MMVC Trainer
COPY --from=front --chmod=777 /MMVC_Trainer /MMVC_Trainer
RUN chmod 0777 /MMVC_Trainer

WORKDIR /MMVC_Trainer
ADD /setup.sh /MMVC_Trainer/
ADD /exec.sh /MMVC_Trainer/

### Copy from frontend
##### Voice changer Internal
COPY --from=front --chmod=777 /voice-changer-internal/frontend/dist /voice-changer-internal/frontend/dist
COPY --from=front --chmod=777 /voice-changer-internal/voice-change-service /voice-changer-internal/voice-change-service
RUN chmod 0777 /voice-changer-internal/voice-change-service

##### Soft VC
COPY --from=front /hubert /hubert
COPY --from=front /acoustic-model /acoustic-model
COPY --from=front /hifigan /hifigan
Expand Down

0 comments on commit 56235cf

Please sign in to comment.