Merge pull request #2519 from coqui-ai/dev
🌈 v0.13.2
erogol authored Apr 14, 2023
2 parents bb8d080 + e4c5c27 commit b3b4034
Showing 6 changed files with 92 additions and 18 deletions.
2 changes: 1 addition & 1 deletion TTS/VERSION
@@ -1 +1 @@
0.13.1
0.13.2
11 changes: 7 additions & 4 deletions TTS/api.py
@@ -4,7 +4,7 @@
import tempfile
import urllib.request
from pathlib import Path
from typing import Tuple
from typing import Tuple, Union

import numpy as np
import requests
@@ -86,7 +86,6 @@ def emotions(self):
return ["Neutral", "Happy", "Sad", "Angry", "Dull"]

def _check_token(self):
self.ping_api()
if self.api_token is None:
self.api_token = os.environ.get("COQUI_STUDIO_TOKEN")
self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"}
@@ -183,6 +182,7 @@ def tts(
language (str): Language of the text. If None, the default language of the speaker is used.
"""
self._check_token()
self.ping_api()
if speaker_name is None and speaker_id is None:
raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
if speaker_id is None:
@@ -457,7 +457,7 @@ def tts_coqui_studio(
emotion: str = "Neutral",
speed: float = 1.0,
file_path: str = None,
):
) -> Union[np.ndarray, str]:
"""Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.
Args:
@@ -473,9 +473,12 @@
Speed of the speech. Defaults to 1.0.
file_path (str, optional):
Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None.
Returns:
Union[np.ndarray, str]: Waveform of the synthesized speech or path to the output file.
"""
speaker_name = self.model_name.split("/")[2]
if file_path is None:
if file_path is not None:
return self.csapi.tts_to_file(
text=text,
speaker_name=speaker_name,
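For reference, a minimal usage sketch of the new return behavior, assuming the `TTS` wrapper exposes `tts_coqui_studio` as in the hunk above and that a valid COQUI_STUDIO_TOKEN is set; the model name and output path are illustrative:

from TTS.api import TTS

api = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio")

# With file_path=None the synthesized waveform comes back as an np.ndarray.
wav = api.tts_coqui_studio(text="This is an example.", emotion="Neutral", speed=1.0)

# With file_path set, the audio is written to disk and the path is returned as a str.
path = api.tts_coqui_studio(text="This is an example.", file_path="output.wav")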
38 changes: 34 additions & 4 deletions TTS/bin/remove_silence_using_vad.py
@@ -1,12 +1,16 @@
import argparse
import glob
import multiprocessing
import os
import pathlib

import torch
from tqdm import tqdm

from TTS.utils.vad import get_vad_model_and_utils, remove_silence

torch.set_num_threads(1)


def adjust_path_and_remove_silence(audio_path):
output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
@@ -44,10 +48,24 @@ def preprocess_audios():
# create threads
# num_threads = multiprocessing.cpu_count()
# process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15)
for f in tqdm(files):
output_path, is_speech = adjust_path_and_remove_silence(f)
if not is_speech:
filtered_files.append(output_path)

if args.num_processes > 1:
with multiprocessing.Pool(processes=args.num_processes) as pool:
results = list(
tqdm(
pool.imap_unordered(adjust_path_and_remove_silence, files),
total=len(files),
desc="Processing audio files",
)
)
for output_path, is_speech in results:
if not is_speech:
filtered_files.append(output_path)
else:
for f in tqdm(files):
output_path, is_speech = adjust_path_and_remove_silence(f)
if not is_speech:
filtered_files.append(output_path)

# write files that do not have speech
with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
@@ -87,6 +105,18 @@ def preprocess_audios():
default=False,
help="If True use cuda",
)
parser.add_argument(
"--use_onnx",
type=bool,
default=False,
help="If True use onnx",
)
parser.add_argument(
"--num_processes",
type=int,
default=1,
help="Number of processes to use",
)
args = parser.parse_args()
# load the model and utils
model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda)
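A sketch of driving the script with the new option, assuming its existing --input_dir/--output_dir flags (referenced above via args.input_dir/args.output_dir); the directory paths are placeholders:

import subprocess

# --num_processes > 1 takes the new multiprocessing.Pool branch; 1 keeps the serial loop.
subprocess.run(
    [
        "python", "TTS/bin/remove_silence_using_vad.py",
        "--input_dir", "/data/wavs",       # placeholder input folder
        "--output_dir", "/data/wavs_vad",  # placeholder output folder
        "--num_processes", "4",
    ],
    check=True,
)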
23 changes: 21 additions & 2 deletions TTS/bin/synthesize.py
@@ -8,6 +8,7 @@
# pylint: disable=redefined-outer-name, unused-argument
from pathlib import Path

from TTS.api import TTS
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer

@@ -183,6 +184,14 @@ def main():
)
parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)

# args for coqui studio
parser.add_argument(
"--emotion",
type=str,
help="Emotion to condition the model with. Only available for 🐸Coqui Studio models.",
default="Neutral",
)

# args for multi-speaker synthesis
parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
@@ -285,6 +294,7 @@ def main():
# load model manager
path = Path(__file__).parent / "../.models.json"
manager = ModelManager(path, progress_bar=args.progress_bar)
api = TTS()

tts_path = None
tts_config_path = None
@@ -299,6 +309,7 @@

# CASE1 #list : list pre-trained TTS models
if args.list_models:
manager.add_cs_api_models(api.list_models())
manager.list_models()
sys.exit()

@@ -313,7 +324,15 @@
manager.model_info_by_full_name(model_query_full_name)
sys.exit()

# CASE3: load pre-trained model paths
# CASE3: TTS with coqui studio models
if "coqui_studio" in args.model_name:
print(" > Using 🐸Coqui Studio model: ", args.model_name)
api = TTS(model_name=args.model_name)
api.tts_to_file(text=args.text, emotion=args.emotion, file_path=args.out_path)
print(" > Saving output to ", args.out_path)
return

# CASE4: load pre-trained model paths
if args.model_name is not None and not args.model_path:
model_path, config_path, model_item = manager.download_model(args.model_name)

@@ -333,7 +352,7 @@
if args.vocoder_name is not None and not args.vocoder_path:
vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

# CASE4: set custom model paths
# CASE5: set custom model paths
if args.model_path is not None:
tts_path = args.model_path
tts_config_path = args.config_path
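The new CASE3 branch mirrors the CLI call used in the updated test below; a sketch of invoking it programmatically, with an illustrative output path and an emotion drawn from the list returned by the emotions property in TTS/api.py above:

import subprocess

# Equivalent of running the `tts` console entry point with a 🐸Coqui Studio model.
subprocess.run(
    [
        "tts",
        "--model_name", "coqui_studio/en/Torcull Diarmuid/coqui_studio",
        "--text", "This is an example.",
        "--emotion", "Happy",            # one of Neutral, Happy, Sad, Angry, Dull
        "--out_path", "studio_out.wav",  # illustrative output path
    ],
    check=True,
)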
24 changes: 23 additions & 1 deletion TTS/utils/manage.py
@@ -3,7 +3,7 @@
import zipfile
from pathlib import Path
from shutil import copyfile, rmtree
from typing import Dict, Tuple
from typing import Dict, List, Tuple

import requests
from tqdm import tqdm
@@ -63,6 +63,28 @@ def read_models_file(self, file_path):
with open(file_path, "r", encoding="utf-8") as json_file:
self.models_dict = json.load(json_file)

def add_cs_api_models(self, model_list: List[str]):
"""Add list of Coqui Studio model names that are returned from the api
Each has the following format `<coqui_studio_model>/en/<speaker_name>/<coqui_studio_model>`
"""

def _add_model(model_name: str):
if not "coqui_studio" in model_name:
return
model_type, lang, dataset, model = model_name.split("/")
if model_type not in self.models_dict:
self.models_dict[model_type] = {}
if lang not in self.models_dict[model_type]:
self.models_dict[model_type][lang] = {}
if dataset not in self.models_dict[model_type][lang]:
self.models_dict[model_type][lang][dataset] = {}
if model not in self.models_dict[model_type][lang][dataset]:
self.models_dict[model_type][lang][dataset][model] = {}

for model_name in model_list:
_add_model(model_name)

def _list_models(self, model_type, model_count=0):
if self.verbose:
print(" Name format: type/language/dataset/model")
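A condensed sketch of the nesting that add_cs_api_models builds for a single Studio entry, using setdefault in place of the explicit membership checks; the model name is taken from the test below:

models_dict = {}
model_name = "coqui_studio/en/Torcull Diarmuid/coqui_studio"
node = models_dict
for key in model_name.split("/"):  # model_type, lang, dataset, model
    node = node.setdefault(key, {})
# models_dict == {"coqui_studio": {"en": {"Torcull Diarmuid": {"coqui_studio": {}}}}}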
12 changes: 6 additions & 6 deletions tests/inference_tests/test_synthesize.py
@@ -19,9 +19,9 @@ def test_synthesize():
f'--text "This is an example." --out_path "{output_path}"'
)

# multi-speaker SC-Glow model
# run_cli("tts --model_name tts_models/en/vctk/sc-glow-tts --list_speaker_idxs")
# run_cli(
# f'tts --model_name tts_models/en/vctk/sc-glow-tts --speaker_idx "p304" '
# f'--text "This is an example." --out_path "{output_path}"'
# )
# 🐸 Coqui studio model
run_cli(
'tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" '
'--text "This is it" '
f'--out_path "{output_path}"'
)
