From 537dc0e9333dbf6cad9738673288139edf4c1c6a Mon Sep 17 00:00:00 2001 From: Eren Gölge Date: Thu, 13 Apr 2023 00:38:38 +0200 Subject: [PATCH 1/5] Update VAD --- TTS/bin/remove_silence_using_vad.py | 39 ++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py index 352628bbc1..d73c0eda25 100755 --- a/TTS/bin/remove_silence_using_vad.py +++ b/TTS/bin/remove_silence_using_vad.py @@ -1,5 +1,6 @@ import argparse import glob +import multiprocessing import os import pathlib @@ -7,6 +8,10 @@ from TTS.utils.vad import get_vad_model_and_utils, remove_silence +import torch +torch.set_num_threads(1) + + def adjust_path_and_remove_silence(audio_path): output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, "")) @@ -44,10 +49,24 @@ def preprocess_audios(): # create threads # num_threads = multiprocessing.cpu_count() # process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15) - for f in tqdm(files): - output_path, is_speech = adjust_path_and_remove_silence(f) - if not is_speech: - filtered_files.append(output_path) + + if args.num_processes > 1: + with multiprocessing.Pool(processes=args.num_processes) as pool: + results = list( + tqdm( + pool.imap_unordered(adjust_path_and_remove_silence, files), + total=len(files), + desc="Processing audio files", + ) + ) + for output_path, is_speech in results: + if not is_speech: + filtered_files.append(output_path) + else: + for f in tqdm(files): + output_path, is_speech = adjust_path_and_remove_silence(f) + if not is_speech: + filtered_files.append(output_path) # write files that do not have speech with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f: @@ -87,6 +106,18 @@ def preprocess_audios(): default=False, help="If True use cuda", ) + parser.add_argument( + "--use_onnx", + type=bool, + default=False, + help="If True 
use onnx", + ) + parser.add_argument( + "--num_processes", + type=int, + default=1, + help="Number of processes to use", + ) args = parser.parse_args() # load the model and utils model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda) From 758ef84cc22cf24094853bc797b3ceddb0c48cba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 13 Apr 2023 14:14:41 +0200 Subject: [PATCH 2/5] =?UTF-8?q?Using=20=F0=9F=90=B8Studio=20models=20with?= =?UTF-8?q?=20`tts`=20command?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TTS/api.py | 11 +++++++---- TTS/bin/synthesize.py | 24 ++++++++++++++++++++++-- TTS/utils/manage.py | 24 +++++++++++++++++++++++- tests/inference_tests/test_synthesize.py | 12 ++++++------ 4 files changed, 58 insertions(+), 13 deletions(-) diff --git a/TTS/api.py b/TTS/api.py index b062874312..4e0731de5c 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -4,7 +4,7 @@ import tempfile import urllib.request from pathlib import Path -from typing import Tuple +from typing import Tuple, Union import numpy as np import requests @@ -86,7 +86,6 @@ def emotions(self): return ["Neutral", "Happy", "Sad", "Angry", "Dull"] def _check_token(self): - self.ping_api() if self.api_token is None: self.api_token = os.environ.get("COQUI_STUDIO_TOKEN") self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"} @@ -183,6 +182,7 @@ def tts( language (str): Language of the text. If None, the default language of the speaker is used. """ self._check_token() + self.ping_api() if speaker_name is None and speaker_id is None: raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.") if speaker_id is None: @@ -457,7 +457,7 @@ def tts_coqui_studio( emotion: str = "Neutral", speed: float = 1.0, file_path: str = None, - ): + ) -> Union[np.ndarray, str]: """Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API. 
Args: @@ -473,9 +473,12 @@ def tts_coqui_studio( Speed of the speech. Defaults to 1.0. file_path (str, optional): Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None. + + Returns: + Union[np.ndarray, str]: Waveform of the synthesized speech or path to the output file. """ speaker_name = self.model_name.split("/")[2] - if file_path is None: + if file_path is not None: return self.csapi.tts_to_file( text=text, speaker_name=speaker_name, diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 2877ea2bde..fa49eeefc3 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -7,7 +7,9 @@ # pylint: disable=redefined-outer-name, unused-argument from pathlib import Path +from pprint import pprint +from TTS.api import TTS from TTS.utils.manage import ModelManager from TTS.utils.synthesizer import Synthesizer @@ -183,6 +185,14 @@ def main(): ) parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None) + # args for coqui studio + parser.add_argument( + "--emotion", + type=str, + help="Emotion to condition the model with. 
Only available for 🐸Coqui Studio models.", + default="Neutral", + ) + # args for multi-speaker synthesis parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None) parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None) @@ -285,6 +295,7 @@ def main(): # load model manager path = Path(__file__).parent / "../.models.json" manager = ModelManager(path, progress_bar=args.progress_bar) + api = TTS() tts_path = None tts_config_path = None @@ -299,6 +310,7 @@ def main(): # CASE1 #list : list pre-trained TTS models if args.list_models: + manager.add_cs_api_models(api.list_models()) manager.list_models() sys.exit() @@ -313,7 +325,15 @@ def main(): manager.model_info_by_full_name(model_query_full_name) sys.exit() - # CASE3: load pre-trained model paths + # CASE3: TTS with coqui studio models + if "coqui_studio" in args.model_name: + print(" > Using 🐸Coqui Studio model: ", args.model_name) + api = TTS(model_name=args.model_name) + api.tts_to_file(text=args.text, emotion=args.emotion, file_path=args.out_path) + print(" > Saving output to ", args.out_path) + return + + # CASE4: load pre-trained model paths if args.model_name is not None and not args.model_path: model_path, config_path, model_item = manager.download_model(args.model_name) @@ -333,7 +353,7 @@ def main(): if args.vocoder_name is not None and not args.vocoder_path: vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) - # CASE4: set custom model paths + # CASE5: set custom model paths if args.model_path is not None: tts_path = args.model_path tts_config_path = args.config_path diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 8419429dd1..8bf13bccd9 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -3,7 +3,7 @@ import zipfile from pathlib import Path from shutil import copyfile, rmtree -from typing import Dict, Tuple +from typing import Dict, List, Tuple 
import requests from tqdm import tqdm @@ -63,6 +63,28 @@ def read_models_file(self, file_path): with open(file_path, "r", encoding="utf-8") as json_file: self.models_dict = json.load(json_file) + def add_cs_api_models(self, model_list: List[str]): + """Add list of Coqui Studio model names that are returned from the api + + Each has the following format `coqui_studio/en/<speaker_name>/coqui_studio` + """ + + def _add_model(model_name: str): + if not "coqui_studio" in model_name: + return + model_type, lang, dataset, model = model_name.split("/") + if model_type not in self.models_dict: + self.models_dict[model_type] = {} + if lang not in self.models_dict[model_type]: + self.models_dict[model_type][lang] = {} + if dataset not in self.models_dict[model_type][lang]: + self.models_dict[model_type][lang][dataset] = {} + if model not in self.models_dict[model_type][lang][dataset]: + self.models_dict[model_type][lang][dataset][model] = {} + + for model_name in model_list: + _add_model(model_name) + def _list_models(self, model_type, model_count=0): if self.verbose: print(" Name format: type/language/dataset/model") diff --git a/tests/inference_tests/test_synthesize.py b/tests/inference_tests/test_synthesize.py index 42b7717281..4bf751a5f9 100644 --- a/tests/inference_tests/test_synthesize.py +++ b/tests/inference_tests/test_synthesize.py @@ -19,9 +19,9 @@ def test_synthesize(): f'--text "This is an example." --out_path "{output_path}"' ) - # multi-speaker SC-Glow model - # run_cli("tts --model_name tts_models/en/vctk/sc-glow-tts --list_speaker_idxs") - # run_cli( - # f'tts --model_name tts_models/en/vctk/sc-glow-tts --speaker_idx "p304" ' - # f'--text "This is an example." 
--out_path "{output_path}"' - # ) + # 🐸 Coqui studio model + run_cli( + 'tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" ' + '--text "This is it" ' + f'--out_path "{output_path}"' + ) From c9375e4b8b8370878a7589644f2e40e750fd7460 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 13 Apr 2023 14:17:06 +0200 Subject: [PATCH 3/5] Make style --- TTS/bin/synthesize.py | 1 - 1 file changed, 1 deletion(-) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index fa49eeefc3..092264f40e 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -7,7 +7,6 @@ # pylint: disable=redefined-outer-name, unused-argument from pathlib import Path -from pprint import pprint from TTS.api import TTS from TTS.utils.manage import ModelManager From 5a9bda13f3664adcbe2e5d8be362a08a5d090b68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 13 Apr 2023 14:19:06 +0200 Subject: [PATCH 4/5] Make style --- TTS/bin/remove_silence_using_vad.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py index d73c0eda25..d19e77872a 100755 --- a/TTS/bin/remove_silence_using_vad.py +++ b/TTS/bin/remove_silence_using_vad.py @@ -4,15 +4,14 @@ import os import pathlib +import torch from tqdm import tqdm from TTS.utils.vad import get_vad_model_and_utils, remove_silence -import torch torch.set_num_threads(1) - def adjust_path_and_remove_silence(audio_path): output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, "")) # ignore if the file exists From e4c5c27854782bcda5ee0539dac32a7e4c8fa735 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 14 Apr 2023 10:23:39 +0200 Subject: [PATCH 5/5] Bump up to v0.13.2 --- TTS/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/VERSION b/TTS/VERSION index c317a91891..9beb74d490 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 
@@ -0.13.1 +0.13.2