From 537dc0e9333dbf6cad9738673288139edf4c1c6a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?= <egolge@coqui.ai>
Date: Thu, 13 Apr 2023 00:38:38 +0200
Subject: [PATCH 1/5] Update VAD

---
 TTS/bin/remove_silence_using_vad.py | 39 ++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 4 deletions(-)

diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py
index 352628bbc1..d73c0eda25 100755
--- a/TTS/bin/remove_silence_using_vad.py
+++ b/TTS/bin/remove_silence_using_vad.py
@@ -1,5 +1,6 @@
 import argparse
 import glob
+import multiprocessing
 import os
 import pathlib
 
@@ -7,6 +8,10 @@
 
 from TTS.utils.vad import get_vad_model_and_utils, remove_silence
 
+import torch
+torch.set_num_threads(1)
+
+
 
 def adjust_path_and_remove_silence(audio_path):
     output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
@@ -44,10 +49,24 @@ def preprocess_audios():
         # create threads
         # num_threads = multiprocessing.cpu_count()
         # process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15)
-        for f in tqdm(files):
-            output_path, is_speech = adjust_path_and_remove_silence(f)
-            if not is_speech:
-                filtered_files.append(output_path)
+
+        if args.num_processes > 1:
+            with multiprocessing.Pool(processes=args.num_processes) as pool:
+                results = list(
+                    tqdm(
+                        pool.imap_unordered(adjust_path_and_remove_silence, files),
+                        total=len(files),
+                        desc="Processing audio files",
+                    )
+                )
+            for output_path, is_speech in results:
+                if not is_speech:
+                    filtered_files.append(output_path)
+        else:
+            for f in tqdm(files):
+                output_path, is_speech = adjust_path_and_remove_silence(f)
+                if not is_speech:
+                    filtered_files.append(output_path)
 
         # write files that do not have speech
         with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
@@ -87,6 +106,18 @@ def preprocess_audios():
         default=False,
         help="If True use cuda",
     )
+    parser.add_argument(
+        "--use_onnx",
+        type=bool,
+        default=False,
+        help="If True use onnx",
+    )
+    parser.add_argument(
+        "--num_processes",
+        type=int,
+        default=1,
+        help="Number of processes to use",
+    )
     args = parser.parse_args()
     # load the model and utils
     model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda)

From 758ef84cc22cf24094853bc797b3ceddb0c48cba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?= <egolge@coqui.ai>
Date: Thu, 13 Apr 2023 14:14:41 +0200
Subject: [PATCH 2/5] =?UTF-8?q?Using=20=F0=9F=90=B8Studio=20models=20with?=
 =?UTF-8?q?=20`tts`=20command?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 TTS/api.py                               | 11 +++++++----
 TTS/bin/synthesize.py                    | 24 ++++++++++++++++++++++--
 TTS/utils/manage.py                      | 24 +++++++++++++++++++++++-
 tests/inference_tests/test_synthesize.py | 12 ++++++------
 4 files changed, 58 insertions(+), 13 deletions(-)

diff --git a/TTS/api.py b/TTS/api.py
index b062874312..4e0731de5c 100644
--- a/TTS/api.py
+++ b/TTS/api.py
@@ -4,7 +4,7 @@
 import tempfile
 import urllib.request
 from pathlib import Path
-from typing import Tuple
+from typing import Tuple, Union
 
 import numpy as np
 import requests
@@ -86,7 +86,6 @@ def emotions(self):
         return ["Neutral", "Happy", "Sad", "Angry", "Dull"]
 
     def _check_token(self):
-        self.ping_api()
         if self.api_token is None:
             self.api_token = os.environ.get("COQUI_STUDIO_TOKEN")
             self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"}
@@ -183,6 +182,7 @@ def tts(
             language (str): Language of the text. If None, the default language of the speaker is used.
         """
         self._check_token()
+        self.ping_api()
         if speaker_name is None and speaker_id is None:
             raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
         if speaker_id is None:
@@ -457,7 +457,7 @@ def tts_coqui_studio(
         emotion: str = "Neutral",
         speed: float = 1.0,
         file_path: str = None,
-    ):
+    ) -> Union[np.ndarray, str]:
         """Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.
 
         Args:
@@ -473,9 +473,12 @@ def tts_coqui_studio(
                 Speed of the speech. Defaults to 1.0.
             file_path (str, optional):
                 Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None.
+
+        Returns:
+            Union[np.ndarray, str]: Waveform of the synthesized speech or path to the output file.
         """
         speaker_name = self.model_name.split("/")[2]
-        if file_path is None:
+        if file_path is not None:
             return self.csapi.tts_to_file(
                 text=text,
                 speaker_name=speaker_name,
diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py
index 2877ea2bde..fa49eeefc3 100755
--- a/TTS/bin/synthesize.py
+++ b/TTS/bin/synthesize.py
@@ -7,7 +7,9 @@
 
 # pylint: disable=redefined-outer-name, unused-argument
 from pathlib import Path
+from pprint import pprint
 
+from TTS.api import TTS
 from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer
 
@@ -183,6 +185,14 @@ def main():
     )
     parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)
 
+    # args for coqui studio
+    parser.add_argument(
+        "--emotion",
+        type=str,
+        help="Emotion to condition the model with. Only available for 🐸Coqui Studio models.",
+        default="Neutral",
+    )
+
     # args for multi-speaker synthesis
     parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
     parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
@@ -285,6 +295,7 @@ def main():
     # load model manager
     path = Path(__file__).parent / "../.models.json"
     manager = ModelManager(path, progress_bar=args.progress_bar)
+    api = TTS()
 
     tts_path = None
     tts_config_path = None
@@ -299,6 +310,7 @@ def main():
 
     # CASE1 #list : list pre-trained TTS models
     if args.list_models:
+        manager.add_cs_api_models(api.list_models())
         manager.list_models()
         sys.exit()
 
@@ -313,7 +325,15 @@ def main():
         manager.model_info_by_full_name(model_query_full_name)
         sys.exit()
 
-    # CASE3: load pre-trained model paths
+    # CASE3: TTS with coqui studio models
+    if "coqui_studio" in args.model_name:
+        print(" > Using 🐸Coqui Studio model: ", args.model_name)
+        api = TTS(model_name=args.model_name)
+        api.tts_to_file(text=args.text, emotion=args.emotion, file_path=args.out_path)
+        print(" > Saving output to ", args.out_path)
+        return
+
+    # CASE4: load pre-trained model paths
     if args.model_name is not None and not args.model_path:
         model_path, config_path, model_item = manager.download_model(args.model_name)
 
@@ -333,7 +353,7 @@ def main():
     if args.vocoder_name is not None and not args.vocoder_path:
         vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
 
-    # CASE4: set custom model paths
+    # CASE5: set custom model paths
     if args.model_path is not None:
         tts_path = args.model_path
         tts_config_path = args.config_path
diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py
index 8419429dd1..8bf13bccd9 100644
--- a/TTS/utils/manage.py
+++ b/TTS/utils/manage.py
@@ -3,7 +3,7 @@
 import zipfile
 from pathlib import Path
 from shutil import copyfile, rmtree
-from typing import Dict, Tuple
+from typing import Dict, List, Tuple
 
 import requests
 from tqdm import tqdm
@@ -63,6 +63,28 @@ def read_models_file(self, file_path):
         with open(file_path, "r", encoding="utf-8") as json_file:
             self.models_dict = json.load(json_file)
 
+    def add_cs_api_models(self, model_list: List[str]):
+        """Add list of Coqui Studio model names that are returned from the api
+
+        Each has the following format `<coqui_studio_model>/en/<speaker_name>/<coqui_studio_model>`
+        """
+
+        def _add_model(model_name: str):
+            if not "coqui_studio" in model_name:
+                return
+            model_type, lang, dataset, model = model_name.split("/")
+            if model_type not in self.models_dict:
+                self.models_dict[model_type] = {}
+            if lang not in self.models_dict[model_type]:
+                self.models_dict[model_type][lang] = {}
+            if dataset not in self.models_dict[model_type][lang]:
+                self.models_dict[model_type][lang][dataset] = {}
+            if model not in self.models_dict[model_type][lang][dataset]:
+                self.models_dict[model_type][lang][dataset][model] = {}
+
+        for model_name in model_list:
+            _add_model(model_name)
+
     def _list_models(self, model_type, model_count=0):
         if self.verbose:
             print(" Name format: type/language/dataset/model")
diff --git a/tests/inference_tests/test_synthesize.py b/tests/inference_tests/test_synthesize.py
index 42b7717281..4bf751a5f9 100644
--- a/tests/inference_tests/test_synthesize.py
+++ b/tests/inference_tests/test_synthesize.py
@@ -19,9 +19,9 @@ def test_synthesize():
         f'--text "This is an example." --out_path "{output_path}"'
     )
 
-    # multi-speaker SC-Glow model
-    # run_cli("tts --model_name tts_models/en/vctk/sc-glow-tts --list_speaker_idxs")
-    # run_cli(
-    #     f'tts --model_name tts_models/en/vctk/sc-glow-tts --speaker_idx "p304" '
-    #     f'--text "This is an example." --out_path "{output_path}"'
-    # )
+    # 🐸 Coqui studio model
+    run_cli(
+        'tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" '
+        '--text "This is it" '
+        f'--out_path "{output_path}"'
+    )

From c9375e4b8b8370878a7589644f2e40e750fd7460 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?= <egolge@coqui.ai>
Date: Thu, 13 Apr 2023 14:17:06 +0200
Subject: [PATCH 3/5] Make style

---
 TTS/bin/synthesize.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py
index fa49eeefc3..092264f40e 100755
--- a/TTS/bin/synthesize.py
+++ b/TTS/bin/synthesize.py
@@ -7,7 +7,6 @@
 
 # pylint: disable=redefined-outer-name, unused-argument
 from pathlib import Path
-from pprint import pprint
 
 from TTS.api import TTS
 from TTS.utils.manage import ModelManager

From 5a9bda13f3664adcbe2e5d8be362a08a5d090b68 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?= <egolge@coqui.ai>
Date: Thu, 13 Apr 2023 14:19:06 +0200
Subject: [PATCH 4/5] Make style

---
 TTS/bin/remove_silence_using_vad.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py
index d73c0eda25..d19e77872a 100755
--- a/TTS/bin/remove_silence_using_vad.py
+++ b/TTS/bin/remove_silence_using_vad.py
@@ -4,15 +4,14 @@
 import os
 import pathlib
 
+import torch
 from tqdm import tqdm
 
 from TTS.utils.vad import get_vad_model_and_utils, remove_silence
 
-import torch
 torch.set_num_threads(1)
 
 
-
 def adjust_path_and_remove_silence(audio_path):
     output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
     # ignore if the file exists

From e4c5c27854782bcda5ee0539dac32a7e4c8fa735 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?= <erogol@hotmail.com>
Date: Fri, 14 Apr 2023 10:23:39 +0200
Subject: [PATCH 5/5] Bump up to v0.13.2

---
 TTS/VERSION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/VERSION b/TTS/VERSION
index c317a91891..9beb74d490 100644
--- a/TTS/VERSION
+++ b/TTS/VERSION
@@ -1 +1 @@
-0.13.1
+0.13.2