support for multiple cloning source files for coqui engine

KoljaB · Dec 7, 2023 · 1ad11f0 · 1ad11f0
1 parent 69ffc72
commit 1ad11f0
Show file tree

Hide file tree

Showing 9 changed files with 130 additions and 74 deletions.
diff --git a/README.md b/README.md
@@ -26,13 +26,13 @@ https://github.com/KoljaB/RealtimeTTS/assets/7604638/87dcd9a5-3a4e-4f57-be45-837
 
 ## Updates
 
-Latest Version: v0.3.34
+Latest Version: v0.3.35
 
 #### New Features:
-- new Engine: OpenAI TTS
-- expanded language support, including Chinese (details in [tests](https://github.com/KoljaB/RealtimeTTS/blob/master/tests/chinese_test.py) and [speed test](https://github.com/KoljaB/RealtimeTTS/blob/master/tests/pyqt6_speed_test_chinese.py)).
-- fallback engines in TextToAudioStream, enhancing reliability for real-time scenarios by switching to alternate engines if one fails.
-- audio file saving feature with `output_wavfile` parameter. This allows for the simultaneous saving of real-time synthesized audio, enabling later playback of the live synthesis.
+- 💥NEW: multiple cloning files for ✨Coqui Engine✨
+- OpenAI TTS support
+- more languages ([Chinese](https://github.com/KoljaB/RealtimeTTS/blob/master/tests/chinese_test.py), etc.)
+- fallback engines (define alternate engines if one fails)
 
 For more details, see the [release history](https://github.com/KoljaB/RealtimeTTS/releases).
 

diff --git a/RealtimeTTS/engines/coqui_default_voice.json b/RealtimeTTS/engines/coqui_default_voice.json
diff --git a/RealtimeTTS/engines/coqui_engine.py b/RealtimeTTS/engines/coqui_engine.py
@@ -1,5 +1,6 @@
 from multiprocessing import Process, Pipe, Event
 from .base_engine import BaseEngine
+from typing import Union, List
 from threading import Lock
 from tqdm import tqdm
 import numpy as np
@@ -19,7 +20,7 @@ def __init__(self,
                  specific_model = "2.0.2",
                  local_models_path = None, # specify a global model path here (otherwise it will create a directory "models" in the script directory)
                  voices_path = None,
-                 cloning_reference_wav: str = "",
+                 cloning_reference_wav: Union[str, List[str]] = "",
                  language = "en",
                  speed = 1.0,
                  thread_count = 6,
@@ -102,7 +103,7 @@ def post_init(self):
         self.engine_name = "coqui"
 
     @staticmethod
-    def _synthesize_worker(conn, model_name, cloning_reference_wav, language, ready_event, loglevel, speed, thread_count, stream_chunk_size, full_sentences, overlap_wav_len, temperature, length_penalty, repetition_penalty, top_k, top_p, enable_text_splitting, use_mps, local_model_path, use_deepspeed, voices_path):
+    def _synthesize_worker(conn, model_name, cloning_reference_wav: Union[str, List[str]], language, ready_event, loglevel, speed, thread_count, stream_chunk_size, full_sentences, overlap_wav_len, temperature, length_penalty, repetition_penalty, top_k, top_p, enable_text_splitting, use_mps, local_model_path, use_deepspeed, voices_path):
         """
         Worker process for the coqui text to speech synthesis model.
 
@@ -125,68 +126,116 @@ def _synthesize_worker(conn, model_name, cloning_reference_wav, language, ready_
         logging.info(f"Starting CoquiEngine")
 
 
-        def get_conditioning_latents(filename):
-            logging.debug(f"Computing speaker latents")
+        def get_conditioning_latents(filenames: Union[str, List[str]]):
+            """
+            Whoever reads this method.
+            I am sorry, it's a mess and in a terrible state.
+            It needs urgent rework, but I currently have other, more important things to do.
+            """
+            if not isinstance(filenames, list):
+                filenames = [filenames]
 
-            if not filename or len(filename) == 0:
-                filename = "coqui_default_voice.wav"
-
-            # verify that filename ends with .wav
-            if filename.endswith(".json"):
-                filename_json = filename
-                filename = filename[:-5]
-                filename_wav = filename + "wav"
-            elif filename.endswith(".wav"):
-                filename_json = filename[:-3] + "json"
-                filename = filename[:-3]
-                filename_wav = filename + "wav"
-            else:
-                filename_json = filename + ".json"
-                filename_wav = filename + ".wav"
+            logging.debug(f"Computing speaker latents")
 
-            if voices_path:
-                filename_voice_wav = os.path.join(voices_path, filename_wav)
-                filename_voice_json = os.path.join(voices_path, filename_json)
-            else:
-                filename_voice_wav = filename_wav
-                filename_voice_json = filename_json
+            if len(filenames) == 0 or not filenames[0]:
+                logging.debug(f"Using coqui_default_voice.wav as default voice")
+                filenames = ["coqui_default_voice.wav"]
+
+            if len(filenames) == 1:
+                logging.debug(f"Old handling one voice file")
+                # verify that filename ends with .wav
+                filename = filenames[0]
+                if filename.endswith(".json"):
+                    filename_json = filename
+                    filename = filename[:-5]
+                    filename_wav = filename + "wav"
+                elif filename.endswith(".wav"):
+                    filename_json = filename[:-3] + "json"
+                    filename = filename[:-3]
+                    filename_wav = filename + "wav"
+                else:
+                    filename_json = filename + ".json"
+                    filename_wav = filename + ".wav"
 
-            if not os.path.exists(filename_voice_json) and not os.path.exists(filename_voice_wav):
-                if len(filename) > 0:
-                    logging.info(f"Using default female voice, both {filename_voice_json} and {filename_voice_wav} not found.")
+                if voices_path:
+                    filename_voice_wav = os.path.join(voices_path, filename_wav)
+                    filename_voice_json = os.path.join(voices_path, filename_json)
                 else:
-                    logging.info(f"Using default female voice, no cloning source specified.")
+                    filename_voice_wav = filename_wav
+                    filename_voice_json = filename_json
+
+                if not os.path.exists(filename_voice_json) and not os.path.exists(filename_voice_wav):
+                    if len(filename) > 0:
+                        logging.info(f"Using default female voice, both {filename_voice_json} and {filename_voice_wav} not found.")
+                    else:
+                        logging.info(f"Using default female voice, no cloning source specified.")
+
+                    # Get the directory of the current script
+                    current_dir = os.path.dirname(os.path.realpath(__file__))
+                    filename_voice_json = os.path.join(current_dir, "coqui_default_voice.json")
+                    if not os.path.exists(filename_voice_json):
+                        raise ValueError(f"Default voice file {filename_voice_json} not found.")                
+
+                # check if latents are already computed
+                if os.path.exists(filename_voice_json):
+                    logging.debug(f"Latents already computed, reading from {filename_voice_json}")
+                    with open(filename_voice_json, "r") as new_file:
+                        latents = json.load(new_file)
+
+                    speaker_embedding = (torch.tensor(latents["speaker_embedding"]).unsqueeze(0).unsqueeze(-1))
+                    gpt_cond_latent = (torch.tensor(latents["gpt_cond_latent"]).reshape((-1, 1024)).unsqueeze(0))                
+
+                    return gpt_cond_latent, speaker_embedding                
 
-                # Get the directory of the current script
-                current_dir = os.path.dirname(os.path.realpath(__file__))
-                filename_voice_json = os.path.join(current_dir, "coqui_default_voice.json")
-                if not os.path.exists(filename_voice_json):
-                    raise ValueError(f"Default voice file {filename_voice_json} not found.")                
+                # compute and write latents to json file
+                logging.debug(f"Computing latents for {filename}")
 
-            # check if latents are already computed
-            if os.path.exists(filename_voice_json):
-                logging.debug(f"Latents already computed, reading from {filename_voice_json}")
-                with open(filename_voice_json, "r") as new_file:
-                    latents = json.load(new_file)
+                gpt_cond_latent, speaker_embedding = tts.get_conditioning_latents(audio_path=filename_voice_wav, gpt_cond_len=30, max_ref_length=60)
 
-                speaker_embedding = (torch.tensor(latents["speaker_embedding"]).unsqueeze(0).unsqueeze(-1))
-                gpt_cond_latent = (torch.tensor(latents["gpt_cond_latent"]).reshape((-1, 1024)).unsqueeze(0))                
+                latents = {
+                    "gpt_cond_latent": gpt_cond_latent.cpu().squeeze().half().tolist(),
+                    "speaker_embedding": speaker_embedding.cpu().squeeze().half().tolist(),
+                }
+                with open(filename_voice_json, "w") as new_file:
+                    json.dump(latents, new_file)
 
                 return gpt_cond_latent, speaker_embedding
 
-            # compute and write latents to json file
-            logging.debug(f"Computing latents for {filename}")
-
-            gpt_cond_latent, speaker_embedding = tts.get_conditioning_latents(audio_path=filename_voice_wav, gpt_cond_len=30, max_ref_length=60)
-
-            latents = {
-                "gpt_cond_latent": gpt_cond_latent.cpu().squeeze().half().tolist(),
-                "speaker_embedding": speaker_embedding.cpu().squeeze().half().tolist(),
-            }
-            with open(filename_voice_json, "w") as new_file:
-                json.dump(latents, new_file)
+            else:
+                audio_path_list = []
+                for filename in filenames:
+                    # verify that filename ends with .wav
+                    if filename.endswith(".wav"):
+                        if voices_path:
+                            filename_voice_wav = os.path.join(voices_path, filename)
+                        else:
+                            filename_voice_wav = filename
+                        audio_path_list.append(filename_voice_wav)
+                        logging.debug(f"Added {filename_voice_wav} (#{len(audio_path_list)}) to audio_path_list")
+
+                if len(audio_path_list) == 0:
+                    logging.info(f"Using default female voice, no cloning source specified.")
+
+                    # Get the directory of the current script
+                    current_dir = os.path.dirname(os.path.realpath(__file__))
+                    filename_voice_json = os.path.join(current_dir, "coqui_default_voice.json")
+                    if not os.path.exists(filename_voice_json):
+                        raise ValueError(f"Default voice file {filename_voice_json} not found.")                
+
+                # compute and write latents to json file
+                logging.debug(f"Computing latents for {filename}")
+
+                gpt_cond_latent, speaker_embedding = tts.get_conditioning_latents(audio_path=audio_path_list, gpt_cond_len=30, max_ref_length=60)
+
+                latents = {
+                    "gpt_cond_latent": gpt_cond_latent.cpu().squeeze().half().tolist(),
+                    "speaker_embedding": speaker_embedding.cpu().squeeze().half().tolist(),
+                }
+                filename_voice_json = audio_path_list[0][:-3] + "json"
+                with open(filename_voice_json, "w") as new_file:
+                    json.dump(latents, new_file)
 
-            return gpt_cond_latent, speaker_embedding
+                return gpt_cond_latent, speaker_embedding
 
         def postprocess_wave(chunk):
             """Post process the output waveform"""
@@ -266,6 +315,7 @@ def postprocess_wave(chunk):
 
                 if command == 'update_reference':
                     new_wav_path = data['cloning_reference_wav']
+                    logging.info(f'Updating reference WAV to {new_wav_path}')                    
                     gpt_cond_latent, speaker_embedding = get_conditioning_latents(new_wav_path)
                     conn.send(('success', 'Reference updated successfully'))
 
@@ -353,6 +403,8 @@ def set_cloning_reference(self, cloning_reference_wav: str):
         """
         Send an 'update_reference' command and wait for a response.
         """
+        if not isinstance(cloning_reference_wav, list):
+            cloning_reference_wav = [cloning_reference_wav]        
         self.send_command('update_reference', {'cloning_reference_wav': cloning_reference_wav})
 
         # Wait for the response from the worker process

diff --git a/RealtimeTTS/engines/female.json b/RealtimeTTS/engines/female.json
diff --git a/RealtimeTTS/engines/male.json b/RealtimeTTS/engines/male.json
diff --git a/RealtimeTTS/text_to_stream.py b/RealtimeTTS/text_to_stream.py
@@ -324,21 +324,23 @@ def synthesize_worker():
                 print (f"Error: {e}")
 
             finally:
-                self.abort_events.remove(abort_event)
-                self.player.stop()
+                try:
+
+                    self.player.stop()
 
-                self.stream_running = False
-                logging.info("stream stop")
-
-                if output_wavfile and self.wf:
-                    self.wf.close()
-                    self.wf = None
+                    self.abort_events.remove(abort_event)
+                    self.stream_running = False
+                    logging.info("stream stop")
 
-                self.output_wavfile = None
-                self.chunk_callback = None
+                    self.output_wavfile = None
+                    self.chunk_callback = None
 
-                if reset_generated_text and self.on_audio_stream_stop:
-                    self.on_audio_stream_stop()
+                    if reset_generated_text and self.on_audio_stream_stop:
+                        self.on_audio_stream_stop()
+                finally:
+                    if output_wavfile and self.wf:
+                        self.wf.close()
+                        self.wf = None
 
             if self.stream_running and len(self.char_iter.items) > 0 and self.char_iter.iterated_text == "":
                 # new text was fed while playing audio but after the last character was processed

diff --git a/requirements.txt b/requirements.txt
@@ -4,7 +4,7 @@ pyttsx3==2.90
 stream2sentence==0.2.2
 azure-cognitiveservices-speech==1.33.0
 elevenlabs==0.2.26
-TTS==0.21.1
+TTS==0.21.3
 tqdm==4.66.1
 pydub==0.25.1
 openai==1.3.6
diff --git a/setup.py b/setup.py
@@ -9,7 +9,7 @@
 
 setuptools.setup(
     name="RealTimeTTS", 
-    version="0.3.34",
+    version="0.3.35",
     author="Kolja Beigel",
     author_email="[email protected]",
     description="*Stream text into audio with an easy-to-use, highly configurable library delivering voice output with minimal latency.",

diff --git a/tests/male.json b/tests/male.json