Skip to content

Commit

Permalink
support for multiple cloning source files for coqui engine
Browse files Browse the repository at this point in the history
  • Loading branch information
KoljaB committed Dec 7, 2023
1 parent 69ffc72 commit 1ad11f0
Show file tree
Hide file tree
Showing 9 changed files with 130 additions and 74 deletions.
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,13 @@ https://github.com/KoljaB/RealtimeTTS/assets/7604638/87dcd9a5-3a4e-4f57-be45-837
## Updates

Latest Version: v0.3.34
Latest Version: v0.3.35

#### New Features:
- new Engine: OpenAI TTS
- expanded language support, including Chinese (details in [tests](https://github.com/KoljaB/RealtimeTTS/blob/master/tests/chinese_test.py) and [speed test](https://github.com/KoljaB/RealtimeTTS/blob/master/tests/pyqt6_speed_test_chinese.py)).
- fallback engines in TextToAudioStream, enhancing reliability for real-time scenarios by switching to alternate engines if one fails.
- audio file saving feature with `output_wavfile` parameter. This allows for the simultaneous saving of real-time synthesized audio, enabling later playback of the live synthesis.
- 💥NEW: multiple cloning files for ✨Coqui Engine✨
- OpenAI TTS support
- more languages ([Chinese](https://github.com/KoljaB/RealtimeTTS/blob/master/tests/chinese_test.py), etc.)
- fallback engines (define alternate engines if one fails)

For more details, see the [release history](https://github.com/KoljaB/RealtimeTTS/releases).

Expand Down
2 changes: 1 addition & 1 deletion RealtimeTTS/engines/coqui_default_voice.json

Large diffs are not rendered by default.

158 changes: 105 additions & 53 deletions RealtimeTTS/engines/coqui_engine.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from multiprocessing import Process, Pipe, Event
from .base_engine import BaseEngine
from typing import Union, List
from threading import Lock
from tqdm import tqdm
import numpy as np
Expand All @@ -19,7 +20,7 @@ def __init__(self,
specific_model = "2.0.2",
local_models_path = None, # specify a global model path here (otherwise it will create a directory "models" in the script directory)
voices_path = None,
cloning_reference_wav: str = "",
cloning_reference_wav: Union[str, List[str]] = "",
language = "en",
speed = 1.0,
thread_count = 6,
Expand Down Expand Up @@ -102,7 +103,7 @@ def post_init(self):
self.engine_name = "coqui"

@staticmethod
def _synthesize_worker(conn, model_name, cloning_reference_wav, language, ready_event, loglevel, speed, thread_count, stream_chunk_size, full_sentences, overlap_wav_len, temperature, length_penalty, repetition_penalty, top_k, top_p, enable_text_splitting, use_mps, local_model_path, use_deepspeed, voices_path):
def _synthesize_worker(conn, model_name, cloning_reference_wav: Union[str, List[str]], language, ready_event, loglevel, speed, thread_count, stream_chunk_size, full_sentences, overlap_wav_len, temperature, length_penalty, repetition_penalty, top_k, top_p, enable_text_splitting, use_mps, local_model_path, use_deepspeed, voices_path):
"""
Worker process for the coqui text to speech synthesis model.
Expand All @@ -125,68 +126,116 @@ def _synthesize_worker(conn, model_name, cloning_reference_wav, language, ready_
logging.info(f"Starting CoquiEngine")


def get_conditioning_latents(filename):
logging.debug(f"Computing speaker latents")
def get_conditioning_latents(filenames: Union[str, List[str]]):
"""
To whoever reads this method:
I am sorry, it's a mess and in a terrible state.
It needs urgent rework, but I currently have other, more important things to do.
"""
if not isinstance(filenames, list):
filenames = [filenames]

if not filename or len(filename) == 0:
filename = "coqui_default_voice.wav"

# verify that filename ends with .wav
if filename.endswith(".json"):
filename_json = filename
filename = filename[:-5]
filename_wav = filename + "wav"
elif filename.endswith(".wav"):
filename_json = filename[:-3] + "json"
filename = filename[:-3]
filename_wav = filename + "wav"
else:
filename_json = filename + ".json"
filename_wav = filename + ".wav"
logging.debug(f"Computing speaker latents")

if voices_path:
filename_voice_wav = os.path.join(voices_path, filename_wav)
filename_voice_json = os.path.join(voices_path, filename_json)
else:
filename_voice_wav = filename_wav
filename_voice_json = filename_json
if len(filenames) == 0 or not filenames[0]:
logging.debug(f"Using coqui_default_voice.wav as default voice")
filenames = ["coqui_default_voice.wav"]

if len(filenames) == 1:
logging.debug(f"Old handling one voice file")
# verify that filename ends with .wav
filename = filenames[0]
if filename.endswith(".json"):
filename_json = filename
filename = filename[:-5]
filename_wav = filename + "wav"
elif filename.endswith(".wav"):
filename_json = filename[:-3] + "json"
filename = filename[:-3]
filename_wav = filename + "wav"
else:
filename_json = filename + ".json"
filename_wav = filename + ".wav"

if not os.path.exists(filename_voice_json) and not os.path.exists(filename_voice_wav):
if len(filename) > 0:
logging.info(f"Using default female voice, both {filename_voice_json} and {filename_voice_wav} not found.")
if voices_path:
filename_voice_wav = os.path.join(voices_path, filename_wav)
filename_voice_json = os.path.join(voices_path, filename_json)
else:
logging.info(f"Using default female voice, no cloning source specified.")
filename_voice_wav = filename_wav
filename_voice_json = filename_json

if not os.path.exists(filename_voice_json) and not os.path.exists(filename_voice_wav):
if len(filename) > 0:
logging.info(f"Using default female voice, both {filename_voice_json} and {filename_voice_wav} not found.")
else:
logging.info(f"Using default female voice, no cloning source specified.")

# Get the directory of the current script
current_dir = os.path.dirname(os.path.realpath(__file__))
filename_voice_json = os.path.join(current_dir, "coqui_default_voice.json")
if not os.path.exists(filename_voice_json):
raise ValueError(f"Default voice file {filename_voice_json} not found.")

# check if latents are already computed
if os.path.exists(filename_voice_json):
logging.debug(f"Latents already computed, reading from {filename_voice_json}")
with open(filename_voice_json, "r") as new_file:
latents = json.load(new_file)

speaker_embedding = (torch.tensor(latents["speaker_embedding"]).unsqueeze(0).unsqueeze(-1))
gpt_cond_latent = (torch.tensor(latents["gpt_cond_latent"]).reshape((-1, 1024)).unsqueeze(0))

return gpt_cond_latent, speaker_embedding

# Get the directory of the current script
current_dir = os.path.dirname(os.path.realpath(__file__))
filename_voice_json = os.path.join(current_dir, "coqui_default_voice.json")
if not os.path.exists(filename_voice_json):
raise ValueError(f"Default voice file {filename_voice_json} not found.")
# compute and write latents to json file
logging.debug(f"Computing latents for {filename}")

# check if latents are already computed
if os.path.exists(filename_voice_json):
logging.debug(f"Latents already computed, reading from {filename_voice_json}")
with open(filename_voice_json, "r") as new_file:
latents = json.load(new_file)
gpt_cond_latent, speaker_embedding = tts.get_conditioning_latents(audio_path=filename_voice_wav, gpt_cond_len=30, max_ref_length=60)

speaker_embedding = (torch.tensor(latents["speaker_embedding"]).unsqueeze(0).unsqueeze(-1))
gpt_cond_latent = (torch.tensor(latents["gpt_cond_latent"]).reshape((-1, 1024)).unsqueeze(0))
latents = {
"gpt_cond_latent": gpt_cond_latent.cpu().squeeze().half().tolist(),
"speaker_embedding": speaker_embedding.cpu().squeeze().half().tolist(),
}
with open(filename_voice_json, "w") as new_file:
json.dump(latents, new_file)

return gpt_cond_latent, speaker_embedding

# compute and write latents to json file
logging.debug(f"Computing latents for {filename}")

gpt_cond_latent, speaker_embedding = tts.get_conditioning_latents(audio_path=filename_voice_wav, gpt_cond_len=30, max_ref_length=60)

latents = {
"gpt_cond_latent": gpt_cond_latent.cpu().squeeze().half().tolist(),
"speaker_embedding": speaker_embedding.cpu().squeeze().half().tolist(),
}
with open(filename_voice_json, "w") as new_file:
json.dump(latents, new_file)
else:
audio_path_list = []
for filename in filenames:
# verify that filename ends with .wav
if filename.endswith(".wav"):
if voices_path:
filename_voice_wav = os.path.join(voices_path, filename)
else:
filename_voice_wav = filename
audio_path_list.append(filename_voice_wav)
logging.debug(f"Added {filename_voice_wav} (#{len(audio_path_list)}) to audio_path_list")

if len(audio_path_list) == 0:
logging.info(f"Using default female voice, no cloning source specified.")

# Get the directory of the current script
current_dir = os.path.dirname(os.path.realpath(__file__))
filename_voice_json = os.path.join(current_dir, "coqui_default_voice.json")
if not os.path.exists(filename_voice_json):
raise ValueError(f"Default voice file {filename_voice_json} not found.")

# compute and write latents to json file
logging.debug(f"Computing latents for {filename}")

gpt_cond_latent, speaker_embedding = tts.get_conditioning_latents(audio_path=audio_path_list, gpt_cond_len=30, max_ref_length=60)

latents = {
"gpt_cond_latent": gpt_cond_latent.cpu().squeeze().half().tolist(),
"speaker_embedding": speaker_embedding.cpu().squeeze().half().tolist(),
}
filename_voice_json = audio_path_list[0][:-3] + "json"
with open(filename_voice_json, "w") as new_file:
json.dump(latents, new_file)

return gpt_cond_latent, speaker_embedding
return gpt_cond_latent, speaker_embedding

def postprocess_wave(chunk):
"""Post process the output waveform"""
Expand Down Expand Up @@ -266,6 +315,7 @@ def postprocess_wave(chunk):

if command == 'update_reference':
new_wav_path = data['cloning_reference_wav']
logging.info(f'Updating reference WAV to {new_wav_path}')
gpt_cond_latent, speaker_embedding = get_conditioning_latents(new_wav_path)
conn.send(('success', 'Reference updated successfully'))

Expand Down Expand Up @@ -353,6 +403,8 @@ def set_cloning_reference(self, cloning_reference_wav: str):
"""
Send an 'update_reference' command and wait for a response.
"""
if not isinstance(cloning_reference_wav, list):
cloning_reference_wav = [cloning_reference_wav]
self.send_command('update_reference', {'cloning_reference_wav': cloning_reference_wav})

# Wait for the response from the worker process
Expand Down
1 change: 1 addition & 0 deletions RealtimeTTS/engines/female.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions RealtimeTTS/engines/male.json

Large diffs are not rendered by default.

26 changes: 14 additions & 12 deletions RealtimeTTS/text_to_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,21 +324,23 @@ def synthesize_worker():
print (f"Error: {e}")

finally:
self.abort_events.remove(abort_event)
self.player.stop()
try:

self.player.stop()

self.stream_running = False
logging.info("stream stop")

if output_wavfile and self.wf:
self.wf.close()
self.wf = None
self.abort_events.remove(abort_event)
self.stream_running = False
logging.info("stream stop")

self.output_wavfile = None
self.chunk_callback = None
self.output_wavfile = None
self.chunk_callback = None

if reset_generated_text and self.on_audio_stream_stop:
self.on_audio_stream_stop()
if reset_generated_text and self.on_audio_stream_stop:
self.on_audio_stream_stop()
finally:
if output_wavfile and self.wf:
self.wf.close()
self.wf = None

if self.stream_running and len(self.char_iter.items) > 0 and self.char_iter.iterated_text == "":
# new text was fed while audio was playing, but after the last character had been processed
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ pyttsx3==2.90
stream2sentence==0.2.2
azure-cognitiveservices-speech==1.33.0
elevenlabs==0.2.26
TTS==0.21.1
TTS==0.21.3
tqdm==4.66.1
pydub==0.25.1
openai==1.3.6
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

setuptools.setup(
name="RealTimeTTS",
version="0.3.34",
version="0.3.35",
author="Kolja Beigel",
author_email="[email protected]",
description="*Stream text into audio with an easy-to-use, highly configurable library delivering voice output with minimal latency.",
Expand Down
2 changes: 1 addition & 1 deletion tests/male.json

Large diffs are not rendered by default.

0 comments on commit 1ad11f0

Please sign in to comment.