diff --git a/README.md b/README.md index 520cc5a..57e8000 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ Let me know if you need any adjustments or additional languages! ## Updates -Latest Version: v0.4.8 +Latest Version: v0.4.9 See [release history](https://github.com/KoljaB/RealtimeTTS/releases). diff --git a/RealtimeTTS/engines/coqui_engine.py b/RealtimeTTS/engines/coqui_engine.py index a84c4c3..0a4b210 100644 --- a/RealtimeTTS/engines/coqui_engine.py +++ b/RealtimeTTS/engines/coqui_engine.py @@ -78,6 +78,7 @@ def __init__( comma_silence_duration=0.3, sentence_silence_duration=0.6, default_silence_duration=0.3, + print_realtime_factor=False, ): """ Initializes a coqui voice realtime text to speech engine object. @@ -162,6 +163,7 @@ def __init__( self.comma_silence_duration = comma_silence_duration self.sentence_silence_duration = sentence_silence_duration self.default_silence_duration = default_silence_duration + self.print_realtime_factor = print_realtime_factor self.cloning_reference_wav = voice self.speed = speed @@ -258,6 +260,7 @@ def output_worker(queue): self.comma_silence_duration, self.sentence_silence_duration, self.default_silence_duration, + self.print_realtime_factor, ), ) self.synthesize_process.start() @@ -298,6 +301,7 @@ def _synthesize_worker( comma_silence_duration, sentence_silence_duration, default_silence_duration, + print_realtime_factor, ): """ Worker process for the coqui text to speech synthesis model. @@ -724,8 +728,9 @@ def get_user_data_dir(appname): raw_inference_factor = raw_inference_time / ( full_generated_seconds - first_chunk_length_seconds ) - # print(realtime_factor) - # print(raw_inference_factor) + if print_realtime_factor: + print(f"Realtime Factor: {realtime_factor}") + print(f"Raw Inference Factor: {raw_inference_factor}") # Send silent audio sample_rate = config.audio.sample_rate diff --git a/RealtimeTTS/engines/parler_engine.py b/RealtimeTTS/engines/parler_engine.py index b3fdf95..58e9c7a 100644 --- a/RealtimeTTS/engines/parler_engine.py +++ b/RealtimeTTS/engines/parler_engine.py @@ -115,36 +115,60 @@ def _generate_and_queue_audio(self, text: str): **self.voice_parameters, # Merge with any additional voice parameters } - # Start the audio generation in a separate thread - generation_thread = Thread(target=self.model.generate, kwargs=generation_kwargs) - generation_thread.start() - - # Buffer audio for the desired duration before streaming + # Initialize variables for buffering audio_buffer = [] buffer_length_s = 0.0 + generation_completed = False - for new_audio in streamer: - if new_audio.shape[0] == 0: - break + # Start the audio generation (blocking call) + def generate_audio(): + self.model.generate(**generation_kwargs) - audio_chunk = new_audio - audio_buffer.append(audio_chunk) - buffer_length_s += new_audio.shape[0] / sampling_rate + # Start the generation in a separate thread + generation_thread = Thread(target=generate_audio) + generation_thread.start() - # If we've buffered enough audio, start streaming the buffer - if buffer_length_s >= self.buffer_duration_s: + # Process the streamer in the main thread + while not generation_completed: + try: + new_audio = next(streamer) + if new_audio.shape[0] == 0: + # Streamer signaled completion + generation_completed = True + break + + audio_chunk = new_audio + audio_buffer.append(audio_chunk) + buffer_length_s += new_audio.shape[0] / sampling_rate + + if buffer_length_s >= self.buffer_duration_s: + # Buffering complete, start streaming + break + except StopIteration: + # No more audio data + generation_completed = True break # Queue the buffered audio chunks for buffered_chunk in audio_buffer: self.queue.put(buffered_chunk.tobytes()) - # Stream audio in real-time as it's generated after buffering - for new_audio in streamer: - if new_audio.shape[0] == 0: + # Continue streaming the rest of the audio + while not generation_completed: + try: + new_audio = next(streamer) + if new_audio.shape[0] == 0: + # Streamer signaled completion + generation_completed = True + break + audio_chunk = new_audio + self.queue.put(audio_chunk.tobytes()) + except StopIteration: + generation_completed = True break - audio_chunk = new_audio - self.queue.put(audio_chunk.tobytes()) + + # Ensure the generation thread has completed + generation_thread.join() def get_voices(self): """ diff --git a/setup.py b/setup.py index 01b3e12..9069f1b 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ def parse_requirements(filename): + [requirements["elevenlabs"]] + [requirements["openai"]] + [requirements["gtts"]] - + [requirements["coqui_tts"]] + + [requirements["coqui_tts"]], "system": base_requirements + [requirements["pyttsx3"]], "azure": base_requirements + [requirements["azure-cognitiveservices-speech"]], "elevenlabs": base_requirements + [requirements["elevenlabs"]], @@ -53,7 +53,7 @@ def parse_requirements(filename): setuptools.setup( name="RealTimeTTS", - version="0.4.8", + version="0.4.9", author="Kolja Beigel", author_email="kolja.beigel@web.de", description="Stream text into audio with an easy-to-use, highly configurable library delivering voice output with minimal latency.", diff --git a/tests/synthesis_de_coqui.wav b/tests/synthesis_de_coqui.wav deleted file mode 100644 index 76ff8e1..0000000 Binary files a/tests/synthesis_de_coqui.wav and /dev/null differ diff --git a/tests/synthesis_en_coqui.wav b/tests/synthesis_en_coqui.wav deleted file mode 100644 index 797413b..0000000 Binary files a/tests/synthesis_en_coqui.wav and /dev/null differ diff --git a/tests/synthesis_es_coqui.wav b/tests/synthesis_es_coqui.wav deleted file mode 100644 index 0c34ae2..0000000 Binary files a/tests/synthesis_es_coqui.wav and /dev/null differ diff --git a/tests/synthesis_fr_coqui.wav b/tests/synthesis_fr_coqui.wav deleted file mode 100644 index f5983e8..0000000 Binary files a/tests/synthesis_fr_coqui.wav and /dev/null differ diff --git a/tests/synthesis_it_coqui.wav b/tests/synthesis_it_coqui.wav deleted file mode 100644 index 43c7b08..0000000 Binary files a/tests/synthesis_it_coqui.wav and /dev/null differ diff --git a/tests/synthesis_ja_coqui.wav b/tests/synthesis_ja_coqui.wav deleted file mode 100644 index 1c54fe8..0000000 Binary files a/tests/synthesis_ja_coqui.wav and /dev/null differ diff --git a/tests/synthesis_ko_coqui.wav b/tests/synthesis_ko_coqui.wav deleted file mode 100644 index b1badf5..0000000 Binary files a/tests/synthesis_ko_coqui.wav and /dev/null differ diff --git a/tests/synthesis_pt_coqui.wav b/tests/synthesis_pt_coqui.wav deleted file mode 100644 index 5736ef4..0000000 Binary files a/tests/synthesis_pt_coqui.wav and /dev/null differ diff --git a/tests/synthesis_zh_coqui.wav b/tests/synthesis_zh_coqui.wav deleted file mode 100644 index 92b3213..0000000 Binary files a/tests/synthesis_zh_coqui.wav and /dev/null differ