
Commit 6f82e8b

some cleanups

KoljaB committed Nov 1, 2024
1 parent 868d5f8

Showing 13 changed files with 52 additions and 23 deletions.
README.md: 2 changes (1 addition, 1 deletion)
@@ -58,7 +58,7 @@ Let me know if you need any adjustments or additional languages!

## Updates

- Latest Version: v0.4.8
+ Latest Version: v0.4.9

See [release history](https://github.com/KoljaB/RealtimeTTS/releases).

RealtimeTTS/engines/coqui_engine.py: 9 changes (7 additions, 2 deletions)
@@ -78,6 +78,7 @@ def __init__(
comma_silence_duration=0.3,
sentence_silence_duration=0.6,
default_silence_duration=0.3,
+ print_realtime_factor=False,
):
"""
Initializes a coqui voice realtime text to speech engine object.
@@ -162,6 +163,7 @@ def __init__(
self.comma_silence_duration = comma_silence_duration
self.sentence_silence_duration = sentence_silence_duration
self.default_silence_duration = default_silence_duration
+ self.print_realtime_factor = print_realtime_factor

self.cloning_reference_wav = voice
self.speed = speed
@@ -258,6 +260,7 @@ def output_worker(queue):
self.comma_silence_duration,
self.sentence_silence_duration,
self.default_silence_duration,
+ self.print_realtime_factor,
),
)
self.synthesize_process.start()
@@ -298,6 +301,7 @@ def _synthesize_worker(
comma_silence_duration,
sentence_silence_duration,
default_silence_duration,
+ print_realtime_factor,
):
"""
Worker process for the coqui text to speech synthesis model.
@@ -724,8 +728,9 @@ def get_user_data_dir(appname):
raw_inference_factor = raw_inference_time / (
full_generated_seconds - first_chunk_length_seconds
)
- # print(realtime_factor)
- # print(raw_inference_factor)
+ if print_realtime_factor:
+     print(f"Realtime Factor: {realtime_factor}")
+     print(f"Raw Inference Factor: {raw_inference_factor}")

# Send silent audio
sample_rate = config.audio.sample_rate
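The new `print_realtime_factor` flag wires optional performance logging from the constructor through to the synthesis worker. A minimal usage sketch, assuming the library's standard entry points (`CoquiEngine`, `TextToAudioStream`); the sample text and surrounding setup are illustrative, not part of this commit:

    from RealtimeTTS import TextToAudioStream, CoquiEngine

    # Enable the realtime-factor logging added in this commit; all other
    # constructor arguments keep their defaults.
    engine = CoquiEngine(print_realtime_factor=True)

    stream = TextToAudioStream(engine)
    stream.feed("Quick check of synthesis speed.")
    # Once synthesis runs, the worker should print
    # "Realtime Factor: ..." and "Raw Inference Factor: ..." for each sentence.
    stream.play()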
RealtimeTTS/engines/parler_engine.py: 60 changes (42 additions, 18 deletions)
@@ -115,36 +115,60 @@ def _generate_and_queue_audio(self, text: str):
**self.voice_parameters, # Merge with any additional voice parameters
}

- # Start the audio generation in a separate thread
- generation_thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
- generation_thread.start()

- # Buffer audio for the desired duration before streaming
+ # Initialize variables for buffering
audio_buffer = []
buffer_length_s = 0.0
+ generation_completed = False

- for new_audio in streamer:
-     if new_audio.shape[0] == 0:
-         break
+ # Start the audio generation (blocking call)
+ def generate_audio():
+     self.model.generate(**generation_kwargs)

-     audio_chunk = new_audio
-     audio_buffer.append(audio_chunk)
-     buffer_length_s += new_audio.shape[0] / sampling_rate
+ # Start the generation in a separate thread
+ generation_thread = Thread(target=generate_audio)
+ generation_thread.start()

-     # If we've buffered enough audio, start streaming the buffer
-     if buffer_length_s >= self.buffer_duration_s:
+ # Process the streamer in the main thread
+ while not generation_completed:
+     try:
+         new_audio = next(streamer)
+         if new_audio.shape[0] == 0:
+             # Streamer signaled completion
+             generation_completed = True
+             break

+         audio_chunk = new_audio
+         audio_buffer.append(audio_chunk)
+         buffer_length_s += new_audio.shape[0] / sampling_rate

+         if buffer_length_s >= self.buffer_duration_s:
+             # Buffering complete, start streaming
+             break
+     except StopIteration:
+         # No more audio data
+         generation_completed = True
+         break

# Queue the buffered audio chunks
for buffered_chunk in audio_buffer:
    self.queue.put(buffered_chunk.tobytes())

- # Stream audio in real-time as it's generated after buffering
- for new_audio in streamer:
-     if new_audio.shape[0] == 0:
+ # Continue streaming the rest of the audio
+ while not generation_completed:
+     try:
+         new_audio = next(streamer)
+         if new_audio.shape[0] == 0:
+             # Streamer signaled completion
+             generation_completed = True
+             break
+         audio_chunk = new_audio
+         self.queue.put(audio_chunk.tobytes())
+     except StopIteration:
+         generation_completed = True
+         break
-     audio_chunk = new_audio
-     self.queue.put(audio_chunk.tobytes())

# Ensure the generation thread has completed
generation_thread.join()

def get_voices(self):
"""
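The rework above replaces the two `for new_audio in streamer` loops with explicit `next()` calls guarded by a `generation_completed` flag, so both the empty sentinel chunk and `StopIteration` end the loops cleanly while `model.generate` runs in a background thread. A stripped-down sketch of the same buffer-then-stream pattern, using a dummy chunk source and assumed sampling rate and buffer length in place of the engine's real streamer:

    import queue

    SAMPLING_RATE = 24_000        # assumed sample rate, only used for the duration math
    BUFFER_DURATION_S = 0.5       # stands in for self.buffer_duration_s

    def chunk_source():
        """Stand-in for the model's audio streamer."""
        for _ in range(20):
            yield [0.0] * 2_400   # 0.1 s of silence per chunk

    out_queue = queue.Queue()     # stands in for self.queue
    audio_buffer, buffered_s = [], 0.0
    chunks = chunk_source()

    # Phase 1: accumulate chunks until enough audio is buffered (or the source ends).
    for chunk in chunks:
        audio_buffer.append(chunk)
        buffered_s += len(chunk) / SAMPLING_RATE
        if buffered_s >= BUFFER_DURATION_S:
            break

    # Flush the buffered audio in one go.
    for chunk in audio_buffer:
        out_queue.put(chunk)

    # Phase 2: forward the remaining chunks as they are produced.
    for chunk in chunks:
        out_queue.put(chunk)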
setup.py: 4 changes (2 additions, 2 deletions)
@@ -40,7 +40,7 @@ def parse_requirements(filename):
+ [requirements["elevenlabs"]]
+ [requirements["openai"]]
+ [requirements["gtts"]]
-     + [requirements["coqui_tts"]]
+     + [requirements["coqui_tts"]],
"system": base_requirements + [requirements["pyttsx3"]],
"azure": base_requirements + [requirements["azure-cognitiveservices-speech"]],
"elevenlabs": base_requirements + [requirements["elevenlabs"]],
@@ -53,7 +53,7 @@ def parse_requirements(filename):

setuptools.setup(
name="RealTimeTTS",
version="0.4.8",
version="0.4.9",
author="Kolja Beigel",
author_email="[email protected]",
description="Stream text into audio with an easy-to-use, highly configurable library delivering voice output with minimal latency.",
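The added trailing comma separates the "all" entry from the "system" key that follows it. A schematic sketch of how such an extras mapping composes the base requirements with per-engine extras; the requirement strings and variable contents here are placeholders, not the project's actual dependency pins:

    # Placeholder requirement strings, not the project's real dependency list.
    base_requirements = ["pyaudio", "numpy"]
    requirements = {"coqui_tts": "coqui-tts", "pyttsx3": "pyttsx3"}

    extras_require = {
        "all": base_requirements
        + [requirements["coqui_tts"]],   # the trailing comma closes the "all" entry
        "system": base_requirements + [requirements["pyttsx3"]],
    }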
Binary file removed tests/synthesis_de_coqui.wav
Binary file removed tests/synthesis_en_coqui.wav
Binary file removed tests/synthesis_es_coqui.wav
Binary file removed tests/synthesis_fr_coqui.wav
Binary file removed tests/synthesis_it_coqui.wav
Binary file removed tests/synthesis_ja_coqui.wav
Binary file removed tests/synthesis_ko_coqui.wav
Binary file removed tests/synthesis_pt_coqui.wav
Binary file removed tests/synthesis_zh_coqui.wav
