
Commit 6f82e8b

some cleanups

KoljaB committed Nov 1, 2024
1 parent 868d5f8

Showing 13 changed files with 52 additions and 23 deletions.
README.md: 2 changes (1 addition, 1 deletion)
@@ -58,7 +58,7 @@ Let me know if you need any adjustments or additional languages!

## Updates

- Latest Version: v0.4.8
+ Latest Version: v0.4.9

See [release history](https://github.com/KoljaB/RealtimeTTS/releases).

RealtimeTTS/engines/coqui_engine.py: 9 changes (7 additions, 2 deletions)
@@ -78,6 +78,7 @@ def __init__(
comma_silence_duration=0.3,
sentence_silence_duration=0.6,
default_silence_duration=0.3,
+ print_realtime_factor=False,
):
"""
Initializes a coqui voice realtime text to speech engine object.
@@ -162,6 +163,7 @@ def __init__(
self.comma_silence_duration = comma_silence_duration
self.sentence_silence_duration = sentence_silence_duration
self.default_silence_duration = default_silence_duration
+ self.print_realtime_factor = print_realtime_factor

self.cloning_reference_wav = voice
self.speed = speed
@@ -258,6 +260,7 @@ def output_worker(queue):
self.comma_silence_duration,
self.sentence_silence_duration,
self.default_silence_duration,
+ self.print_realtime_factor,
),
)
self.synthesize_process.start()
@@ -298,6 +301,7 @@ def _synthesize_worker(
comma_silence_duration,
sentence_silence_duration,
default_silence_duration,
+ print_realtime_factor,
):
"""
Worker process for the coqui text to speech synthesis model.
@@ -724,8 +728,9 @@ def get_user_data_dir(appname):
raw_inference_factor = raw_inference_time / (
full_generated_seconds - first_chunk_length_seconds
)
- # print(realtime_factor)
- # print(raw_inference_factor)
+ if print_realtime_factor:
+     print(f"Realtime Factor: {realtime_factor}")
+     print(f"Raw Inference Factor: {raw_inference_factor}")

# Send silent audio
sample_rate = config.audio.sample_rate
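The new `print_realtime_factor` flag wires optional performance logging from the constructor through to the synthesis worker. A minimal usage sketch, assuming the library's standard entry points (`CoquiEngine`, `TextToAudioStream`); the sample text and surrounding setup are illustrative, not part of this commit:

    from RealtimeTTS import TextToAudioStream, CoquiEngine

    # Enable the realtime-factor logging added in this commit; all other
    # constructor arguments keep their defaults.
    engine = CoquiEngine(print_realtime_factor=True)

    stream = TextToAudioStream(engine)
    stream.feed("Quick check of synthesis speed.")
    # Once synthesis runs, the worker should print
    # "Realtime Factor: ..." and "Raw Inference Factor: ..." for each sentence.
    stream.play()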
RealtimeTTS/engines/parler_engine.py: 60 changes (42 additions, 18 deletions)
@@ -115,36 +115,60 @@ def _generate_and_queue_audio(self, text: str):
**self.voice_parameters, # Merge with any additional voice parameters
}

- # Start the audio generation in a separate thread
- generation_thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
- generation_thread.start()

- # Buffer audio for the desired duration before streaming
+ # Initialize variables for buffering
audio_buffer = []
buffer_length_s = 0.0
+ generation_completed = False

- for new_audio in streamer:
-     if new_audio.shape[0] == 0:
-         break
+ # Start the audio generation (blocking call)
+ def generate_audio():
+     self.model.generate(**generation_kwargs)

-     audio_chunk = new_audio
-     audio_buffer.append(audio_chunk)
-     buffer_length_s += new_audio.shape[0] / sampling_rate
+ # Start the generation in a separate thread
+ generation_thread = Thread(target=generate_audio)
+ generation_thread.start()

-     # If we've buffered enough audio, start streaming the buffer
-     if buffer_length_s >= self.buffer_duration_s:
+ # Process the streamer in the main thread
+ while not generation_completed:
+     try:
+         new_audio = next(streamer)
+         if new_audio.shape[0] == 0:
+             # Streamer signaled completion
+             generation_completed = True
+             break

+         audio_chunk = new_audio
+         audio_buffer.append(audio_chunk)
+         buffer_length_s += new_audio.shape[0] / sampling_rate

+         if buffer_length_s >= self.buffer_duration_s:
+             # Buffering complete, start streaming
+             break
+     except StopIteration:
+         # No more audio data
+         generation_completed = True
+         break

# Queue the buffered audio chunks
for buffered_chunk in audio_buffer:
    self.queue.put(buffered_chunk.tobytes())

- # Stream audio in real-time as it's generated after buffering
- for new_audio in streamer:
-     if new_audio.shape[0] == 0:
+ # Continue streaming the rest of the audio
+ while not generation_completed:
+     try:
+         new_audio = next(streamer)
+         if new_audio.shape[0] == 0:
+             # Streamer signaled completion
+             generation_completed = True
+             break
+         audio_chunk = new_audio
+         self.queue.put(audio_chunk.tobytes())
+     except StopIteration:
+         generation_completed = True
+         break
-     audio_chunk = new_audio
-     self.queue.put(audio_chunk.tobytes())

# Ensure the generation thread has completed
generation_thread.join()

def get_voices(self):
"""
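The rework above replaces the two `for new_audio in streamer` loops with explicit `next()` calls guarded by a `generation_completed` flag, so both the empty sentinel chunk and `StopIteration` end the loops cleanly while `model.generate` runs in a background thread. A stripped-down sketch of the same buffer-then-stream pattern, using a dummy chunk source and assumed sampling rate and buffer length in place of the engine's real streamer:

    import queue

    SAMPLING_RATE = 24_000        # assumed sample rate, only used for the duration math
    BUFFER_DURATION_S = 0.5       # stands in for self.buffer_duration_s

    def chunk_source():
        """Stand-in for the model's audio streamer."""
        for _ in range(20):
            yield [0.0] * 2_400   # 0.1 s of silence per chunk

    out_queue = queue.Queue()     # stands in for self.queue
    audio_buffer, buffered_s = [], 0.0
    chunks = chunk_source()

    # Phase 1: accumulate chunks until enough audio is buffered (or the source ends).
    for chunk in chunks:
        audio_buffer.append(chunk)
        buffered_s += len(chunk) / SAMPLING_RATE
        if buffered_s >= BUFFER_DURATION_S:
            break

    # Flush the buffered audio in one go.
    for chunk in audio_buffer:
        out_queue.put(chunk)

    # Phase 2: forward the remaining chunks as they are produced.
    for chunk in chunks:
        out_queue.put(chunk)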
setup.py: 4 changes (2 additions, 2 deletions)
@@ -40,7 +40,7 @@ def parse_requirements(filename):
+ [requirements["elevenlabs"]]
+ [requirements["openai"]]
+ [requirements["gtts"]]
-     + [requirements["coqui_tts"]]
+     + [requirements["coqui_tts"]],
"system": base_requirements + [requirements["pyttsx3"]],
"azure": base_requirements + [requirements["azure-cognitiveservices-speech"]],
"elevenlabs": base_requirements + [requirements["elevenlabs"]],
@@ -53,7 +53,7 @@ def parse_requirements(filename):

setuptools.setup(
name="RealTimeTTS",
version="0.4.8",
version="0.4.9",
author="Kolja Beigel",
author_email="[email protected]",
description="Stream text into audio with an easy-to-use, highly configurable library delivering voice output with minimal latency.",
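The added trailing comma separates the "all" entry from the "system" key that follows it. A schematic sketch of how such an extras mapping composes the base requirements with per-engine extras; the requirement strings and variable contents here are placeholders, not the project's actual dependency pins:

    # Placeholder requirement strings, not the project's real dependency list.
    base_requirements = ["pyaudio", "numpy"]
    requirements = {"coqui_tts": "coqui-tts", "pyttsx3": "pyttsx3"}

    extras_require = {
        "all": base_requirements
        + [requirements["coqui_tts"]],   # the trailing comma closes the "all" entry
        "system": base_requirements + [requirements["pyttsx3"]],
    }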
Binary file removed tests/synthesis_de_coqui.wav
Binary file removed tests/synthesis_en_coqui.wav
Binary file removed tests/synthesis_es_coqui.wav
Binary file removed tests/synthesis_fr_coqui.wav
Binary file removed tests/synthesis_it_coqui.wav
Binary file removed tests/synthesis_ja_coqui.wav
Binary file removed tests/synthesis_ko_coqui.wav
Binary file removed tests/synthesis_pt_coqui.wav
Binary file removed tests/synthesis_zh_coqui.wav
