Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix and enable XTTS streaming #478

Open
wants to merge 9 commits into
base: alltalkbeta
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions script.py
Original file line number Diff line number Diff line change
Expand Up @@ -3309,6 +3309,7 @@ def on_load(request: gr.Request):
gen_lang = gr.Dropdown(
value=config.api_def.api_language,
choices=[
"auto",
"ar",
"zh",
"cs",
Expand Down
1 change: 1 addition & 0 deletions system/requirements/requirements_colab.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,4 @@ piper-tts; sys_platform == "linux"
plotly==5.24.1
scipy==1.14.1
pyOpenSSL>=24.2.1
langdetect>=1.0.9
1 change: 1 addition & 0 deletions system/requirements/requirements_standalone.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,4 @@ fastapi==0.112.2
plotly==5.24.1
scipy==1.14.1
pyOpenSSL>=24.2.1
langdetect>=1.0.9
1 change: 1 addition & 0 deletions system/requirements/requirements_textgen.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,4 @@ piper-phonemize==1.1.0; sys_platform == "darwin"
plotly==5.24.1
scipy==1.14.1
pyOpenSSL>=24.2.1
langdetect>=1.0.9
6 changes: 4 additions & 2 deletions system/tts_engines/f5tts/f5tts_settings_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def transcribe_files(model_name: str, use_cpu: bool = False, progress=gr.Progres
# dictionaries with the values provided as arguments, and save the updated settings back to the JSON file.
#
# You do not need to modify the function's logic or any other part of the code.
def f5tts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr):
def f5tts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr):
# Load the model_config_data from the JSON file
with open(os.path.join(this_dir, "model_settings.json"), "r") as f:
model_config_data = json.load(f)
Expand All @@ -155,6 +155,7 @@ def f5tts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, l
model_config_data["openai_voices"]["shimmer"] = shimmer_gr
model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled"
model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled"
model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled"
model_config_data["settings"]["temperature_set"] = temperature_set_gr
model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr
model_config_data["settings"]["pitch_set"] = pitch_set_gr
Expand Down Expand Up @@ -192,6 +193,7 @@ def f5tts_model_alltalk_settings(model_config_data):
with gr.Row():
lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"])
deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"])
streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"])
temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"])
repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"])
pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"])
Expand Down Expand Up @@ -226,7 +228,7 @@ def f5tts_model_alltalk_settings(model_config_data):
with gr.Row():
gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown")
gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown")
submit_button.click(f5tts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message)
submit_button.click(f5tts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message)

###########################################################################################
# Do not change this section apart from "TTS Engine Name" value to match your engine name #
Expand Down
5 changes: 5 additions & 0 deletions system/tts_engines/f5tts/help_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,11 @@ class AllTalkHelpContent:
- Accelerates TTS generation using optimized inference
- Only available for engines and models that support DeepSpeed
- Requires NVIDIA GPU with CUDA support

- **Stream Response Capability**
- Enables real-time streaming of generated speech output
- Reduces latency for faster feedback during synthesis
- Only available for engines and models that support Streaming

- **Temperature Control**
- Adjusts the variability in speech generation
Expand Down
16 changes: 15 additions & 1 deletion system/tts_engines/f5tts/model_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ def __init__(self):
self.def_character_voice = tts_model_loaded["settings"]["def_character_voice"] # What is the current default main/character voice that will be used if no voice specified.
self.def_narrator_voice = tts_model_loaded["settings"]["def_narrator_voice"] # What is the current default narrator voice that will be used if no voice specified.
self.deepspeed_enabled = tts_model_loaded["settings"]["deepspeed_enabled"] # If its available, is DeepSpeed enabled for the TTS engine
self.streaming_enabled = tts_model_loaded["settings"]["streaming_enabled"] # If its available, is Streaming enabled for the TTS engine
self.engine_installed = tts_model_loaded["settings"]["engine_installed"] # Has the TTS engine been setup/installed (not currently used)
self.generationspeed_set = tts_model_loaded["settings"]["generationspeed_set"] # What is the set/stored speed for generation.
self.lowvram_enabled = tts_model_loaded["settings"]["lowvram_enabled"] # If its available, is LowVRAM enabled for the TTS engine
Expand Down Expand Up @@ -400,9 +401,16 @@ def scan_models_folder(self):
if model_dir.is_dir():
# First try to find model_*.safetensors files
model_files = list(model_dir.glob("model_*.safetensors"))
if not model_files:
# Try finding the pt model file as fallback
# If no model_*.safetensors found, try finding a .pt model file
model_files = list(model_dir.glob("model_*.pt"))
if not model_files:
# If no model_*.safetensors found, try any .safetensors file
model_files = list(model_dir.glob("*.safetensors"))
if not model_files:
# If no model_*.safetensors found, try any .pt file
model_files = list(model_dir.glob("*.pt"))

vocab_file = model_dir / "vocab.txt"
vocos_dir = model_dir / "vocos"
Expand Down Expand Up @@ -508,9 +516,15 @@ async def api_manual_load_model(self, model_name):

# Dynamically find the safetensors model file
model_files = list(model_dir.glob("model_*.safetensors"))
if not model_files:
# Try finding the pt model file as fallback
model_files = list(model_dir.glob("model_*.pt"))
if not model_files:
# Try finding any safetensors file as fallback
model_files = list(model_dir.glob("*.safetensors"))
if not model_files:
# Try finding any pt file as fallback
model_files = list(model_dir.glob("*.pt"))

if not model_files:
print(f"[{self.branding}ENG] \033[91mError\033[0m: No model's safetensors file was found in the F5-TTS models directory.")
Expand Down Expand Up @@ -1082,7 +1096,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena

generate_end_time = time.time()
generate_elapsed_time = generate_end_time - generate_start_time
print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m")
print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m")

if streaming:
with open(output_file, 'rb') as f:
Expand Down
1 change: 1 addition & 0 deletions system/tts_engines/f5tts/model_settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
"def_character_voice": "female_01.wav",
"def_narrator_voice": "female_01.wav",
"deepspeed_enabled": false,
"streaming_enabled": false,
"engine_installed": true,
"generationspeed_set": 0.9,
"lowvram_enabled": true,
Expand Down
5 changes: 5 additions & 0 deletions system/tts_engines/parler/help_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,11 @@ class AllTalkHelpContent:
- Accelerates TTS generation using optimized inference
- Only available for engines and models that support DeepSpeed
- Requires NVIDIA GPU with CUDA support

- **Stream Response Capability**
- Enables real-time streaming of generated speech output
- Reduces latency for faster feedback during synthesis
- Only available for engines and models that support Streaming

- **Temperature Control**
- Adjusts the variability in speech generation
Expand Down
3 changes: 2 additions & 1 deletion system/tts_engines/parler/model_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ def __init__(self):
self.def_character_voice = tts_model_loaded["settings"]["def_character_voice"] # What is the current default main/character voice that will be used if no voice specified.
self.def_narrator_voice = tts_model_loaded["settings"]["def_narrator_voice"] # What is the current default narrator voice that will be used if no voice specified.
self.deepspeed_enabled = tts_model_loaded["settings"]["deepspeed_enabled"] # If its available, is DeepSpeed enabled for the TTS engine
self.streaming_enabled = tts_model_loaded["settings"]["streaming_enabled"] # If its available, is Streaming enabled for the TTS engine
self.engine_installed = tts_model_loaded["settings"]["engine_installed"] # Has the TTS engine been setup/installed (not currently used)
self.generationspeed_set = tts_model_loaded["settings"]["generationspeed_set"] # What is the set/stored speed for generation.
self.lowvram_enabled = tts_model_loaded["settings"]["lowvram_enabled"] # If its available, is LowVRAM enabled for the TTS engine
Expand Down Expand Up @@ -503,7 +504,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena
# ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓
generate_end_time = time.time() # Record the end time to generate TTS
generate_elapsed_time = generate_end_time - generate_start_time
print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m")
print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m")
if self.lowvram_enabled and self.device == "cuda" and self.tts_narrator_generatingtts == False:
await self.handle_lowvram_change()
self.tts_generating_lock = False # Unlock the TTS generation queue to allow TTS generation requests to come in again.
Expand Down
1 change: 1 addition & 0 deletions system/tts_engines/parler/model_settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
"def_character_voice": "enthusiastic_female",
"def_narrator_voice": "enthusiastic_female",
"deepspeed_enabled": false,
"streaming_enabled": false,
"engine_installed": true,
"generationspeed_set": 1,
"lowvram_enabled": false,
Expand Down
6 changes: 4 additions & 2 deletions system/tts_engines/parler/parler_settings_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def parler_voices_file_list():
# dictionaries with the values provided as arguments, and save the updated settings back to the JSON file.
#
# You do not need to modify the function's logic or any other part of the code.
def parler_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr):
def parler_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr):
# Load the model_config_data from the JSON file
with open(os.path.join(this_dir, "model_settings.json"), "r") as f:
model_config_data = json.load(f)
Expand All @@ -67,6 +67,7 @@ def parler_model_update_settings(def_character_voice_gr, def_narrator_voice_gr,
model_config_data["openai_voices"]["shimmer"] = shimmer_gr
model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled"
model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled"
model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled"
model_config_data["settings"]["temperature_set"] = temperature_set_gr
model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr
model_config_data["settings"]["pitch_set"] = pitch_set_gr
Expand Down Expand Up @@ -104,6 +105,7 @@ def parler_model_alltalk_settings(model_config_data):
with gr.Row():
lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"])
deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"])
streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"])
temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"])
repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"])
pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"])
Expand Down Expand Up @@ -132,7 +134,7 @@ def parler_model_alltalk_settings(model_config_data):
with gr.Row():
submit_button = gr.Button("Update Settings")
output_message = gr.Textbox(label="Output Message", interactive=False, show_label=False)
submit_button.click(parler_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message)
submit_button.click(parler_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message)
with gr.Accordion("HELP - 🔊 Understanding TTS Engine Default Settings Page", open=False):
with gr.Row():
gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS, elem_classes="custom-markdown")
Expand Down
5 changes: 5 additions & 0 deletions system/tts_engines/piper/help_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,11 @@ class AllTalkHelpContent:
- Accelerates TTS generation using optimized inference
- Only available for engines and models that support DeepSpeed
- Requires NVIDIA GPU with CUDA support

- **Stream Response Capability**
- Enables real-time streaming of generated speech output
- Reduces latency for faster feedback during synthesis
- Only available for engines and models that support Streaming

- **Temperature Control**
- Adjusts the variability in speech generation
Expand Down
Loading