From e72a7bf5c9c331f69e852d50e39edc471192c105 Mon Sep 17 00:00:00 2001 From: Ilyas Hilali Date: Fri, 3 Jan 2025 17:50:52 +0100 Subject: [PATCH 1/7] Add initial support for pickletensor models to F5-TTS * Tested with the @RASPIAUDIO French model available here: https://huggingface.co/RASPIAUDIO/F5-French-MixedSpeakers-reduced --- system/tts_engines/f5tts/model_engine.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/system/tts_engines/f5tts/model_engine.py b/system/tts_engines/f5tts/model_engine.py index edb57b5..a6f62e6 100644 --- a/system/tts_engines/f5tts/model_engine.py +++ b/system/tts_engines/f5tts/model_engine.py @@ -400,9 +400,16 @@ def scan_models_folder(self): if model_dir.is_dir(): # First try to find model_*.safetensors files model_files = list(model_dir.glob("model_*.safetensors")) + if not model_files: + # If no model_*.safetensors file found, + # try finding a model_*.pt file as fallback + model_files = list(model_dir.glob("model_*.pt")) if not model_files: # If no model_*.safetensors found, try any .safetensors file model_files = list(model_dir.glob("*.safetensors")) + if not model_files: + # If no .safetensors file found, try any .pt file + model_files = list(model_dir.glob("*.pt")) vocab_file = model_dir / "vocab.txt" vocos_dir = model_dir / "vocos" @@ -508,9 +515,15 @@ async def api_manual_load_model(self, model_name): # Dynamically find the safetensors model file model_files = list(model_dir.glob("model_*.safetensors")) + if not model_files: + # Try finding a model_*.pt file as fallback + model_files = list(model_dir.glob("model_*.pt")) if not model_files: # Try finding any safetensors file as fallback model_files = list(model_dir.glob("*.safetensors")) + if not model_files: + # Try finding any .pt file as fallback + model_files = list(model_dir.glob("*.pt")) if not model_files: print(f"[{self.branding}ENG] \033[91mError\033[0m: No model's safetensors file was found in the F5-TTS models directory.") From ecb15200dc3a4c6408089a72f2467cb839031db8 Mon Sep 17 00:00:00 2001 From: Ilyas Date: Sat, 4 Jan 2025 00:38:52 +0100 Subject: [PATCH 2/7] Add language auto-detection * adds langdetect as a requirement for colab, standalone and textgen * adds "auto" to the language dropdown in the Advanced Engine/Model Settings panel * replaces the hardcoded "en" with "auto" when called via the OpenAI-compatible Speech API --- script.py | 1 + system/requirements/requirements_colab.txt | 1 + .../requirements/requirements_standalone.txt | 1 + system/requirements/requirements_textgen.txt | 1 + tts_server.py | 26 +++++++++++++++++-- 5 files changed, 28 insertions(+), 2 deletions(-) diff --git a/script.py b/script.py index 2c0a93c..cd3605f 100644 --- a/script.py +++ b/script.py @@ -3309,6 +3309,7 @@ def on_load(request: gr.Request): gen_lang = gr.Dropdown( value=config.api_def.api_language, choices=[ + "auto", "ar", "zh", "cs", diff --git a/system/requirements/requirements_colab.txt b/system/requirements/requirements_colab.txt index 5d978fc..a3344af 100644 --- a/system/requirements/requirements_colab.txt +++ b/system/requirements/requirements_colab.txt @@ -46,3 +46,4 @@ piper-tts; sys_platform == "linux" plotly==5.24.1 scipy==1.14.1 pyOpenSSL>=24.2.1 +langdetect>=1.0.9 diff --git a/system/requirements/requirements_standalone.txt b/system/requirements/requirements_standalone.txt index dfc3c0e..3c47c36 100644 --- a/system/requirements/requirements_standalone.txt +++ b/system/requirements/requirements_standalone.txt @@ -36,3 +36,4 @@ fastapi==0.112.2 plotly==5.24.1
scipy==1.14.1 pyOpenSSL>=24.2.1 +langdetect>=1.0.9 diff --git a/system/requirements/requirements_textgen.txt b/system/requirements/requirements_textgen.txt index 2007867..f89b30e 100644 --- a/system/requirements/requirements_textgen.txt +++ b/system/requirements/requirements_textgen.txt @@ -33,3 +33,4 @@ piper-phonemize==1.1.0; sys_platform == "darwin" plotly==5.24.1 scipy==1.14.1 pyOpenSSL>=24.2.1 +langdetect>=1.0.9 diff --git a/tts_server.py b/tts_server.py index 07630ee..aabdb6a 100644 --- a/tts_server.py +++ b/tts_server.py @@ -39,9 +39,13 @@ import numpy as np import soundfile as sf import librosa +from langdetect import detect, DetectorFactory +from langdetect.lang_detect_exception import LangDetectException from config import AlltalkConfig, AlltalkTTSEnginesConfig logging.disable(logging.WARNING) +DetectorFactory.seed = 0 # Ensure deterministic behavior + ######################################################################################## # START-UP # Silence RVC warning about torch.nn.utils.weight_norm even though not used # ######################################################################################## @@ -938,6 +942,9 @@ async def generate_audio(text, voice, language, temperature, repetition_penalty, print_message("each TTS Engine in the 'Engine Information' section of the Gradio interface.", "warning", "GEN") raise ValueError("Streaming not supported by current TTS engine") + if language == "auto": + language = detect_language(text) + response = model_engine.generate_tts(text, voice, language, temperature, repetition_penalty, speed, pitch, output_file, streaming) if streaming: @@ -1138,7 +1145,7 @@ async def openai_tts_generate(request: Request): else: print_message(f"{cleaned_string[:90]}{'...' if len(cleaned_string) > 90 else ''}", component="TTS") - await generate_audio(cleaned_string, mapped_voice, "en", model_engine.temperature_set, + await generate_audio(cleaned_string, mapped_voice, "auto", model_engine.temperature_set, model_engine.repetitionpenalty_set, speed, model_engine.pitch_set, output_file_path, streaming=False) @@ -1605,7 +1612,7 @@ class JSONInput(BaseModel): rvcnarrator_voice_gen: str = Field(..., description="rvcnarrator_voice_gen needs to be the name of a valid pth file in the 'folder\\file.pth' format or the word 'Disabled'.") rvcnarrator_pitch: float = Field(..., description="RVC Narrator pitch needs to be a number between -24 and 24") text_not_inside: str = Field(..., pattern="^(character|narrator|silent)$", description="text_not_inside needs to be 'character', 'narrator' or 'silent'.") - language: str = Field(..., pattern="^(ar|zh-cn|zh|cs|nl|en|fr|de|hu|hi|it|ja|ko|pl|pt|ru|es|tr)$", description="language needs to be one of the following: ar, zh-cn, zh, cs, nl, en, fr, de, hu, hi, it, ja, ko, pl, pt, ru, es, tr.") + language: str = Field(..., pattern="^(auto|ar|zh-cn|zh|cs|nl|en|fr|de|hu|hi|it|ja|ko|pl|pt|ru|es|tr)$", description="language needs to be one of the following: auto, ar, zh-cn, zh, cs, nl, en, fr, de, hu, hi, it, ja, ko, pl, pt, ru, es, tr.") output_file_name: str = Field(..., pattern="^[a-zA-Z0-9_]+$", description="output_file_name needs to be the name without any special characters or file extension, e.g., 'filename'.") output_file_timestamp: bool = Field(..., description="output_file_timestamp needs to be true or false.") autoplay: bool = Field(..., description="autoplay needs to be a true or false value.") @@ -2098,6 +2105,21 @@ async def tts_finalize_output(audio_files: List[Path], params: dict) -> Tuple[Pa return 
output_file_path, output_file_url, output_cache_url +def detect_language(text: str) -> str: + """ + Detect the language of the given text. + + :param text: Text to analyze. + :return: Detected language code (e.g., 'en', 'fr'). + """ + try: + detected_lang = detect(text) + print_message(f"Detected language: {detected_lang}", "debug", "LANG_DETECTION") + return detected_lang + except LangDetectException as e: + print_message(f"Language detection error: {str(e)}", "error", "LANG_DETECTION") + raise ValueError("Could not detect language") + @app.post("/api/tts-generate", response_class=JSONResponse) async def apifunction_generate_tts_standard( text_input: str = Form(...), From 72ae93d6cb1adc5a4979571f141d95d056c08ae4 Mon Sep 17 00:00:00 2001 From: Ilyas Hilali Date: Sat, 4 Jan 2025 14:01:42 +0100 Subject: [PATCH 3/7] Add streaming flag on tts settings --- system/tts_engines/f5tts/f5tts_settings_page.py | 6 ++++-- system/tts_engines/f5tts/help_content.py | 5 +++++ system/tts_engines/f5tts/model_settings.json | 1 + system/tts_engines/parler/help_content.py | 5 +++++ system/tts_engines/parler/model_settings.json | 1 + system/tts_engines/parler/parler_settings_page.py | 6 ++++-- system/tts_engines/piper/help_content.py | 5 +++++ system/tts_engines/piper/model_settings.json | 1 + system/tts_engines/piper/piper_settings_page.py | 6 ++++-- system/tts_engines/template-tts-engine/help_content.py | 5 +++++ .../template-tts-engine/model_settings.json | 1 + .../template-tts-engine/modelname_settings_page.py | 6 ++++-- system/tts_engines/vits/help_content.py | 5 +++++ system/tts_engines/vits/model_settings.json | 1 + system/tts_engines/vits/vits_settings_page.py | 6 ++++-- system/tts_engines/xtts/help_content.py | 10 ++++++++++ system/tts_engines/xtts/model_engine.py | 1 + system/tts_engines/xtts/model_settings.json | 1 + system/tts_engines/xtts/xtts_settings_page.py | 6 ++++-- 19 files changed, 66 insertions(+), 12 deletions(-) diff --git a/system/tts_engines/f5tts/f5tts_settings_page.py b/system/tts_engines/f5tts/f5tts_settings_page.py index f58802d..5b6e03d 100644 --- a/system/tts_engines/f5tts/f5tts_settings_page.py +++ b/system/tts_engines/f5tts/f5tts_settings_page.py @@ -140,7 +140,7 @@ def transcribe_files(model_name: str, use_cpu: bool = False, progress=gr.Progres # dictionaries with the values provided as arguments, and save the updated settings back to the JSON file. # # You do not need to modify the function's logic or any other part of the code. 
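For context on the language auto-detection added in PATCH 2/7: langdetect is non-deterministic across runs unless its RNG is seeded, which is why tts_server.py pins `DetectorFactory.seed = 0` before the `detect_language` helper is ever called. A minimal standalone sketch of the same usage (the sample strings are illustrative, not from the patch):

```python
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

DetectorFactory.seed = 0  # pin the RNG so repeated calls give stable results

samples = [
    "The quick brown fox jumps over the lazy dog.",         # expect 'en'
    "Le renard brun saute par-dessus le chien paresseux.",  # expect 'fr'
]
for text in samples:
    try:
        print(text[:25], "->", detect(text))
    except LangDetectException:
        # Raised when the input has no detectable language features,
        # e.g. an empty string or digits-only text; the patch maps this
        # to a ValueError at the API layer.
        print(text[:25], "-> detection failed")
```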
-def f5tts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): +def f5tts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): # Load the model_config_data from the JSON file with open(os.path.join(this_dir, "model_settings.json"), "r") as f: model_config_data = json.load(f) @@ -155,6 +155,7 @@ def f5tts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, l model_config_data["openai_voices"]["shimmer"] = shimmer_gr model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled" model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled" + model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled" model_config_data["settings"]["temperature_set"] = temperature_set_gr model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr model_config_data["settings"]["pitch_set"] = pitch_set_gr @@ -192,6 +193,7 @@ def f5tts_model_alltalk_settings(model_config_data): with gr.Row(): lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"]) deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"]) + streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"]) temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"]) repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"]) pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"]) @@ -226,7 +228,7 @@ def f5tts_model_alltalk_settings(model_config_data): with gr.Row(): gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown") 
gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown") - submit_button.click(f5tts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) + submit_button.click(f5tts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) ########################################################################################### # Do not change this section apart from "TTS Engine Name" value to match your engine name # diff --git a/system/tts_engines/f5tts/help_content.py b/system/tts_engines/f5tts/help_content.py index d339c25..9163a5e 100644 --- a/system/tts_engines/f5tts/help_content.py +++ b/system/tts_engines/f5tts/help_content.py @@ -237,6 +237,11 @@ class AllTalkHelpContent: - Accelerates TTS generation using optimized inference - Only available for engines and models that support DeepSpeed - Requires NVIDIA GPU with CUDA support + + - **Stream Response Capability** + - Enables real-time streaming of generated speech output + - Reduces latency for faster feedback during synthesis + - Only available for engines and models that support Streaming - **Temperature Control** - Adjusts the variability in speech generation diff --git a/system/tts_engines/f5tts/model_settings.json b/system/tts_engines/f5tts/model_settings.json index 034d0a4..80865ec 100644 --- a/system/tts_engines/f5tts/model_settings.json +++ b/system/tts_engines/f5tts/model_settings.json @@ -24,6 +24,7 @@ "def_character_voice": "female_01.wav", "def_narrator_voice": "female_01.wav", "deepspeed_enabled": false, + "streaming_enabled": false, "engine_installed": true, "generationspeed_set": 0.9, "lowvram_enabled": true, diff --git a/system/tts_engines/parler/help_content.py b/system/tts_engines/parler/help_content.py index f111302..592a0dd 100644 --- a/system/tts_engines/parler/help_content.py +++ b/system/tts_engines/parler/help_content.py @@ -237,6 +237,11 @@ class AllTalkHelpContent: - Accelerates TTS generation using optimized inference - Only available for engines and models that support DeepSpeed - Requires NVIDIA GPU with CUDA support + + - **Stream Response Capability** + - Enables real-time streaming of generated speech output + - Reduces latency for faster feedback during synthesis + - Only available for engines and models that support Streaming - **Temperature Control** - Adjusts the variability in speech generation diff --git a/system/tts_engines/parler/model_settings.json b/system/tts_engines/parler/model_settings.json index 580f5cb..5d19066 100644 --- a/system/tts_engines/parler/model_settings.json +++ b/system/tts_engines/parler/model_settings.json @@ -24,6 +24,7 @@ "def_character_voice": "enthusiastic_female", "def_narrator_voice": "enthusiastic_female", "deepspeed_enabled": false, + "streaming_enabled": false, "engine_installed": true, "generationspeed_set": 1, "lowvram_enabled": false, diff --git a/system/tts_engines/parler/parler_settings_page.py b/system/tts_engines/parler/parler_settings_page.py index e2b1fc8..ef354a6 100644 --- a/system/tts_engines/parler/parler_settings_page.py +++ 
b/system/tts_engines/parler/parler_settings_page.py @@ -52,7 +52,7 @@ def parler_voices_file_list(): # dictionaries with the values provided as arguments, and save the updated settings back to the JSON file. # # You do not need to modify the function's logic or any other part of the code. -def parler_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): +def parler_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): # Load the model_config_data from the JSON file with open(os.path.join(this_dir, "model_settings.json"), "r") as f: model_config_data = json.load(f) @@ -67,6 +67,7 @@ def parler_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, model_config_data["openai_voices"]["shimmer"] = shimmer_gr model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled" model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled" + model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled" model_config_data["settings"]["temperature_set"] = temperature_set_gr model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr model_config_data["settings"]["pitch_set"] = pitch_set_gr @@ -104,6 +105,7 @@ def parler_model_alltalk_settings(model_config_data): with gr.Row(): lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"]) deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"]) + streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"]) temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"]) repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"]) pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if 
model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"]) @@ -132,7 +134,7 @@ def parler_model_alltalk_settings(model_config_data): with gr.Row(): submit_button = gr.Button("Update Settings") output_message = gr.Textbox(label="Output Message", interactive=False, show_label=False) - submit_button.click(parler_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) + submit_button.click(parler_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) with gr.Accordion("HELP - 🔊 Understanding TTS Engine Default Settings Page", open=False): with gr.Row(): gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS, elem_classes="custom-markdown") diff --git a/system/tts_engines/piper/help_content.py b/system/tts_engines/piper/help_content.py index 4f32cb7..417fc67 100644 --- a/system/tts_engines/piper/help_content.py +++ b/system/tts_engines/piper/help_content.py @@ -237,6 +237,11 @@ class AllTalkHelpContent: - Accelerates TTS generation using optimized inference - Only available for engines and models that support DeepSpeed - Requires NVIDIA GPU with CUDA support + + - **Stream Response Capability** + - Enables real-time streaming of generated speech output + - Reduces latency for faster feedback during synthesis + - Only available for engines and models that support Streaming - **Temperature Control** - Adjusts the variability in speech generation diff --git a/system/tts_engines/piper/model_settings.json b/system/tts_engines/piper/model_settings.json index d934545..182d892 100644 --- a/system/tts_engines/piper/model_settings.json +++ b/system/tts_engines/piper/model_settings.json @@ -24,6 +24,7 @@ "def_character_voice": "en_US-ljspeech-high.onnx", "def_narrator_voice": "en_US-ljspeech-high.onnx", "deepspeed_enabled": false, + "streaming_enabled": false, "engine_installed": true, "generationspeed_set": 1, "lowvram_enabled": false, diff --git a/system/tts_engines/piper/piper_settings_page.py b/system/tts_engines/piper/piper_settings_page.py index fa8708d..c20df0f 100644 --- a/system/tts_engines/piper/piper_settings_page.py +++ b/system/tts_engines/piper/piper_settings_page.py @@ -197,7 +197,7 @@ def download_language_pack(lang_code, progress=gr.Progress()): # dictionaries with the values provided as arguments, and save the updated settings back to the JSON file. # # You do not need to modify the function's logic or any other part of the code. 
-def piper_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): +def piper_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): # Load the model_config_data from the JSON file with open(os.path.join(this_dir, "model_settings.json"), "r") as f: model_config_data = json.load(f) @@ -212,6 +212,7 @@ def piper_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, l model_config_data["openai_voices"]["shimmer"] = shimmer_gr model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled" model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled" + model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled" model_config_data["settings"]["temperature_set"] = temperature_set_gr model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr model_config_data["settings"]["pitch_set"] = pitch_set_gr @@ -249,6 +250,7 @@ def piper_model_alltalk_settings(model_config_data): with gr.Row(): lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"]) deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"]) + streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"]) temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"]) repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"]) pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"]) @@ -283,7 +285,7 @@ def piper_model_alltalk_settings(model_config_data): with gr.Row(): gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown") 
gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown") - submit_button.click(piper_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) + submit_button.click(piper_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) ########################################################################################### # Do not change this section apart from "TTS Engine Name" value to match your engine name # diff --git a/system/tts_engines/template-tts-engine/help_content.py b/system/tts_engines/template-tts-engine/help_content.py index 277f974..74bbf26 100644 --- a/system/tts_engines/template-tts-engine/help_content.py +++ b/system/tts_engines/template-tts-engine/help_content.py @@ -237,6 +237,11 @@ class AllTalkHelpContent: - Accelerates TTS generation using optimized inference - Only available for engines and models that support DeepSpeed - Requires NVIDIA GPU with CUDA support + + - **Stream Response Capability** + - Enables real-time streaming of generated speech output + - Reduces latency for faster feedback during synthesis + - Only available for engines and models that support Streaming - **Temperature Control** - Adjusts the variability in speech generation diff --git a/system/tts_engines/template-tts-engine/model_settings.json b/system/tts_engines/template-tts-engine/model_settings.json index a770754..edb78c3 100644 --- a/system/tts_engines/template-tts-engine/model_settings.json +++ b/system/tts_engines/template-tts-engine/model_settings.json @@ -24,6 +24,7 @@ "def_character_voice": "female_01.wav", "def_narrator_voice": "male_01.wav", "deepspeed_enabled": true, + "streaming_enabled": false, "engine_installed": true, "generationspeed_set": 1, "lowvram_enabled": false, diff --git a/system/tts_engines/template-tts-engine/modelname_settings_page.py b/system/tts_engines/template-tts-engine/modelname_settings_page.py index 8fb2b4f..dc0a337 100644 --- a/system/tts_engines/template-tts-engine/modelname_settings_page.py +++ b/system/tts_engines/template-tts-engine/modelname_settings_page.py @@ -48,7 +48,7 @@ def xtts_voices_file_list(): # # You do not need to modify the function's logic or any other part of the code. 
-def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): +def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): # Load the model_config_data from the JSON file with open(os.path.join(this_dir, "model_settings.json"), "r") as f: model_config_data = json.load(f) @@ -64,6 +64,7 @@ def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lo model_config_data["openai_voices"]["shimmer"] = shimmer_gr model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled" model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled" + model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled" model_config_data["settings"]["temperature_set"] = temperature_set_gr model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr model_config_data["settings"]["pitch_set"] = pitch_set_gr @@ -104,6 +105,7 @@ def xtts_model_alltalk_settings(model_config_data): with gr.Row(): lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"]) deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"]) + streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"]) temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"]) repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"]) pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"]) @@ -138,7 +140,7 @@ def xtts_model_alltalk_settings(model_config_data): with gr.Row(): gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown") 
gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown") - submit_button.click(xtts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) + submit_button.click(xtts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) ########################################################################################### # Do not change this section apart from "TTS Engine Name" value to match your engine name # diff --git a/system/tts_engines/vits/help_content.py b/system/tts_engines/vits/help_content.py index 21d48c6..1bcb19f 100644 --- a/system/tts_engines/vits/help_content.py +++ b/system/tts_engines/vits/help_content.py @@ -237,6 +237,11 @@ class AllTalkHelpContent: - Accelerates TTS generation using optimized inference - Only available for engines and models that support DeepSpeed - Requires NVIDIA GPU with CUDA support + + - **Stream Response Capability** + - Enables real-time streaming of generated speech output + - Reduces latency for faster feedback during synthesis + - Only available for engines and models that support Streaming - **Temperature Control** - Adjusts the variability in speech generation diff --git a/system/tts_engines/vits/model_settings.json b/system/tts_engines/vits/model_settings.json index f50609e..3dffcf7 100644 --- a/system/tts_engines/vits/model_settings.json +++ b/system/tts_engines/vits/model_settings.json @@ -24,6 +24,7 @@ "def_character_voice": "p225", "def_narrator_voice": "p226", "deepspeed_enabled": false, + "streaming_enabled": false, "engine_installed": true, "generationspeed_set": 1, "lowvram_enabled": true, diff --git a/system/tts_engines/vits/vits_settings_page.py b/system/tts_engines/vits/vits_settings_page.py index 3232698..bd76f99 100644 --- a/system/tts_engines/vits/vits_settings_page.py +++ b/system/tts_engines/vits/vits_settings_page.py @@ -231,7 +231,7 @@ def download_language_pack(lang_code, progress=gr.Progress()): # dictionaries with the values provided as arguments, and save the updated settings back to the JSON file. # # You do not need to modify the function's logic or any other part of the code. 
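Stepping back from the per-engine diffs: PATCH 3/7 applies one persistence pattern to every settings page, i.e. read the engine's model_settings.json, map the Gradio "Enabled"/"Disabled" radio string to a boolean, and write the file back. A distilled sketch of that shared pattern (the helper name and the write-back details such as `indent=4` are illustrative, since the save portion is outside the shown hunks; the JSON key matches the patch):

```python
import json
import os

def persist_streaming_choice(engine_dir: str, streaming_choice: str) -> None:
    """Store the Streaming radio selection in the engine's model_settings.json."""
    settings_path = os.path.join(engine_dir, "model_settings.json")
    with open(settings_path, "r") as f:
        model_config_data = json.load(f)
    # The Gradio radio returns the string "Enabled" or "Disabled";
    # comparing against "Enabled" stores a real boolean in the JSON.
    model_config_data["settings"]["streaming_enabled"] = streaming_choice == "Enabled"
    with open(settings_path, "w") as f:
        json.dump(model_config_data, f, indent=4)
```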
-def vits_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): +def vits_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): # Load the model_config_data from the JSON file with open(os.path.join(this_dir, "model_settings.json"), "r") as f: model_config_data = json.load(f) @@ -246,6 +246,7 @@ def vits_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lo model_config_data["openai_voices"]["shimmer"] = shimmer_gr model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled" model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled" + model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled" model_config_data["settings"]["temperature_set"] = temperature_set_gr model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr model_config_data["settings"]["pitch_set"] = pitch_set_gr @@ -283,6 +284,7 @@ def vits_model_alltalk_settings(model_config_data): with gr.Row(): lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"]) deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"]) + streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"]) temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"]) repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"]) pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"]) @@ -317,7 +319,7 @@ def vits_model_alltalk_settings(model_config_data): with gr.Row(): gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown") 
gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown") - submit_button.click(vits_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) + submit_button.click(vits_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) ########################################################################################### # Do not change this section apart from "TTS Engine Name" value to match your engine name # diff --git a/system/tts_engines/xtts/help_content.py b/system/tts_engines/xtts/help_content.py index cc03f8b..f6cc68e 100644 --- a/system/tts_engines/xtts/help_content.py +++ b/system/tts_engines/xtts/help_content.py @@ -237,6 +237,11 @@ class AllTalkHelpContent: - Accelerates TTS generation using optimized inference - Only available for engines and models that support DeepSpeed - Requires NVIDIA GPU with CUDA support + + - **Stream Response Capability** + - Enables real-time streaming of generated speech output + - Reduces latency for faster feedback during synthesis + - Only available for engines and models that support Streaming - **Temperature Control** - Adjusts the variability in speech generation @@ -310,6 +315,11 @@ class AllTalkHelpContent: - Requires NVIDIA GPU with CUDA support - 2-3x speed improvement in generation - Recommended when available + + - **Streaming Support** + - Enables real-time streaming of generated speech output + - Reduces latency for faster feedback during synthesis + - Only available for engines and models that support Streaming - **Multi-Language Support** - Clone voices across multiple languages diff --git a/system/tts_engines/xtts/model_engine.py b/system/tts_engines/xtts/model_engine.py index c450488..10ad2cc 100644 --- a/system/tts_engines/xtts/model_engine.py +++ b/system/tts_engines/xtts/model_engine.py @@ -350,6 +350,7 @@ def __init__(self): self.def_character_voice = model_settings_file["settings"]["def_character_voice"] self.def_narrator_voice = model_settings_file["settings"]["def_narrator_voice"] self.deepspeed_enabled = model_settings_file["settings"]["deepspeed_enabled"] + self.streaming_enabled = model_settings_file["settings"]["streaming_enabled"] self.engine_installed = model_settings_file["settings"]["engine_installed"] self.generationspeed_set = model_settings_file["settings"]["generationspeed_set"] self.lowvram_enabled = model_settings_file["settings"]["lowvram_enabled"] diff --git a/system/tts_engines/xtts/model_settings.json b/system/tts_engines/xtts/model_settings.json index b680cfa..a56af38 100644 --- a/system/tts_engines/xtts/model_settings.json +++ b/system/tts_engines/xtts/model_settings.json @@ -24,6 +24,7 @@ "def_character_voice": "female_01.wav", "def_narrator_voice": "male_01.wav", "deepspeed_enabled": false, + "streaming_enabled": false, "engine_installed": true, "generationspeed_set": 1, "lowvram_enabled": false, diff --git a/system/tts_engines/xtts/xtts_settings_page.py b/system/tts_engines/xtts/xtts_settings_page.py index cb3f7e7..ae2ef32 100644 --- a/system/tts_engines/xtts/xtts_settings_page.py +++ 
b/system/tts_engines/xtts/xtts_settings_page.py @@ -49,7 +49,7 @@ def xtts_voices_file_list(): # # You do not need to modify the function's logic or any other part of the code. -def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): +def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): # Load the model_config_data from the JSON file with open(os.path.join(this_dir, "model_settings.json"), "r") as f: model_config_data = json.load(f) @@ -65,6 +65,7 @@ def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lo model_config_data["openai_voices"]["shimmer"] = shimmer_gr model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled" model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled" + model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled" model_config_data["settings"]["temperature_set"] = temperature_set_gr model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr model_config_data["settings"]["pitch_set"] = pitch_set_gr @@ -105,6 +106,7 @@ def xtts_model_alltalk_settings(model_config_data): with gr.Row(): lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"]) deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"]) + streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"]) temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"]) repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"]) pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"]) 
@@ -139,7 +141,7 @@ def xtts_model_alltalk_settings(model_config_data): with gr.Row(): gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown") gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown") - submit_button.click(xtts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) + submit_button.click(xtts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) ########################################################################################### # Do not change this section apart from "TTS Engine Name" value to match your engine name # From f7c7800dce07defa4d5267ed9f908fa3e2b883d3 Mon Sep 17 00:00:00 2001 From: Ilyas Hilali Date: Sat, 4 Jan 2025 14:03:08 +0100 Subject: [PATCH 4/7] Use streaming flag within OpenAI Speech API --- tts_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tts_server.py b/tts_server.py index aabdb6a..11e6385 100644 --- a/tts_server.py +++ b/tts_server.py @@ -1147,7 +1147,7 @@ async def openai_tts_generate(request: Request): await generate_audio(cleaned_string, mapped_voice, "auto", model_engine.temperature_set, model_engine.repetitionpenalty_set, speed, model_engine.pitch_set, - output_file_path, streaming=False) + output_file_path, model_engine.streaming_enabled) print_message(f"Audio generated at: {output_file_path}", "debug_openai", "TTS") From 7b79c301a5873e396b060584085c3b9335890cad Mon Sep 17 00:00:00 2001 From: Ilyas Hilali Date: Sat, 4 Jan 2025 14:23:45 +0100 Subject: [PATCH 5/7] Add streaming status to logs --- system/tts_engines/f5tts/model_engine.py | 3 ++- system/tts_engines/parler/model_engine.py | 3 ++- system/tts_engines/piper/model_engine.py | 3 ++- system/tts_engines/template-tts-engine/model_engine.py | 3 ++- system/tts_engines/vits/model_engine.py | 2 +- system/tts_engines/xtts/model_engine.py | 2 +- 6 files changed, 10 insertions(+), 6 deletions(-) diff --git a/system/tts_engines/f5tts/model_engine.py b/system/tts_engines/f5tts/model_engine.py index a6f62e6..c08fe00 100644 --- a/system/tts_engines/f5tts/model_engine.py +++ b/system/tts_engines/f5tts/model_engine.py @@ -144,6 +144,7 @@ def __init__(self): self.def_character_voice = tts_model_loaded["settings"]["def_character_voice"] # What is the current default main/character voice that will be used if no voice specified. self.def_narrator_voice = tts_model_loaded["settings"]["def_narrator_voice"] # What is the current default narrator voice that will be used if no voice specified. self.deepspeed_enabled = tts_model_loaded["settings"]["deepspeed_enabled"] # If its available, is DeepSpeed enabled for the TTS engine + self.streaming_enabled = tts_model_loaded["settings"]["streaming_enabled"] # If its available, is Streaming enabled for the TTS engine self.engine_installed = tts_model_loaded["settings"]["engine_installed"] # Has the TTS engine been setup/installed (not curently used) self.generationspeed_set = tts_model_loaded["settings"]["generationspeed_set"] # What is the set/stored speed for generation. 
self.lowvram_enabled = tts_model_loaded["settings"]["lowvram_enabled"] # If its available, is LowVRAM enabled for the TTS engine @@ -1095,7 +1096,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena generate_end_time = time.time() generate_elapsed_time = generate_end_time - generate_start_time - print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m") + print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m") if streaming: with open(output_file, 'rb') as f: diff --git a/system/tts_engines/parler/model_engine.py b/system/tts_engines/parler/model_engine.py index 8338774..27c7d21 100644 --- a/system/tts_engines/parler/model_engine.py +++ b/system/tts_engines/parler/model_engine.py @@ -91,6 +91,7 @@ def __init__(self): self.def_character_voice = tts_model_loaded["settings"]["def_character_voice"] # What is the current default main/character voice that will be used if no voice specified. self.def_narrator_voice = tts_model_loaded["settings"]["def_narrator_voice"] # What is the current default narrator voice that will be used if no voice specified. self.deepspeed_enabled = tts_model_loaded["settings"]["deepspeed_enabled"] # If its available, is DeepSpeed enabled for the TTS engine + self.streaming_enabled = tts_model_loaded["settings"]["streaming_enabled"] # If its available, is Streaming enabled for the TTS engine self.engine_installed = tts_model_loaded["settings"]["engine_installed"] # Has the TTS engine been setup/installed (not curently used) self.generationspeed_set = tts_model_loaded["settings"]["generationspeed_set"] # What is the set/stored speed for generation. self.lowvram_enabled = tts_model_loaded["settings"]["lowvram_enabled"] # If its available, is LowVRAM enabled for the TTS engine @@ -503,7 +504,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena # ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓ generate_end_time = time.time() # Record the end time to generate TTS generate_elapsed_time = generate_end_time - generate_start_time - print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m") + print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m") if self.lowvram_enabled and self.device == "cuda" and self.tts_narrator_generatingtts == False: await self.handle_lowvram_change() self.tts_generating_lock = False # Unlock the TTS generation queue to allow TTS generation requests to come in again. diff --git a/system/tts_engines/piper/model_engine.py b/system/tts_engines/piper/model_engine.py index 438e7be..5d86ff1 100644 --- a/system/tts_engines/piper/model_engine.py +++ b/system/tts_engines/piper/model_engine.py @@ -90,6 +90,7 @@ def __init__(self): self.def_character_voice = tts_model_loaded["settings"]["def_character_voice"] # What is the current default main/character voice that will be used if no voice specified. 
self.def_narrator_voice = tts_model_loaded["settings"]["def_narrator_voice"] # What is the current default narrator voice that will be used if no voice specified. self.deepspeed_enabled = tts_model_loaded["settings"]["deepspeed_enabled"] # If its available, is DeepSpeed enabled for the TTS engine + self.streaming_enabled = tts_model_loaded["settings"]["streaming_enabled"] # If its available, is Streaming enabled for the TTS engine self.engine_installed = tts_model_loaded["settings"]["engine_installed"] # Has the TTS engine been setup/installed (not curently used) self.generationspeed_set = tts_model_loaded["settings"]["generationspeed_set"] # What is the set/stored speed for generation. self.lowvram_enabled = tts_model_loaded["settings"]["lowvram_enabled"] # If its available, is LowVRAM enabled for the TTS engine @@ -468,5 +469,5 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena # ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓ generate_end_time = time.time() generate_elapsed_time = generate_end_time - generate_start_time - print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m") + print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m") self.tts_generating_lock = False diff --git a/system/tts_engines/template-tts-engine/model_engine.py b/system/tts_engines/template-tts-engine/model_engine.py index 5f78bb6..48f67c2 100644 --- a/system/tts_engines/template-tts-engine/model_engine.py +++ b/system/tts_engines/template-tts-engine/model_engine.py @@ -92,6 +92,7 @@ def __init__(self): self.def_character_voice = tts_model_loaded["settings"]["def_character_voice"] # What is the current default main/character voice that will be used if no voice specified. self.def_narrator_voice = tts_model_loaded["settings"]["def_narrator_voice"] # What is the current default narrator voice that will be used if no voice specified. self.deepspeed_enabled = tts_model_loaded["settings"]["deepspeed_enabled"] # If its available, is DeepSpeed enabled for the TTS engine + self.streaming_enabled = tts_model_loaded["settings"]["streaming_enabled"] # If its available, is Streaming enabled for the TTS engine self.engine_installed = tts_model_loaded["settings"]["engine_installed"] # Has the TTS engine been setup/installed (not curently used) self.generationspeed_set = tts_model_loaded["settings"]["generationspeed_set"] # What is the set/stored speed for generation. self.lowvram_enabled = tts_model_loaded["settings"]["lowvram_enabled"] # If its available, is LowVRAM enabled for the TTS engine @@ -417,5 +418,5 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena # ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓ generate_end_time = time.time() generate_elapsed_time = generate_end_time - generate_start_time - print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m") + print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. 
\033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m") self.tts_generating_lock = False diff --git a/system/tts_engines/vits/model_engine.py b/system/tts_engines/vits/model_engine.py index 9e514ba..adfb48e 100644 --- a/system/tts_engines/vits/model_engine.py +++ b/system/tts_engines/vits/model_engine.py @@ -638,7 +638,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena # ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓ generate_end_time = time.time() # Record the end time to generate TTS generate_elapsed_time = generate_end_time - generate_start_time - print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m") + print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m") if self.lowvram_enabled and self.device == "cuda" and self.tts_narrator_generatingtts == False: await self.handle_lowvram_change() self.tts_generating_lock = False # Unlock the TTS generation queue to allow TTS generation requests to come in again. diff --git a/system/tts_engines/xtts/model_engine.py b/system/tts_engines/xtts/model_engine.py index 10ad2cc..94aceb9 100644 --- a/system/tts_engines/xtts/model_engine.py +++ b/system/tts_engines/xtts/model_engine.py @@ -1155,7 +1155,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena # Standard output message (not debug) self.print_message( - f"\033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m", + f"\033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m", message_type="standard" ) From 0c925196221febf2500444977355d03a40aaa084 Mon Sep 17 00:00:00 2001 From: Ilyas Hilali Date: Sat, 4 Jan 2025 18:24:35 +0100 Subject: [PATCH 6/7] Fix XTTS streaming mode --- system/tts_engines/xtts/model_engine.py | 149 ++++++++++++------------ tts_server.py | 99 ++++++++++------ 2 files changed, 134 insertions(+), 114 deletions(-) diff --git a/system/tts_engines/xtts/model_engine.py b/system/tts_engines/xtts/model_engine.py index 94aceb9..ad5a69e 100644 --- a/system/tts_engines/xtts/model_engine.py +++ b/system/tts_engines/xtts/model_engine.py @@ -24,7 +24,6 @@ Note: You can add new functions, just DONT remove the functions that are already there, even if they are doing nothing as `tts_server.py` will still look for their existance and fail if they are missing. 
""" - ######################################## # Default imports # Do not change this # ######################################## @@ -968,7 +967,44 @@ async def handle_tts_method_change(self, tts_method): self.print_message(f"\033[94mModel Loadtime: \033[93m{generate_elapsed_time:.2f}\033[94m seconds\033[0m") return True - async def generate_tts(self, text, voice, language, temperature, repetition_penalty, speed, pitch, output_file, streaming): + async def prepare_voice_inputs(self, voice): + """Prepares latents and embeddings based on the voice input.""" + gpt_cond_latent = None + speaker_embedding = None + + if voice.startswith('latent:'): + if self.current_model_loaded.startswith("xtts"): + gpt_cond_latent, speaker_embedding = self._load_latents(voice) + + elif voice.startswith('voiceset:'): + voice_set = voice.replace("voiceset:", "") + voice_set_path = os.path.join(self.main_dir, "voices", "xtts_multi_voice_sets", voice_set) + self.print_message(f"Processing voice set from: {voice_set_path}", message_type="debug_tts") + + wavs_files = glob.glob(os.path.join(voice_set_path, "*.wav")) + if not wavs_files: + self.print_message(f"No WAV files found in voice set: {voice_set}", message_type="error") + raise HTTPException(status_code=400, detail=f"No WAV files found in voice set: {voice_set}") + + if len(wavs_files) > 5: + wavs_files = random.sample(wavs_files, 5) + self.print_message(f"Using 5 random samples from voice set", message_type="debug_tts") + + if self.current_model_loaded.startswith("xtts"): + gpt_cond_latent, speaker_embedding = self._generate_conditioning_latents(wavs_files) + + else: + normalized_path = os.path.normpath(os.path.join(self.main_dir, "voices", voice)) + wavs_files = [normalized_path] + self.print_message(f"Using single voice sample: {normalized_path}", message_type="debug_tts") + + if self.current_model_loaded.startswith("xtts"): + gpt_cond_latent, speaker_embedding = self._generate_conditioning_latents(wavs_files) + + return gpt_cond_latent, speaker_embedding + + async def generate_tts(self, text, voice, language, temperature, repetition_penalty, speed, pitch, output_file, + streaming): """ Generate speech from text using the XTTS model. 
@@ -1018,71 +1054,33 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena
         generate_start_time = time.time()
 
         try:
-            # Voice input processing
-            self.print_message(f"Processing voice input: {voice}", message_type="debug_tts")
-            gpt_cond_latent = None
-            speaker_embedding = None
-
-            # Handle different voice types
-            if voice.startswith('latent:'):
-                if self.current_model_loaded.startswith("xtts"):
-                    gpt_cond_latent, speaker_embedding = self._load_latents(voice)
-
-            elif voice.startswith('voiceset:'):
-                voice_set = voice.replace("voiceset:", "")
-                voice_set_path = os.path.join(self.main_dir, "voices", "xtts_multi_voice_sets", voice_set)
-                self.print_message(f"Processing voice set from: {voice_set_path}", message_type="debug_tts")
-
-                wavs_files = glob.glob(os.path.join(voice_set_path, "*.wav"))
-                if not wavs_files:
-                    self.print_message(f"No WAV files found in voice set: {voice_set}", message_type="error")
-                    raise HTTPException(status_code=400, detail=f"No WAV files found in voice set: {voice_set}")
-
-                if len(wavs_files) > 5:
-                    wavs_files = random.sample(wavs_files, 5)
-                    self.print_message(f"Using 5 random samples from voice set", message_type="debug_tts")
-
-                if self.current_model_loaded.startswith("xtts"):
-                    self.print_message("Generating conditioning latents from voice set", message_type="debug_tts")
-                    gpt_cond_latent, speaker_embedding = self._generate_conditioning_latents(wavs_files)
-
-            else:
-                normalized_path = os.path.normpath(os.path.join(self.main_dir, "voices", voice))
-                wavs_files = [normalized_path]
-                self.print_message(f"Using single voice sample: {normalized_path}", message_type="debug_tts")
-
-                if self.current_model_loaded.startswith("xtts"):
-                    self.print_message("Generating conditioning latents from single sample", message_type="debug_tts")
-                    gpt_cond_latent, speaker_embedding = self._generate_conditioning_latents(wavs_files)
-
-            # Generate speech
+            # Preparation of latents and embeddings
+            gpt_cond_latent, speaker_embedding = await self.prepare_voice_inputs(voice)
+
+            common_args = {
+                "text": text,
+                "language": language,
+                "gpt_cond_latent": gpt_cond_latent,
+                "speaker_embedding": speaker_embedding,
+                "temperature": float(temperature),
+                "length_penalty": float(self.model.config.length_penalty),
+                "repetition_penalty": float(repetition_penalty),
+                "top_k": int(self.model.config.top_k),
+                "top_p": float(self.model.config.top_p),
+                "speed": float(speed),
+                "enable_text_splitting": True
+            }
+
+            self.print_message("Generation settings:", message_type="debug_tts_variables")
+            self.print_message(f"├─ Temperature: {temperature}", message_type="debug_tts_variables")
+            self.print_message(f"├─ Speed: {speed}", message_type="debug_tts_variables")
+            self.print_message(f"├─ Language: {language}", message_type="debug_tts_variables")
+            self.print_message(f"└─ Text length: {len(text)} characters", message_type="debug_tts_variables")
+
+            # Handle streaming vs non-streaming
             if self.current_model_loaded.startswith("xtts"):
-                self.print_message(f"Generating speech for text: {text}", message_type="debug_tts")
-
-                common_args = {
-                    "text": text,
-                    "language": language,
-                    "gpt_cond_latent": gpt_cond_latent,
-                    "speaker_embedding": speaker_embedding,
-                    "temperature": float(temperature),
-                    "length_penalty": float(self.model.config.length_penalty),
-                    "repetition_penalty": float(repetition_penalty),
-                    "top_k": int(self.model.config.top_k),
-                    "top_p": float(self.model.config.top_p),
-                    "speed": float(speed),
-                    "enable_text_splitting": True
-                }
-
-                self.print_message("Generation settings:", message_type="debug_tts_variables")
-                self.print_message(f"├─ Temperature: {temperature}", message_type="debug_tts_variables")
-                self.print_message(f"├─ Speed: {speed}", message_type="debug_tts_variables")
-                self.print_message(f"├─ Language: {language}", message_type="debug_tts_variables")
-                self.print_message(f"└─ Text length: {len(text)} characters", message_type="debug_tts_variables")
-
-                # Handle streaming vs non-streaming
                 if streaming:
                     self.print_message("Starting streaming generation", message_type="debug_tts")
-                    self.print_message(f"Using streaming-based generation and files {wavs_files}")
                     output = self.model.inference_stream(**common_args, stream_chunk_size=20)
 
                     file_chunks = []
@@ -1102,7 +1100,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena
                             self.tts_generating_lock = False
                             break
-                        self.print_message(f"Processing chunk {i+1}", message_type="debug_tts")
+                        self.print_message(f"Processing chunk {i + 1}", message_type="debug_tts")
                         file_chunks.append(chunk)
                         if isinstance(chunk, list):
                             chunk = torch.cat(chunk, dim=0)
@@ -1119,9 +1117,9 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena
 
         elif self.current_model_loaded.startswith("apitts"):
             if streaming:
-                raise ValueError("Streaming is only supported in XTTSv2 local mode")
+                raise ValueError("Streaming is not supported in APITTS mode")
             # Common arguments for both error and normal cases
-            common_args = {
+            api_args = {
                 "file_path": output_file,
                 "language": language,
                 "temperature": temperature,
@@ -1129,23 +1127,20 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena
                 "repetition_penalty": repetition_penalty,
                 "top_k": self.model.config.top_k,
                 "top_p": self.model.config.top_p,
-                "speed": speed
-            }
-            if voice.startswith('latent:'):
+                "speed": speed,
+            }
+
+            if voice.startswith("latent:"):
                 self.print_message("API TTS method does not support latent files - Please use an audio reference file", message_type="error")
                 self.model.tts_to_file(
                     text="The API TTS method only supports audio files not latents. Please select an audio reference file instead.",
                     speaker="Ana Florence",
-                    **common_args
+                    **api_args,
                 )
             else:
                 self.print_message("Using API-based generation", message_type="debug_tts")
-                self.model.tts_to_file(
-                    text=text,
-                    speaker_wav=wavs_files,
-                    **common_args
-                )
-
+                self.model.tts_to_file(text=text, speaker_wav=[voice], **api_args)
+
             self.print_message(f"API generation completed, saved to: {output_file}", message_type="debug_tts")
 
         finally:
diff --git a/tts_server.py b/tts_server.py
index 11e6385..fdcaa22 100644
--- a/tts_server.py
+++ b/tts_server.py
@@ -945,22 +945,34 @@ async def generate_audio(text, voice, language, temperature, repetition_penalty,
     if language == "auto":
         language = detect_language(text)
 
-    response = model_engine.generate_tts(text, voice, language, temperature, repetition_penalty, speed, pitch, output_file, streaming)
-
+    # Streaming mode
     if streaming:
-        async def stream_response():
+        print_message("Streaming mode enabled", "debug", "TTS")
+        response = model_engine.generate_tts(
+            text, voice, language, temperature, repetition_penalty, speed, pitch, output_file=None, streaming=True
+        )
+
+        async def stream_audio():
             try:
                 async for chunk in response:
                     yield chunk
             except Exception as e:
                 print_message(f"Error during streaming audio generation: {str(e)}", "error", "GEN")
                 raise
-        return stream_response()
+
+        return stream_audio()
+
+    # Non-streaming mode
+    print_message("Non-streaming mode enabled", "debug", "TTS")
+    response = model_engine.generate_tts(
+        text, voice, language, temperature, repetition_penalty, speed, pitch, output_file, streaming=False
+    )
+
     try:
         async for _ in response:
             pass
     except Exception as e:
-        print_message(f"Error during audio generation: {str(e)}", "error", "GEN")
+        print_message(f"Error during audio generation: {str(e)}", "error", "TTS")
         raise
 
 ###########################
@@ -1110,22 +1122,24 @@ async def openai_tts_generate(request: Request):
         # Extract and validate parameters
         input_text = json_data["input"]
         voice = json_data["voice"]
-        response_format = json_data.get("response_format", "wav").lower()
         speed = json_data.get("speed", 1.0)
 
         print_message(f"Input text: {input_text}", "debug_openai", "TTS")
         print_message(f"Voice: {voice}", "debug_openai", "TTS")
         print_message(f"Speed: {speed}", "debug_openai", "TTS")
 
+        # Load current model engine configuration
+        current_model_engine = tts_class()
+
         # Process text and map voice
         cleaned_string = html.unescape(standard_filtering(input_text))
         voice_mapping = {
-            "alloy": model_engine.openai_alloy,
-            "echo": model_engine.openai_echo,
-            "fable": model_engine.openai_fable,
-            "nova": model_engine.openai_nova,
-            "onyx": model_engine.openai_onyx,
-            "shimmer": model_engine.openai_shimmer
+            "alloy": current_model_engine.openai_alloy,
+            "echo": current_model_engine.openai_echo,
+            "fable": current_model_engine.openai_fable,
+            "nova": current_model_engine.openai_nova,
+            "onyx": current_model_engine.openai_onyx,
+            "shimmer": current_model_engine.openai_shimmer
         }
 
         mapped_voice = voice_mapping.get(voice)
@@ -1135,37 +1149,48 @@ async def openai_tts_generate(request: Request):
 
         print_message(f"Mapped voice: {mapped_voice}", "debug_openai", "TTS")
 
-        # Generate audio
-        unique_id = uuid.uuid4()
-        timestamp = int(time.time())
-        output_file_path = f'{this_dir / config.get_output_directory() / f"openai_output_{unique_id}_{timestamp}.{model_engine.audio_format}"}'
-
-        if config.debugging.debug_fullttstext:
-            print_message(cleaned_string, component="TTS")
+        if current_model_engine.streaming_enabled:
+            audio_stream = await generate_audio(
+                cleaned_string, mapped_voice, "auto", current_model_engine.temperature_set,
+                float(str(current_model_engine.repetitionpenalty_set).replace(',', '.')), speed, current_model_engine.pitch_set,
+                output_file=None, streaming=True
+            )
+            return StreamingResponse(audio_stream, media_type="audio/wav")
         else:
-            print_message(f"{cleaned_string[:90]}{'...' if len(cleaned_string) > 90 else ''}", component="TTS")
+            # Generate audio
+            unique_id = uuid.uuid4()
+            timestamp = int(time.time())
+            output_file_path = f'{this_dir / config.get_output_directory() / f"openai_output_{unique_id}_{timestamp}.{current_model_engine.audio_format}"}'
+            response_format = json_data.get("response_format", "wav").lower()
+
+            if config.debugging.debug_fullttstext:
+                print_message(cleaned_string, component="TTS")
+            else:
+                print_message(f"{cleaned_string[:90]}{'...' if len(cleaned_string) > 90 else ''}", component="TTS")
 
-        await generate_audio(cleaned_string, mapped_voice, "auto", model_engine.temperature_set,
-                             model_engine.repetitionpenalty_set, speed, model_engine.pitch_set,
-                             output_file_path, model_engine.streaming_enabled)
+            await generate_audio(
+                cleaned_string, mapped_voice, "auto", current_model_engine.temperature_set,
+                float(str(current_model_engine.repetitionpenalty_set).replace(',', '.')), speed, current_model_engine.pitch_set,
+                output_file_path, streaming=False
+            )
 
-        print_message(f"Audio generated at: {output_file_path}", "debug_openai", "TTS")
+            print_message(f"Audio generated at: {output_file_path}", "debug_openai", "TTS")
 
-        # Handle RVC processing
-        if config.rvc_settings.rvc_enabled:
-            if config.rvc_settings.rvc_char_model_file.lower() in ["disabled", "disable"]:
-                print_message("Pass rvccharacter_voice_gen", "debug_openai", "TTS")
-            else:
-                print_message("send to rvc", "debug_openai", "TTS")
-                pth_path = this_dir / "models" / "rvc_voices" / config.rvc_settings.rvc_char_model_file
-                pitch = config.rvc_settings.pitch
-                run_rvc(output_file_path, pth_path, pitch, infer_pipeline)
+            # Handle RVC processing
+            if config.rvc_settings.rvc_enabled:
+                if config.rvc_settings.rvc_char_model_file.lower() in ["disabled", "disable"]:
+                    print_message("Pass rvccharacter_voice_gen", "debug_openai", "TTS")
+                else:
+                    print_message("send to rvc", "debug_openai", "TTS")
+                    pth_path = this_dir / "models" / "rvc_voices" / config.rvc_settings.rvc_char_model_file
+                    pitch = config.rvc_settings.pitch
+                    run_rvc(output_file_path, pth_path, pitch, infer_pipeline)
 
-        transcoded_file_path = await transcode_for_openai(output_file_path, response_format)
-        print_message(f"Audio transcoded to: {transcoded_file_path}", "debug_openai", "TTS")
+            transcoded_file_path = await transcode_for_openai(output_file_path, response_format)
+            print_message(f"Audio transcoded to: {transcoded_file_path}", "debug_openai", "TTS")
 
-        response = FileResponse(transcoded_file_path, media_type=f"audio/{response_format}",
-                                filename=f"output.{response_format}")
+            return FileResponse(transcoded_file_path, media_type=f"audio/{response_format}",
+                                filename=f"output.{response_format}")
 
     except ValueError as e:
         print_message(f"Value error occurred: {str(e)}", "error", "TTS")

From c83faf9e386c04cdb4ff333a8f0e490399f97712 Mon Sep 17 00:00:00 2001
From: Ilyas Hilali
Date: Sun, 5 Jan 2025 18:15:40 +0100
Subject: [PATCH 7/7] Remove debug-purpose logs

---
 tts_server.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tts_server.py b/tts_server.py
index fdcaa22..a9f3367 100644
--- a/tts_server.py
+++ b/tts_server.py
@@ -947,7 +947,6 @@ async def generate_audio(text, voice, language, temperature, repetition_penalty,
 
     # Streaming mode
     if streaming:
-        print_message("Streaming mode enabled", "debug", "TTS")
         response = model_engine.generate_tts(
             text, voice, language, temperature, repetition_penalty, speed, pitch, output_file=None, streaming=True
         )
@@ -963,7 +962,6 @@ async def stream_audio():
         return stream_audio()
 
     # Non-streaming mode
-    print_message("Non-streaming mode enabled", "debug", "TTS")
    response = model_engine.generate_tts(
        text, voice, language, temperature, repetition_penalty, speed, pitch, output_file, streaming=False
    )
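
Reviewer note (not part of the patch series): below is a minimal client sketch
showing one way to consume the streaming path that PATCH 6/7 adds to
openai_tts_generate. The route ("/v1/audio/speech") follows the OpenAI Speech
API convention and the port (7851) is a guess at a typical local setup; neither
appears in these patches, so adjust both to match your server.

    # Hypothetical client sketch; endpoint path and port are assumptions.
    import requests

    payload = {
        "input": "Bonjour tout le monde.",  # text to synthesise
        "voice": "alloy",                   # mapped to an engine voice via the openai_* settings
        "speed": 1.0,
    }

    # When streaming_enabled is set on the loaded engine, openai_tts_generate
    # returns a chunked audio/wav StreamingResponse; otherwise it returns a
    # FileResponse, so this download loop works in both cases.
    with requests.post("http://127.0.0.1:7851/v1/audio/speech",
                       json=payload, stream=True, timeout=120) as response:
        response.raise_for_status()
        with open("output.wav", "wb") as wav_file:
            for chunk in response.iter_content(chunk_size=4096):
                if chunk:
                    wav_file.write(chunk)

One design consequence visible in the diff: the streaming branch returns raw
WAV chunks directly and skips the RVC and transcode_for_openai steps, which
only run in the non-streaming branch, so response_format is effectively
ignored while streaming.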