# gradio_interface.py
"""
Kokoro-TTS Local Generator
-------------------------
A Gradio interface for the Kokoro-TTS-Local text-to-speech system.
Supports multiple voices and audio formats, with cross-platform compatibility.
Key Features:
- Multiple voice models support (26+ voices)
- Real-time generation with progress logging
- WAV, MP3, and AAC output formats
- Network sharing capabilities
- Cross-platform compatibility (Windows, macOS, Linux)
Dependencies:
- kokoro: Official Kokoro TTS library
- gradio: Web interface framework
- soundfile: Audio file handling
- pydub: Audio format conversion
- torch: Device selection and audio tensor handling
- numpy: Audio array handling
"""
import gradio as gr
import os
import sys
import platform
from datetime import datetime
import shutil
from pathlib import Path
import soundfile as sf
from pydub import AudioSegment
import torch
import numpy as np
from models import (
list_available_voices, build_model,
generate_speech
)
# --- Global configuration ---------------------------------------------------
# Stores user preferences and paths.
CONFIG_FILE = "tts_config.json"
# Directory for generated audio files.
DEFAULT_OUTPUT_DIR = "outputs"
# Sample rate used when writing audio; updated from 22050 to match the new model.
SAMPLE_RATE = 24000

# --- Global model state -----------------------------------------------------
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# Shared model instance; built lazily by the functions below on first use.
model = None
def get_available_voices():
    """Return the names of the available voice models.

    Builds the shared global ``model`` on first use and then asks
    ``list_available_voices`` for the voice names. If none are reported, one
    explicit download retry is attempted, provided the ``models`` module
    exposes a ``download_voice_files`` helper.

    Returns:
        Whatever ``list_available_voices`` returns, or an empty list if any
        step raises (the error is printed, not propagated).
    """
    global model
    try:
        # Initialize model to trigger voice downloads
        if model is None:
            print("Initializing model and downloading voices...")
            model = build_model(None, device)

        voices = list_available_voices()
        if not voices:
            print("No voices found after initialization. Attempting to download...")
            # ``download_voice_files`` is not imported at module level, so
            # calling it directly raised NameError and the retry never ran.
            # Resolve it lazily and degrade gracefully if this version of
            # ``models`` does not provide it.
            # NOTE(review): assumes the helper lives in ``models`` - confirm.
            try:
                from models import download_voice_files
            except ImportError:
                print("download_voice_files is not available in models; skipping download retry.")
            else:
                download_voice_files()  # Try downloading again
                voices = list_available_voices()

        print("Available voices:", voices)
        return voices
    except Exception as e:
        # Best-effort: the caller treats an empty list as "no voices found".
        print(f"Error getting voices: {e}")
        return []
def convert_audio(input_path: str, output_path: str, format: str):
    """Convert a WAV file to the requested audio format.

    Args:
        input_path: Path of the source WAV file.
        output_path: Path the converted file is written to.
        format: Target format: "wav", "mp3" or "aac".

    Returns:
        ``output_path`` when a conversion was performed. ``input_path`` when
        ``format`` is "wav" (nothing to convert), when ``format`` is not
        supported, or when the conversion fails (the error is printed).
    """
    if format == "wav":
        return input_path

    # Keyword arguments handed to pydub's ``export`` for each supported format.
    # ffmpeg has no output container named "aac"; raw AAC is written through
    # the "adts" muxer with the "aac" codec.
    export_options = {
        "mp3": {"format": "mp3", "bitrate": "192k"},
        "aac": {"format": "adts", "codec": "aac", "bitrate": "192k"},
    }
    options = export_options.get(format)
    if options is None:
        # An unknown format used to fall through and return an output path
        # that had never been written; hand back the existing WAV instead.
        print(f"Error converting audio: unsupported format '{format}'")
        return input_path

    try:
        audio = AudioSegment.from_wav(input_path)
        # ``export`` returns the open output file; close it so the handle is
        # not left dangling until garbage collection.
        exported = audio.export(output_path, **options)
        exported.close()
        return output_path
    except Exception as e:
        # Best-effort: fall back to the WAV file rather than failing the request.
        print(f"Error converting audio: {e}")
        return input_path
def generate_tts_with_logs(voice_name, text, format):
    """Generate speech for ``text`` and return the path of the audio file.

    Progress (segments and phonemes) is printed while the model runs.

    Args:
        voice_name: Name of the voice; resolved to ``voices/<voice_name>.pt``.
        text: Text to synthesize. It is passed to the model with a split
            pattern of one or more newlines.
        format: "wav" to return the WAV file directly, otherwise the target
            format handed to ``convert_audio``.

    Returns:
        Path of the generated file, or ``None`` when the input is empty or
        generation fails (the error and traceback are printed).
    """
    global model

    # Reject unusable input up front instead of building the model and
    # running it only to end with "No audio generated".
    if not text or not text.strip():
        print("Error generating speech: no text provided")
        return None
    if not voice_name:
        print("Error generating speech: no voice selected")
        return None

    try:
        # Initialize model if needed
        if model is None:
            print("Initializing model...")
            model = build_model(None, device)

        # Create output directory
        os.makedirs(DEFAULT_OUTPUT_DIR, exist_ok=True)

        # Build the file name from a timestamp. Microseconds are included so
        # that two requests within the same second do not overwrite each other.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        base_name = f"tts_{timestamp}"
        wav_path = os.path.join(DEFAULT_OUTPUT_DIR, f"{base_name}.wav")

        # Generate speech
        print(f"\nGenerating speech for: '{text}'")
        print(f"Using voice: {voice_name}")
        generator = model(text, voice=f"voices/{voice_name}.pt", speed=1.0, split_pattern=r'\n+')

        all_audio = []
        for gs, ps, audio in generator:
            if audio is None:
                continue
            if isinstance(audio, np.ndarray):
                audio = torch.from_numpy(audio).float()
            # Collect every segment on the CPU: ``torch.cat`` needs all
            # tensors on one device and ``.numpy()`` only works on CPU tensors.
            all_audio.append(audio.detach().cpu())
            print(f"Generated segment: {gs}")
            print(f"Phonemes: {ps}")

        if not all_audio:
            raise RuntimeError("No audio generated")

        # Combine audio segments and save
        final_audio = torch.cat(all_audio, dim=0)
        sf.write(wav_path, final_audio.numpy(), SAMPLE_RATE)

        # Convert to requested format if needed
        if format != "wav":
            output_path = os.path.join(DEFAULT_OUTPUT_DIR, f"{base_name}.{format}")
            return convert_audio(wav_path, output_path, format)
        return wav_path
    except Exception as e:
        # Boundary handler for the UI callback: report and return "no audio".
        print(f"Error generating speech: {e}")
        import traceback
        traceback.print_exc()
        return None
def create_interface(server_name="0.0.0.0", server_port=7860, share=True):
    """Build the Gradio interface and launch it.

    Args:
        server_name: Host name passed to ``launch``.
        server_port: Port passed to ``launch``.
        share: Passed to ``launch`` as ``share``. Defaults to ``True``, the
            value that used to be hard-coded; pass ``False`` to launch
            without requesting a Gradio share link.

    Returns:
        None. Returns early, without launching, when no voices are found.
    """
    # Get available voices
    voices = get_available_voices()
    if not voices:
        print("No voices found! Please check the voices directory.")
        return

    # Create interface
    with gr.Blocks(title="Kokoro TTS Generator") as interface:
        gr.Markdown("# Kokoro TTS Generator")

        with gr.Row():
            with gr.Column():
                voice = gr.Dropdown(
                    choices=voices,
                    value=voices[0],  # non-empty: guarded above
                    label="Voice",
                )
                text = gr.Textbox(
                    lines=3,
                    placeholder="Enter text to convert to speech...",
                    label="Text",
                )
                # Named ``audio_format`` so the builtin ``format`` is not shadowed.
                audio_format = gr.Radio(
                    choices=["wav", "mp3", "aac"],
                    value="wav",
                    label="Output Format",
                )
                generate = gr.Button("Generate Speech")

            with gr.Column():
                output = gr.Audio(label="Generated Audio")

        generate.click(
            fn=generate_tts_with_logs,
            inputs=[voice, text, audio_format],
            outputs=output,
        )

    # Launch interface
    interface.launch(
        server_name=server_name,
        server_port=server_port,
        share=share,
    )
if __name__ == "__main__":
    # Executed as a script: build and launch the web UI with the default settings.
    create_interface()