main.py
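"""Convert a .mobi ebook into a per-chapter audiobook.

The script extracts chapter text from the book, loads Coqui's XTTS v2
model, clones a voice from reference audio, synthesizes each sentence,
and merges the sentence clips into one WAV file per chapter.
"""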
import re
from pathlib import Path

import mobi
import torch
import torchaudio
from bs4 import BeautifulSoup
from pysbd.segmenter import Segmenter
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.manage import ModelManager

segmenter = Segmenter(clean=True)
def get_chapters(filename: str) -> list[list[str]]:
    """Extract the text of a .mobi book and split it into chapters,
    each chapter being a list of sentences."""
    _tempdir, filepath = mobi.extract(filename)
    with open(filepath, 'r', encoding='utf-8') as file:
        content = file.read()
    soup = BeautifulSoup(content, 'html.parser')

    # Flatten the HTML: <p> contents become space-separated runs of text,
    # <br> tags become newlines.
    text = ''
    for tag in soup.find_all(['p', 'br']):
        if tag.name == 'p':
            text += f'{tag.text} '
        elif tag.name == 'br':
            text += '\n '

    # Keep only the text before the "Table of Contents" marker, then split
    # it on "Chapter N" headings; [1:] drops the front matter before the
    # first chapter.
    r = re.compile(r'Table of Contents', re.IGNORECASE)
    text = r.split(text)[0]
    chapter_regex = re.compile(r'Chapter \d+', re.IGNORECASE)
    chapters = chapter_regex.split(text)[1:]
    return [segmenter.segment(chapter) for chapter in chapters]
def init_model():
    """Download (if needed) and load the XTTS v2 model, moving it to the
    GPU when one is available."""
    manager = ModelManager()
    print('Loading model:')
    model_name = 'tts_models/multilingual/multi-dataset/xtts_v2'
    xtts_path, _, _ = manager.download_model(model_name)
    config = XttsConfig()
    config.load_json(xtts_path + '/config.json')
    model = Xtts.init_from_config(config)
    # DeepSpeed inference is only enabled alongside CUDA (it requires the
    # deepspeed package to be installed).
    model.load_checkpoint(config,
                          checkpoint_dir=xtts_path,
                          use_deepspeed=torch.cuda.is_available())
    if torch.cuda.is_available():
        model.cuda()
    return model
def merge_wav_files(wav_files, output_path):
    """Concatenate the given WAV files into a single file. Assumes all
    inputs share one sample rate (XTTS output is always 24 kHz)."""
    waveforms = []
    sample_rate = 0
    for wav_file in wav_files:
        waveform, sample_rate = torchaudio.load(wav_file)
        waveforms.append(waveform)
    combined_waveform = torch.cat(waveforms, dim=1)
    torchaudio.save(output_path, combined_waveform, sample_rate)
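# merge_wav_files assumes every clip shares one sample rate, which holds here
# because XTTS always emits 24 kHz audio. If clips from mixed sources ever
# needed merging, a variant like this sketch could resample first
# (merge_wav_files_resampled and target_rate are illustrative names, not part
# of the original script):
def merge_wav_files_resampled(wav_files, output_path, target_rate=24000):
    waveforms = []
    for wav_file in wav_files:
        waveform, rate = torchaudio.load(wav_file)
        if rate != target_rate:
            # torchaudio.transforms.Resample(orig_freq, new_freq) returns a
            # callable module that resamples the waveform.
            waveform = torchaudio.transforms.Resample(rate, target_rate)(waveform)
        waveforms.append(waveform)
    torchaudio.save(output_path, torch.cat(waveforms, dim=1), target_rate)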
def synthesize_chapter(chapter, chapter_num, gpt_cond_latent, speaker_embedding, model) -> None:
    sentence_files = []
    for i, sentence in enumerate(chapter):
        print(f'Chapter {chapter_num + 1}, sentence {i + 1}/{len(chapter)} ({len(sentence)} chars)')
        out = model.inference(
            sentence,
            'en',
            gpt_cond_latent,
            speaker_embedding,
            temperature=0.7,
            top_k=75,
            top_p=0.9
        )
        # XTTS returns a raw 24 kHz waveform; add a channel dimension
        # before saving.
        wav_filepath = f'audio/temp/c{chapter_num + 1}_p{i + 1}.wav'
        Path(wav_filepath).parent.mkdir(parents=True, exist_ok=True)
        torchaudio.save(wav_filepath,
                        torch.tensor(out['wav']).unsqueeze(0),
                        24000)
        sentence_files.append(wav_filepath)
    res_filepath = f'audio/result/chapter_{chapter_num + 1}.wav'
    Path(res_filepath).parent.mkdir(parents=True, exist_ok=True)
    merge_wav_files(sentence_files, res_filepath)
    print(f'Finished chapter {chapter_num + 1}')
def main(book_file: str, ref_files: list[str]) -> None:
    chapters = get_chapters(book_file)
    model = init_model()
    print('Computing speaker latents...')
    # Voice cloning: derive the conditioning latents and speaker embedding
    # from the reference recording(s).
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
        audio_path=ref_files
    )
    for index, chapter in enumerate(chapters):
        synthesize_chapter(chapter,
                           index,
                           gpt_cond_latent,
                           speaker_embedding,
                           model)
    print('DONE!')
if __name__ == '__main__':
    main("Harry Potter and the Sorcerer's Stone.mobi",
         ['audio/reference/yegor_en.wav'])
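# Assumed environment (not pinned by the original file): roughly
#   pip install mobi beautifulsoup4 pysbd TTS torch torchaudio
# plus deepspeed when running on CUDA, and a short clean reference recording
# of the target voice under audio/reference/.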