forked from p0n1/epub_to_audiobook
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
207 lines (182 loc) · 7.88 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
import argparse
import logging
from audiobook_generator.config.general_config import GeneralConfig
from audiobook_generator.core.audiobook_generator import AudiobookGenerator
from audiobook_generator.tts_providers.base_tts_provider import (
get_supported_tts_providers,
)
def handle_args():
    """Parse the command line and wrap the result in a ``GeneralConfig``.

    Declares the two positional arguments (input file and output folder),
    the provider-independent options, and one option group each for the
    edge, azure/edge, piper, chattts and cosyvoice settings, then parses
    the process arguments.

    Returns:
        GeneralConfig: constructed from the parsed ``argparse.Namespace``.
    """
    cli = argparse.ArgumentParser(description="Convert text book to audiobook")

    # --- Positional arguments -------------------------------------------
    cli.add_argument("input_file", help="Path to the EPUB file")
    cli.add_argument("output_folder", help="Path to the output folder")

    # --- General options ------------------------------------------------
    cli.add_argument(
        "--tts",
        choices=get_supported_tts_providers(),
        # The first provider in the supported list is the default.
        default=get_supported_tts_providers()[0],
        help="Choose TTS provider (default: azure). azure: Azure Cognitive Services, openai: OpenAI TTS API. When using azure, environment variables MS_TTS_KEY and MS_TTS_REGION must be set. When using openai, environment variable OPENAI_API_KEY must be set.",
    )

    cli.add_argument(
        "--log",
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        help="Log level (default: INFO), can be DEBUG, INFO, WARNING, ERROR, CRITICAL",
    )

    cli.add_argument(
        "--preview",
        action="store_true",
        help="Enable preview mode. In preview mode, the script will not convert the text to speech. Instead, it will print the chapter index, titles, and character counts.",
    )

    cli.add_argument(
        "--no_prompt",
        action="store_true",
        help="Don't ask the user if they wish to continue after estimating the cloud cost for TTS. Useful for scripting.",
    )

    cli.add_argument(
        "--language",
        default="en-US",
        help="Language for the text-to-speech service (default: en-US). For Azure TTS (--tts=azure), check https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts#text-to-speech for supported languages. For OpenAI TTS (--tts=openai), their API detects the language automatically. But setting this will also help on splitting the text into chunks with different strategies in this tool, especially for Chinese characters. For Chinese books, use zh-CN, zh-TW, or zh-HK.",
    )

    cli.add_argument(
        "--newline_mode",
        default="double",
        choices=["single", "double", "none"],
        help="Choose the mode of detecting new paragraphs: 'single', 'double', or 'none'. 'single' means a single newline character, while 'double' means two consecutive newline characters. 'none' means all newline characters will be replace with blank so paragraphs will not be detected. (default: double, works for most ebooks but will detect less paragraphs for some ebooks)",
    )

    cli.add_argument(
        "--title_mode",
        default="auto",
        choices=["auto", "tag_text", "first_few"],
        help="Choose the parse mode for chapter title, 'tag_text' search 'title','h1','h2','h3' tag for title, 'first_few' set first 60 characters as title, 'auto' auto apply the best mode for current chapter.",
    )

    # Chapter range: indices are 1-based; -1 as the end means "to the last chapter".
    cli.add_argument(
        "--chapter_start",
        type=int,
        default=1,
        help="Chapter start index (default: 1, starting from 1)",
    )

    cli.add_argument(
        "--chapter_end",
        type=int,
        default=-1,
        help="Chapter end index (default: -1, meaning to the last chapter)",
    )

    cli.add_argument(
        "--output_text",
        action="store_true",
        help="Enable Output Text. This will export a plain text file for each chapter specified and write the files to the output folder specified.",
    )

    cli.add_argument(
        "--remove_endnotes",
        action="store_true",
        help="This will remove endnote numbers from the end or middle of sentences. This is useful for academic books.",
    )

    cli.add_argument(
        "--search_and_replace_file",
        default="",
        help="""Path to a file that contains 1 regex replace per line, to help with fixing pronunciations, etc. The format is:
<search>==<replace>
Note that you may have to specify word boundaries, to avoid replacing parts of words.
""",
    )

    cli.add_argument(
        "--voice_name",
        help="Various TTS providers has different voice names, look up for your provider settings.",
    )

    cli.add_argument(
        "--output_format",
        help="Output format for the text-to-speech service. Supported format depends on selected TTS provider",
    )

    cli.add_argument(
        "--model_name",
        help="Various TTS providers has different neural model names",
    )

    # --- edge specific --------------------------------------------------
    # "%%" in the help texts below is argparse's escape for a literal "%".
    edge_opts = cli.add_argument_group("edge specific")
    edge_opts.add_argument(
        "--voice_rate",
        help="""
Speaking rate of the text. Valid relative values range from -50%%(--xxx='-50%%') to +100%%.
For negative value use format --arg=value,
""",
    )

    edge_opts.add_argument(
        "--voice_volume",
        help="""
Volume level of the speaking voice. Valid relative values floor to -100%%.
For negative value use format --arg=value,
""",
    )

    edge_opts.add_argument(
        "--voice_pitch",
        help="""
Baseline pitch for the text.Valid relative values like -80Hz,+50Hz, pitch changes should be within 0.5 to 1.5 times the original audio.
For negative value use format --arg=value,
""",
    )

    edge_opts.add_argument(
        "--proxy",
        help="Proxy server for the TTS provider. Format: http://[username:password@]proxy.server:port",
    )

    # --- azure/edge specific --------------------------------------------
    azure_edge_opts = cli.add_argument_group("azure/edge specific")
    azure_edge_opts.add_argument(
        "--break_duration",
        default="1250",
        help="Break duration in milliseconds for the different paragraphs or sections (default: 1250, means 1.25 s). Valid values range from 0 to 5000 milliseconds for Azure TTS.",
    )

    # --- piper specific -------------------------------------------------
    # No `type=` is given for these options, so values supplied on the
    # command line arrive as strings while the defaults stay numeric.
    piper_opts = cli.add_argument_group("piper specific")
    piper_opts.add_argument(
        "--piper_path",
        default="piper",
        help="Path to the Piper TTS executable",
    )
    piper_opts.add_argument(
        "--piper_speaker",
        default=0,
        help="Piper speaker id, used for multi-speaker models",
    )
    piper_opts.add_argument(
        "--piper_sentence_silence",
        default=0.2,
        help="Seconds of silence after each sentence",
    )
    piper_opts.add_argument(
        "--piper_length_scale",
        default=1.0,
        help="Phoneme length, a.k.a. speaking rate",
    )

    # --- chattts specific -----------------------------------------------
    chattts_opts = cli.add_argument_group("chattts specific")
    chattts_opts.add_argument(
        "--chattts_url",
        default="http://127.0.0.1:9966",
        help="URL for the ChatTTS-ui server (default: http://127.0.0.1:9966)",
    )

    # --- cosyvoice specific ---------------------------------------------
    cosyvoice_opts = cli.add_argument_group("cosyvoice specific")
    cosyvoice_opts.add_argument(
        "--cosyvoice_url",
        default="http://localhost:9880",
        help="URL for the CosyVoice server (default: http://localhost:9880)",
    )
    cosyvoice_opts.add_argument(
        "--cosyvoice_speaker",
        default="jok老师",
        help="Speaker name for CosyVoice (default: jok老师)",
    )

    return GeneralConfig(cli.parse_args())
def setup_logging(log_level):
    """Configure the root logger to print formatted records to the console.

    The root logger's level is set to ``log_level`` on every call. A console
    ``StreamHandler`` is attached the first time only; later calls reuse it,
    so calling this function more than once does not make each record print
    several times.

    Args:
        log_level: Level passed to ``Logger.setLevel`` (a level name such as
            ``"INFO"`` or a numeric level).

    Raises:
        ValueError: Propagated from ``Logger.setLevel`` for an unknown
            level name.
    """
    # Attribute used to recognise the handler this function installed.
    marker = "_epub_to_audiobook_console"

    # Configure the root logger
    root_logger = logging.getLogger()
    root_logger.setLevel(log_level)

    # A previous call already attached our console handler: adding another
    # one would duplicate every log line.
    if any(getattr(handler, marker, False) for handler in root_logger.handlers):
        return

    # Create a custom formatter
    formatter = logging.Formatter(
        "%(asctime)s - %(filename)s:%(lineno)d - %(funcName)s - %(levelname)s - %(message)s"
    )

    # Create a stream handler (prints to console)
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    setattr(console_handler, marker, True)

    root_logger.addHandler(console_handler)
def main():
    """Parse the command line, set up logging, and run the audiobook generator."""
    settings = handle_args()
    # The log level comes from the parsed configuration.
    setup_logging(settings.log)
    generator = AudiobookGenerator(settings)
    generator.run()
# Run the converter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()