Skip to content

Commit

Permalink
latest versions of tts libs, some new parameters
Browse files Browse the repository at this point in the history
  • Loading branch information
KoljaB committed Jul 21, 2024
1 parent a148e58 commit 6c54a14
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 10 deletions.
14 changes: 12 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,14 @@ These methods are responsible for executing the text-to-audio synthesis and play
- **Default**: `True`
- **Description**: When set to `True`, the method will prioritize speed, generating and playing sentence fragments faster. This is useful for applications where latency matters.
###### `fast_sentence_fragment_allsentences` (bool)
- **Default**: `False`
- **Description**: When set to `True`, applies the fast sentence fragment processing to all sentences, not just the first one.
###### `fast_sentence_fragment_allsentences_multiple` (bool)
- **Default**: `False`
- **Description**: When set to `True`, allows yielding multiple sentence fragments instead of just a single one.
###### `buffer_threshold_seconds` (float)
- **Default**: `0.0`
- **Description**: Specifies the time in seconds for the buffering threshold, which impacts the smoothness and continuity of audio playback.
Expand Down Expand Up @@ -453,6 +461,10 @@ These methods are responsible for executing the text-to-audio synthesis and play
- **Default**: `12`
- **Description**: The number of characters used to establish context for sentence boundary detection. A larger context improves the accuracy of detecting sentence boundaries.

###### `context_size_look_overhead` (int)
- **Default**: `12`
- **Description**: Additional context size for looking ahead when detecting sentence boundaries.

###### `muted` (bool)
- **Default**: `False`
- **Description**: If True, disables audio playback via local speakers. Useful when you want to synthesize to a file or process audio chunks without playing them.
Expand All @@ -465,8 +477,6 @@ These methods are responsible for executing the text-to-audio synthesis and play
- **Default**: `15`
- **Description**: The number of words after which the first sentence fragment is forced to be yielded.

By understanding and setting these parameters and methods appropriately, you can tailor the `TextToAudioStream` to meet the specific needs of your application.

### CUDA installation

These steps are recommended for those who require **better performance** and have a compatible NVIDIA GPU.
Expand Down
31 changes: 28 additions & 3 deletions RealtimeTTS/text_to_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,8 @@ def feed(self,

def play_async(self,
fast_sentence_fragment: bool = True,
fast_sentence_fragment_allsentences: bool = True,
fast_sentence_fragment_allsentences_multiple: bool = False,
buffer_threshold_seconds: float = 0.0,
minimum_sentence_length: int = 10,
minimum_first_fragment_length: int = 10,
Expand All @@ -173,6 +175,7 @@ def play_async(self,
tokenize_sentences=None,
language: str = "",
context_size: int = 12,
context_size_look_overhead: int = 12,
muted: bool = False,
sentence_fragment_delimiters: str = ".?!;:,\n…)]}。-",
force_first_fragment_after_words=15,
Expand All @@ -183,10 +186,10 @@ def play_async(self,
if not self.is_playing_flag:
self.is_playing_flag = True
# Pass additional parameter to differentiate external call
args = (fast_sentence_fragment, buffer_threshold_seconds, minimum_sentence_length,
args = (fast_sentence_fragment, fast_sentence_fragment_allsentences, fast_sentence_fragment_allsentences_multiple, buffer_threshold_seconds, minimum_sentence_length,
minimum_first_fragment_length, log_synthesized_text, reset_generated_text,
output_wavfile, on_sentence_synthesized, before_sentence_synthesized, on_audio_chunk, tokenizer, tokenize_sentences,
language, context_size, muted, sentence_fragment_delimiters,
language, context_size, context_size_look_overhead, muted, sentence_fragment_delimiters,
force_first_fragment_after_words, True)
self.play_thread = threading.Thread(target=self.play, args=args)
self.play_thread.start()
Expand All @@ -200,6 +203,8 @@ def play_async(self,
def play(
self,
fast_sentence_fragment: bool = True,
fast_sentence_fragment_allsentences: bool = False,
fast_sentence_fragment_allsentences_multiple: bool = False,
buffer_threshold_seconds: float = 0.0,
minimum_sentence_length: int = 10,
minimum_first_fragment_length: int = 10,
Expand All @@ -213,6 +218,7 @@ def play(
tokenize_sentences=None,
language: str = "en",
context_size: int = 12,
context_size_look_overhead: int = 12,
muted: bool = False,
sentence_fragment_delimiters: str = ".?!;:,\n…)]}。-",
force_first_fragment_after_words=15,
Expand All @@ -225,6 +231,8 @@ def play(
Args:
- fast_sentence_fragment: Determines if sentence fragments should be quickly yielded. Useful when a faster response is desired even if a sentence isn't complete.
- fast_sentence_fragment_allsentences: By default, fast_sentence_fragment only applies to the first sentence. Set this to True to apply it to every sentence.
- fast_sentence_fragment_allsentences_multiple: When set to True, multiple sentence fragments can be yielded instead of only a single one.
- buffer_threshold_seconds (float): Time in seconds for the buffering threshold, influencing the flow and continuity of audio playback. Set to 0 to deactivate. Default is 0.
- How it Works: The system verifies whether there is more audio content in the buffer than the duration defined by buffer_threshold_seconds. If so, it proceeds to synthesize the next sentence, capitalizing on the remaining audio to maintain smooth delivery. A higher value means more audio is pre-buffered, which minimizes pauses during playback. Adjust this upwards if you encounter interruptions.
- Helps to decide when to generate more audio based on buffered content.
Expand Down Expand Up @@ -325,7 +333,24 @@ def play(
self.player.on_audio_chunk = self._on_audio_chunk

# Generate sentences from the characters
generate_sentences = s2s.generate_sentences(self.thread_safe_char_iter, context_size=context_size, minimum_sentence_length=minimum_sentence_length, minimum_first_fragment_length=minimum_first_fragment_length, quick_yield_single_sentence_fragment=fast_sentence_fragment, cleanup_text_links=True, cleanup_text_emojis=True, tokenize_sentences=tokenize_sentences, tokenizer=tokenizer, language=language, log_characters=self.log_characters, sentence_fragment_delimiters=sentence_fragment_delimiters, force_first_fragment_after_words=force_first_fragment_after_words)
generate_sentences = s2s.generate_sentences(
self.thread_safe_char_iter,
context_size=context_size,
context_size_look_overhead=context_size_look_overhead,
minimum_sentence_length=minimum_sentence_length,
minimum_first_fragment_length=minimum_first_fragment_length,
quick_yield_single_sentence_fragment=fast_sentence_fragment,
quick_yield_for_all_sentences=fast_sentence_fragment_allsentences,
quick_yield_every_fragment=fast_sentence_fragment_allsentences_multiple,
cleanup_text_links=True,
cleanup_text_emojis=True,
tokenize_sentences=tokenize_sentences,
tokenizer=tokenizer,
language=language,
log_characters=self.log_characters,
sentence_fragment_delimiters=sentence_fragment_delimiters,
force_first_fragment_after_words=force_first_fragment_after_words
)

# Create the synthesis chunk generator with the given sentences
chunk_generator = self._synthesis_chunk_generator(generate_sentences, buffer_threshold_seconds, log_synthesized_text)
Expand Down
8 changes: 4 additions & 4 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,21 @@ pyttsx3==2.90
azure-cognitiveservices-speech==1.38.0

# elevenlabs is for ElevenlabsEngine
elevenlabs==1.3.1
elevenlabs==1.5.0

# openai is for OpenAIEngine
openai==1.35.10
openai==1.36.1

# gtts is for GTTSEngine
gtts==2.5.1
gtts==2.5.2

# coqui_tts is for CoquiEngine
coqui_tts==0.24.1



# stream2sentence is to quickly convert streamed text into sentences for real-time synthesis
stream2sentence==0.2.3
stream2sentence==0.2.5

# pydub is used to convert chunks from mp3 to pcm (for openai tts)
pydub==0.25.1
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def parse_requirements(filename):

setuptools.setup(
name="RealTimeTTS",
version="0.4.21",
version="0.4.5",
author="Kolja Beigel",
author_email="[email protected]",
description="Stream text into audio with an easy-to-use, highly configurable library delivering voice output with minimal latency.",
Expand Down

0 comments on commit 6c54a14

Please sign in to comment.