latest versions of tts libs, some new parameters

KoljaB · Jul 21, 2024 · 6c54a14 · 6c54a14
1 parent a148e58
commit 6c54a14
Show file tree

Hide file tree

Showing 4 changed files with 45 additions and 10 deletions.
diff --git a/README.md b/README.md
@@ -397,6 +397,14 @@ These methods are responsible for executing the text-to-audio synthesis and play
 - **Default**: `True`
 - **Description**: When set to `True`, the method will prioritize speed, generating and playing sentence fragments faster. This is useful for applications where latency matters.
 
+###### `fast_sentence_fragment_allsentences` (bool)
+- **Default**: `False` for `play`, `True` for `play_async`
+- **Description**: When set to `True`, applies the fast sentence fragment processing to all sentences, not just the first one.
+
+###### `fast_sentence_fragment_allsentences_multiple` (bool)
+- **Default**: `False`
+- **Description**: When set to `True`, allows yielding multiple sentence fragments instead of just a single one.
+
 ###### `buffer_threshold_seconds` (float)
 - **Default**: `0.0`
 - **Description**: Specifies the time in seconds for the buffering threshold, which impacts the smoothness and continuity of audio playback. 
@@ -453,6 +461,10 @@ These methods are responsible for executing the text-to-audio synthesis and play
 - **Default**: `12`
 - **Description**: The number of characters used to establish context for sentence boundary detection. A larger context improves the accuracy of detecting sentence boundaries.
 
+###### `context_size_look_overhead` (int)
+- **Default**: `12`
+- **Description**: Additional context size for looking ahead when detecting sentence boundaries.
+
 ###### `muted` (bool)
 - **Default**: `False`
 - **Description**: If True, disables audio playback via local speakers. Useful when you want to synthesize to a file or process audio chunks without playing them.
@@ -465,8 +477,6 @@ These methods are responsible for executing the text-to-audio synthesis and play
 - **Default**: `15`
 - **Description**: The number of words after which the first sentence fragment is forced to be yielded.
 
-By understanding and setting these parameters and methods appropriately, you can tailor the `TextToAudioStream` to meet the specific needs of your application.
-
 ### CUDA installation
 
 These steps are recommended for those who require **better performance** and have a compatible NVIDIA GPU.

diff --git a/RealtimeTTS/text_to_stream.py b/RealtimeTTS/text_to_stream.py
@@ -160,6 +160,8 @@ def feed(self,
 
     def play_async(self,   
                    fast_sentence_fragment: bool = True,
+                   fast_sentence_fragment_allsentences: bool = True,
+                   fast_sentence_fragment_allsentences_multiple: bool = False,
                    buffer_threshold_seconds: float = 0.0,
                    minimum_sentence_length: int = 10, 
                    minimum_first_fragment_length: int = 10,
@@ -173,6 +175,7 @@ def play_async(self,
                    tokenize_sentences=None,
                    language: str = "",
                    context_size: int = 12,
+                   context_size_look_overhead: int = 12,
                    muted: bool = False,
                    sentence_fragment_delimiters: str = ".?!;:,\n…)]}。-",
                    force_first_fragment_after_words=15,
@@ -183,10 +186,10 @@ def play_async(self,
         if not self.is_playing_flag:
             self.is_playing_flag = True
             # Pass additional parameter to differentiate external call
-            args = (fast_sentence_fragment, buffer_threshold_seconds, minimum_sentence_length, 
+            args = (fast_sentence_fragment, fast_sentence_fragment_allsentences, fast_sentence_fragment_allsentences_multiple, buffer_threshold_seconds, minimum_sentence_length, 
                     minimum_first_fragment_length, log_synthesized_text, reset_generated_text, 
                     output_wavfile, on_sentence_synthesized, before_sentence_synthesized, on_audio_chunk, tokenizer, tokenize_sentences, 
-                    language, context_size, muted, sentence_fragment_delimiters, 
+                    language, context_size, context_size_look_overhead, muted, sentence_fragment_delimiters, 
                     force_first_fragment_after_words, True)
             self.play_thread = threading.Thread(target=self.play, args=args)
             self.play_thread.start()
@@ -200,6 +203,8 @@ def play_async(self,
     def play(
             self,
             fast_sentence_fragment: bool = True,
+            fast_sentence_fragment_allsentences: bool = False,
+            fast_sentence_fragment_allsentences_multiple: bool = False,
             buffer_threshold_seconds: float = 0.0,
             minimum_sentence_length: int = 10,
             minimum_first_fragment_length: int = 10,
@@ -213,6 +218,7 @@ def play(
             tokenize_sentences=None,
             language: str = "en",
             context_size: int = 12,
+            context_size_look_overhead: int = 12,
             muted: bool = False,
             sentence_fragment_delimiters: str = ".?!;:,\n…)]}。-",
             force_first_fragment_after_words=15,
@@ -225,6 +231,8 @@ def play(
 
         Args:
         - fast_sentence_fragment: Determines if sentence fragments should be quickly yielded. Useful when a faster response is desired even if a sentence isn't complete.
+        - fast_sentence_fragment_allsentences: By default, fast_sentence_fragment is only applied to the first sentence. Set this to True to apply it to every sentence.
+        - fast_sentence_fragment_allsentences_multiple: When True, multiple sentence fragments can be yielded instead of only a single one.
         - buffer_threshold_seconds (float): Time in seconds for the buffering threshold, influencing the flow and continuity of audio playback. Set to 0 to deactivate. Default is 0.
           - How it Works: The system verifies whether there is more audio content in the buffer than the duration defined by buffer_threshold_seconds. If so, it proceeds to synthesize the next sentence, capitalizing on the remaining audio to maintain smooth delivery. A higher value means more audio is pre-buffered, which minimizes pauses during playback. Adjust this upwards if you encounter interruptions.
           - Helps to decide when to generate more audio based on buffered content.
@@ -325,7 +333,24 @@ def play(
                 self.player.on_audio_chunk = self._on_audio_chunk
 
                 # Generate sentences from the characters
-                generate_sentences = s2s.generate_sentences(self.thread_safe_char_iter, context_size=context_size, minimum_sentence_length=minimum_sentence_length, minimum_first_fragment_length=minimum_first_fragment_length, quick_yield_single_sentence_fragment=fast_sentence_fragment, cleanup_text_links=True, cleanup_text_emojis=True, tokenize_sentences=tokenize_sentences, tokenizer=tokenizer, language=language, log_characters=self.log_characters, sentence_fragment_delimiters=sentence_fragment_delimiters, force_first_fragment_after_words=force_first_fragment_after_words)
+                generate_sentences = s2s.generate_sentences(
+                    self.thread_safe_char_iter,
+                    context_size=context_size,
+                    context_size_look_overhead=context_size_look_overhead,
+                    minimum_sentence_length=minimum_sentence_length,
+                    minimum_first_fragment_length=minimum_first_fragment_length,
+                    quick_yield_single_sentence_fragment=fast_sentence_fragment,
+                    quick_yield_for_all_sentences=fast_sentence_fragment_allsentences,
+                    quick_yield_every_fragment=fast_sentence_fragment_allsentences_multiple,
+                    cleanup_text_links=True,
+                    cleanup_text_emojis=True,
+                    tokenize_sentences=tokenize_sentences,
+                    tokenizer=tokenizer,
+                    language=language,
+                    log_characters=self.log_characters,
+                    sentence_fragment_delimiters=sentence_fragment_delimiters,
+                    force_first_fragment_after_words=force_first_fragment_after_words
+                )
 
                 # Create the synthesis chunk generator with the given sentences
                 chunk_generator = self._synthesis_chunk_generator(generate_sentences, buffer_threshold_seconds, log_synthesized_text)

diff --git a/requirements.txt b/requirements.txt
@@ -5,21 +5,21 @@ pyttsx3==2.90
 azure-cognitiveservices-speech==1.38.0
 
 # elevenlabs is for ElevenlabsEngine
-elevenlabs==1.3.1
+elevenlabs==1.5.0
 
 # openai is for OpenAIEngine
-openai==1.35.10
+openai==1.36.1
 
 # gtts is for GTTSEngine
-gtts==2.5.1
+gtts==2.5.2
 
 # coqui_tts is for CoquiEngine
 coqui_tts==0.24.1
 
 
 
 # stream2sentence is to quickly convert streamed text into sentences for real-time synthesis
-stream2sentence==0.2.3
+stream2sentence==0.2.5
 
 # pydub is used to convert chunks from mp3 to pcm (for openai tts)
 pydub==0.25.1

diff --git a/setup.py b/setup.py
@@ -44,7 +44,7 @@ def parse_requirements(filename):
 
 setuptools.setup(
     name="RealTimeTTS",
-    version="0.4.21",
+    version="0.4.5",
     author="Kolja Beigel",
     author_email="[email protected]",
     description="Stream text into audio with an easy-to-use, highly configurable library delivering voice output with minimal latency.",