From cb8fba703df4a731f941687387d8b2842025d128 Mon Sep 17 00:00:00 2001
From: Jarod Mica
Date: Sun, 9 Jun 2024 22:44:59 -0700
Subject: [PATCH] update

---
 .gitignore                                 |   3 +
 README.md                                  |   5 +-
 audio_book_app_2_0.py => audio_book_app.py |   9 +-
 changelog.md                               |   4 +
 text_test1.txt                             |  63 +------
 tortoise_api.py                            | 208 ---------------------
 6 files changed, 18 insertions(+), 274 deletions(-)
 rename audio_book_app_2_0.py => audio_book_app.py (99%)
 delete mode 100644 tortoise_api.py

diff --git a/.gitignore b/.gitignore
index 4046235..416d783 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,4 +12,7 @@
 __pycache__/
 venv/
 audiobooks/
 output/
+.vscode/
+tortoise_api/
+
diff --git a/README.md b/README.md
index 387386e..f304e99 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,7 @@ There are two ways to install this, via Package or Manually. If you don't have
 - [ ] Highlight sentences for generation later (will need to do some type of edit to the json structure so that even if you close out, they are still highlighted)
 - [ ] Find a way to do "multiple speakers" for dialogue in the book (might involve a new tab where users can select sentences to regenerate)
 - [ ] Auto sentence regeneration and comparison using whisper (https://github.com/maxbachmann/RapidFuzz/)
+ - [ ] Add a toggleable option for using rvc conversion

 ## Prerequisites:

@@ -83,7 +84,7 @@ venv\Scripts\activate
 ```

 4. Install pytorch using command below (recommended) or get from https://pytorch.org/get-started/locally/:

-```pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117```
+```pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121```

 5. Install requirements:
@@ -95,6 +96,8 @@ venv\Scripts\activate

 ```pip install git+https://github.com/JarodMica/rvc-tts-pipeline.git@lightweight#egg=rvc_tts_pipe```

+```pip install git+https://github.com/JarodMica/tortoise_api.git```
+
 6. Download and install ffmpeg: https://ffmpeg.org/download.html
 - Place ffmpeg.exe and ffprobe.exe inside of audiobook_maker OR make sure they are in your environment path variable

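Steps 4 and 5 above pull in the CUDA 12.1 PyTorch wheels plus the rvc_tts_pipe and tortoise_api packages. A quick, illustrative sanity check (not part of the patch; it only exercises names taken from the commands and imports in this commit):

```
# Illustrative sanity check for the install steps above (not part of this patch).
import torch
from tortoise_api.tortoise_api import load_sentences, load_config, call_api  # JarodMica/tortoise_api
from rvc_pipe.rvc_infer import rvc_convert                                    # rvc-tts-pipeline

# These imports only resolve if the two git-based pip installs above succeeded.
print("CUDA available:", torch.cuda.is_available())  # should be True with the cu121 wheel
```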
diff --git a/audio_book_app_2_0.py b/audio_book_app.py
similarity index 99%
rename from audio_book_app_2_0.py
rename to audio_book_app.py
index fbf7ffc..aefa42a 100644
--- a/audio_book_app_2_0.py
+++ b/audio_book_app.py
@@ -30,8 +30,8 @@
 script_directory = os.path.dirname(os.path.realpath(__file__))
 sys.path.append(script_directory)

-from tortoise_api import Tortoise_API
-from tortoise_api import load_sentences
+from tortoise_api.tortoise_api import load_sentences, load_config, call_api
+
 from rvc_pipe.rvc_infer import rvc_convert

 class AudioGenerationWorker(QThread):
@@ -72,7 +72,6 @@ def __init__(self):

         self.init_ui()

-        self.tortoise = Tortoise_API()

     def init_ui(self):
         # Main Layout
@@ -785,7 +784,9 @@ def generate_audio_for_sentence_threaded(self, directory_path, progress_callback
             progress_callback(progress_percentage)\

     def generate_audio(self, sentence):
-        audio_path = self.tortoise.call_api(sentence)
+        tort_setup = os.path.join(script_dir, "tort.yaml")
+        parameters = load_config(tort_setup)
+        audio_path = call_api(sentence, **parameters)
         selected_voice = self.voice_models_combo.currentText()
         selected_index = self.voice_index_combo.currentText()
         voice_model_path = os.path.join(self.voice_folder_path, selected_voice)
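The generate_audio hunk above replaces the removed in-repo Tortoise_API class with the external tortoise_api package: generation settings are loaded from tort.yaml and passed straight into call_api. A minimal sketch of that call pattern (the sample sentence and the assumption that tort.yaml sits next to the script are illustrative):

```
# Sketch of the new call pattern introduced above; sentence and paths are illustrative.
import os
from tortoise_api.tortoise_api import load_config, call_api

script_directory = os.path.dirname(os.path.realpath(__file__))
parameters = load_config(os.path.join(script_directory, "tort.yaml"))  # dict of generation settings
audio_path = call_api("This is a simple test.", **parameters)          # path of the generated audio
print(audio_path)
```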
diff --git a/changelog.md b/changelog.md
index 7330e9b..928fe85 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,5 +1,9 @@
 # Changelog & thoughts

+# 6/9/2024
+Implemented a bug fix for the Tortoise TTS API call; lots of things in the pipeline still need a little refreshing.
+- The package version is not done yet.
+
 # 10/17/2023
 Bug fixes for next patch
 - Fixed hardcoded path in lightweight rvc package under configs.py for nvidia cards under 4GB
diff --git a/text_test1.txt b/text_test1.txt
index 66579c4..a8f4a5c 100644
--- a/text_test1.txt
+++ b/text_test1.txt
@@ -1,61 +1,2 @@
----- Test 1 ----
-This is a simple test. It should work without any issues.
--- Expected Output --
-["This is a simple test.", "It should work without any issues."]
-
----- Test 2 ----
-Although I went to the store, I forgot to buy milk. Next time, I’ll make a list.
--- Expected Output --
-["Although I went to the store, I forgot to buy milk.", "Next time, I’ll make a list."]
-
----- Test 3 ----
-Hello World!! What's happening?? #excited.
--- Expected Output --
-["Hello World!!", "What's happening??", "#excited."]
-
----- Test 4 ----
-This is a weird case.. It happens sometimes..
--- Expected Output --
-["This is a weird case.", "It happens sometimes."]
-
----- Test 5 ----
-I went to the store, bought milk. Then, went to the park, enjoyed the day.
--- Expected Output --
-["I went to the store, bought milk.", "Then, went to the park, enjoyed the day."]
-
----- Test 6 ----
-
--- Expected Output --
-[]
-
----- Test 7 ----
-###!!!
--- Expected Output --
-[]
-
----- Test 8 ----
- This is a test.
-
-....?????##
-$$%^#$@
-!@#$!@%%
-@@@
-!!
-...
-....////\\][[]]
-
-It should return two sentences.
--- Expected Output --
-["This is a test.", "It should return two sentences."]
-
----- Test 9 ----
-Although I went to the store,
-I forgot to buy milk.
-Next time, I’ll make a list.
--- Expected Output --
-["Although I went to the store, I forgot to buy milk.", "Next time, I’ll make a list."]
-
----- Test 10 ----
-Is this real?? Or #fantasy... Caught in a landslide, no escape...
--- Expected Output --
-["Is this real??", "Or #fantasy.", "Caught in a landslide, no escape..."]
\ No newline at end of file
+These are the 5 BEST open source text to speech softwares that I've come across over the past year.
+This here is just a quick sample of my voice with a british accent, and this is how I actually sound.
\ No newline at end of file
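The deleted test file exercised the sentence splitting that used to live in tortoise_api.py (filter_paragraph, removed below): tokenize each line with nltk and keep only chunks containing at least one letter. A simplified sketch of that behavior, omitting the bracket stripping and abbreviation merging the full function performed; the sample input and the expected list come from the old Test 3:

```
# Simplified sketch of the removed filter_paragraph behavior (see the deletion below).
import nltk
nltk.download('punkt', quiet=True)

def split_keep_alpha(paragraph):
    kept = []
    for line in paragraph.split("\n"):
        for sentence in nltk.sent_tokenize(line.strip()):
            # Keep only chunks that contain at least one alphabetic character.
            if any(c.isalpha() for c in sentence):
                kept.append(sentence)
    return kept

print(split_keep_alpha("Hello World!! What's happening?? #excited."))
# Old Test 3 expected: ["Hello World!!", "What's happening??", "#excited."]
```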
diff --git a/tortoise_api.py b/tortoise_api.py
deleted file mode 100644
index 98aff78..0000000
--- a/tortoise_api.py
+++ /dev/null
@@ -1,208 +0,0 @@
-import requests
-import concurrent.futures
-from queue import Queue
-import threading
-import os
-import sounddevice as sd
-import soundfile as sf
-import yaml
-import re
-
-class Tortoise_API:
-    '''
-    API calls to the tortoise GUI using requests. Must have an open instance of
-    tortoise TTS GUI running or else nothing will happen. For most cases, to use this
-    you need to use filter_paragraph() to splice text into a list of sentences, then
-    feed that list 1-by-1 into call_api. The idea is to speed up the process so that you can
-    generate audio while audio is being spoken
-    '''
-    def __init__(self):
-        # Actually only necessary if you're using run(), could clean up code later
-        self.audio_queue = Queue()
-        self.free_slots = Queue()
-        self.semaphore = threading.Semaphore(1)
-
-    def call_api(self, sentence, is_queue=False):
-        '''
-        Makes a request to the Tortoise TTS GUI. Relies on tort.yaml, so make sure it's set-up
-
-        Args:
-            sentence (str) : Text to be converted to speech
-            is_queue (bool) : Only set to True if using as standalone script. Uses built in queue
-                              system to queue up 6 samples of audio to be read out loud.
-
-        Returns:
-            audio_path (str) : Path of the audio to be played
-        '''
-        tort_conf = load_config()
-        max_retries = 5
-
-        for attempt in range(max_retries):
-            for port in range(7860, 7866):
-                try:
-                    url = f"http://127.0.0.1:{port}/run/generate"
-                    print(f"Calling API with sentence: <{sentence}>")
-                    response = requests.post(url, json={
-                        "data": [
-                            f"{sentence}", #prompt
-                            tort_conf['delimiter'], #delimter
-                            tort_conf['emotion'], #emotion
-                            tort_conf['custom_emotion'], #custom emotion
-                            tort_conf['voice_name'], #voice name
-                            {"name": tort_conf['audio_file'],"data":"data:audio/wav;base64,UklGRiQAAABXQVZFZm10IBAAAAABAAEARKwAAIhYAQACABAAZGF0YQAAAAA="},
-                            tort_conf['voice_chunks'], #voice chunks
-                            tort_conf['candidates'], #candidates
-                            tort_conf['seed'], #seed
-                            tort_conf['samples'], #samples
-                            tort_conf['iterations'], #iterations
-                            tort_conf['temperature'], #temp
-                            tort_conf['diffusion_sampler'],
-                            tort_conf['pause_size'],
-                            tort_conf['cvvp_weight'],
-                            tort_conf['top_p'],
-                            tort_conf['diffusion_temp'],
-                            tort_conf['length_penalty'],
-                            tort_conf['repetition_penalty'],
-                            tort_conf['conditioning_free_k'],
-                            tort_conf['experimental_flags'],
-                            False,
-                            False,
-                        ]
-                    }).json()
-
-                    audio_path = response['data'][2]['choices'][0]
-                    print(f"API response received with audio path: {audio_path}")
-
-                    if is_queue:
-                        slot = self.free_slots.get()
-                        self.audio_queue.put((audio_path, slot))
-                    else:
-                        return audio_path
-
-                except requests.ConnectionError:
-                    print(f"Failed to connect to port {port}, trying next port")
-                except requests.Timeout:
-                    print(f"Request timed out on port {port}, trying next port")
-                except requests.RequestException as e: # Catch any other requests exceptions
-                    print(f"An error occurred on port {port}: {e}")
-                except Exception as e: # Catch non-requests exceptions
-                    print(f"An unexpected error occurred: {e}")
-
-            print(f"Attempt {attempt + 1} failed, retrying...") # Log the retry attempt
-            import time
-            # time.sleep(1) # Optional: add a delay between retries
-
-        print(f"Failed to connect after {max_retries} attempts")
-        return None
-
-
-
-    def play_audio_from_queue(self):
-        while True:
-            audio_file, slot = self.audio_queue.get()
-            if audio_file == "stop":
-                self.audio_queue.task_done()
-                break
-            data, sample_rate = sf.read(audio_file)
-            sd.play(data, sample_rate)
-            sd.wait()
-            os.remove(audio_file)
-            self.audio_queue.task_done()
-            self.free_slots.put(slot)
-
-    # Usually only ran if using this as a standalone script, most likely you won't be
-    def run(self, sentences):
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            for i in range(1, 6):
-                self.free_slots.put(i)
-
-            audio_thread = threading.Thread(target=self.play_audio_from_queue)
-            audio_thread.start()
-
-            # Wait for each API call to complete before starting the next one
-            for sentence in sentences:
-                future = executor.submit(self.call_api, sentence)
-                concurrent.futures.wait([future])
-
-            self.audio_queue.join()
-            self.audio_queue.put(("stop", None))
-
-def load_config():
-    current_dir = os.path.dirname(os.path.abspath(__file__))
-    yaml_file = os.path.join(current_dir, "tort.yaml")
-
-    with open(yaml_file, "r") as file:
-        tort_conf = yaml.safe_load(file)
-
-    return tort_conf
-
-import re
-
-def filter_paragraph(paragraph):
-
-    import nltk
-    if not os.path.exists('./assets'):
-        os.makedirs('./assets')
-    nltk.download('punkt', download_dir='./assets')
-    nltk.data.path.append('./assets')
-
-    # Split the paragraph into lines and process each line separately
-    lines = paragraph.split("\n")
-
-    filtered_list = []
-    for line in lines:
-        # Tokenize sentences in the current line using nltk
-        sentences = nltk.sent_tokenize(line.strip())
-
-        # Helper function to check if a sentence ends with abbreviation followed by lowercase word
-        def ends_with_abbreviation(sentence):
-            return re.search(r'\b[A-Z](?:\.[A-Z])+[\.]?$', sentence)
-
-        i = 0
-        while i < len(sentences):
-            # Remove square brackets and strip the sentence
-            line_content = re.sub(r'\[|\]', '', sentences[i]).strip()
-
-            # Check for abbreviation and merge with the next sentence if required
-            if i < len(sentences) - 1 and ends_with_abbreviation(line_content) and sentences[i+1][0].islower():
-                line_content += " " + sentences[i+1]
-                i += 1 # Skip next sentence
-
-            # Only append lines that contain at least one alphabetic character
-            if line_content and any(c.isalpha() for c in line_content):
-                filtered_list.append(line_content)
-
-            i += 1
-
-    return filtered_list
-
-
-
-def load_sentences(file_path) -> list:
-    '''
-    Utility function for toroise to load sentences from a text file path
-
-    Args:
-        file_path(str) : path to some text file
-
-    '''
-    with open(file_path, 'r', encoding='utf-8') as file:
-        content = file.read()
-    paragraphs = content.split('\n\n') # Split content into paragraphs
-    filtered_sentences = []
-    for paragraph in paragraphs:
-        filtered_list = filter_paragraph(paragraph)
-        filtered_sentences.extend(filtered_list)
-    return filtered_sentences
-
-def read_paragraph_from_file(file_path):
-    with open(file_path, 'r') as file:
-        paragraph = file.read()
-    return paragraph
-
-if __name__ == "__main__":
-    file_path = "story.txt"
-    paragraph = read_paragraph_from_file(file_path)
-    filtered_paragraph = filter_paragraph(paragraph)
-    player = Tortoise_API()
-    player.run(filtered_paragraph)
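For reference, the removed call_api pulled every generation setting from tort.yaml; the keys it read are exactly those indexed from tort_conf in the request body above. A sketch of a matching settings dict, with placeholder values for illustration only (not recommended defaults):

```
# Keys the removed call_api read from tort.yaml; the values below are placeholders only.
tort_conf = {
    "delimiter": "",
    "emotion": "None",
    "custom_emotion": "",
    "voice_name": "random",
    "audio_file": "reference.wav",
    "voice_chunks": 0,
    "candidates": 1,
    "seed": 0,
    "samples": 16,
    "iterations": 30,
    "temperature": 0.8,
    "diffusion_sampler": "DDIM",
    "pause_size": 8,
    "cvvp_weight": 0.0,
    "top_p": 0.8,
    "diffusion_temp": 1.0,
    "length_penalty": 1.0,
    "repetition_penalty": 2.0,
    "conditioning_free_k": 2.0,
    "experimental_flags": [],
}
```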