diff --git a/Pipes.py b/Pipes.py
index 860b4a1..93e1dd1 100644
--- a/Pipes.py
+++ b/Pipes.py
@@ -5,12 +5,20 @@
 from ezlocalai.STT import STT
 from ezlocalai.CTTS import CTTS
 from ezlocalai.Embedding import Embedding
+from ezlocalai.Helpers import chunk_content_by_tokens
+from pydub import AudioSegment
+from datetime import datetime
+from Globals import getenv
 from pyngrok import ngrok
 import requests
 import base64
 import pdfplumber
-import torch
-from Globals import getenv
+import zipfile
+import docx2txt
+import pandas as pd
+import random
+import json
+import io
 
 try:
     from ezlocalai.IMG import IMG
@@ -22,6 +30,62 @@
 from ezlocalai.VLM import VLM
 
 
+async def file_to_text(file_path: str = ""):
+    """
+    Extract the text content from a file.
+
+    Args:
+        file_path (str, optional): Path to the file. Defaults to "".
+
+    Returns:
+        str: Text content extracted from the file.
+    """
+    file_content = ""
+    file_name = os.path.basename(file_path)
+    logging.info(f"File path: {file_path}")
+    file_type = file_name.split(".")[-1]
+    if file_type == "pdf":
+        with pdfplumber.open(file_path) as pdf:
+            content = "\n".join([page.extract_text() or "" for page in pdf.pages])
+        file_content += content
+    elif file_path.endswith(".zip"):
+        extracted_zip_folder_name = f"extracted_{file_name.replace('.zip', '_zip')}"
+        new_folder = os.path.join(os.path.dirname(file_path), extracted_zip_folder_name)
+        file_content += f"Content from the zip file uploaded named `{file_name}`:\n"
+        with zipfile.ZipFile(file_path, "r") as zipObj:
+            zipObj.extractall(path=new_folder)
+        # Iterate over every file that was extracted, including subdirectories
+        for root, dirs, files in os.walk(new_folder):
+            for name in files:
+                file_content += f"Content from file uploaded named `{name}`:\n"
+                file_content += await file_to_text(file_path=os.path.join(root, name))
+        return file_content
+    elif file_path.endswith(".doc") or file_path.endswith(".docx"):
+        file_content = docx2txt.process(file_path)
+    elif file_type == "csv":
+        with open(file_path, "r") as f:
+            file_content = f.read()
+    elif file_type == "xlsx" or file_type == "xls":
+        xl = pd.ExcelFile(file_path)
+        if len(xl.sheet_names) > 1:
+            for i, sheet_name in enumerate(xl.sheet_names, 1):
+                df = xl.parse(sheet_name)
+                csv_file_path = file_path.replace(f".{file_type}", f"_{i}.csv")
+                df.to_csv(csv_file_path, index=False)
+                with open(csv_file_path, "r") as f:
+                    file_content += f.read() + "\n"
+        else:
+            df = pd.read_excel(file_path)
+            csv_file_path = file_path.replace(f".{file_type}", ".csv")
+            df.to_csv(csv_file_path, index=False)
+            with open(csv_file_path, "r") as f:
+                file_content = f.read()
+    else:
+        with open(file_path, "r") as f:
+            file_content = f.read()
+    return file_content
+
+
 class Pipes:
     def __init__(self):
         load_dotenv()
@@ -305,3 +369,187 @@ async def get_response(self, data, completion_type="chat"):
         else:
             response["choices"][0]["message"]["content"] += f"\n\n{generated_image}"
         return response, audio_response
+
+    async def create_audiobook(
+        self,
+        content,
+        voice,
+        language="en",
+    ):
+        string_timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
+        output_file_name = f"audiobook_{string_timestamp}"
+        # Step 1: Chunk the book content into paragraphs
+        paragraphs = chunk_content_by_tokens(content)
+
+        # Step 2: Extract characters, their lines, genders, and maintain order
+        characters = {}
+        ordered_content = []
+
+        def find_similar_character(name):
+            # Check for exact match first
+            if name in characters:
+                return name
+            # Check for case-insensitive match
+            lower_name = name.lower()
+            for char in characters:
+                if char.lower() == lower_name:
+                    return char
+            # Check for partial matches (e.g., "Mr. Smith" vs "Smith")
+            for char in characters:
+                if name in char or char in name:
+                    return char
+            return None
+
+        for paragraph in paragraphs:
+            # Inject a list of characters we know so far.
+            prompt = f"""## Characters we know so far:
+{json.dumps(characters, indent=4)}
+
+## Paragraph
+{paragraph}
+
+## System
+Analyze the text in the paragraph and extract:
+1. All character names and their genders (male, female, or unknown). Use best judgement based on historical usage of a name to determine gender. Attempt to normalize character names to match existing characters if possible.
+2. Lines spoken by each character
+3. Narrator lines (not spoken by any character)
+
+Provide the result in JSON format:
+{{
+    "characters": [
+        {{"name": "character1", "gender": "male/female/unknown"}},
+        {{"name": "character2", "gender": "male/female/unknown"}},
+        ...
+    ],
+    "content": [
+        {{"type": "narrator", "text": "narrator line"}},
+        {{"type": "character", "name": "character1", "text": "character1 line"}},
+        {{"type": "narrator", "text": "narrator line"}},
+        {{"type": "character", "name": "character2", "text": "character2 line"}},
+        ...
+    ]
+}}
+Ensure the content array preserves the original order of narration and dialogue."""
+
+            response = await self.llm.completion(prompt=prompt)
+            result_text = response["choices"][0]["text"]
+
+            # Strip out code block markers if present
+            if "```json" in result_text:
+                result_text = result_text.split("```json")[1].split("```")[0]
+            elif "```" in result_text:
+                result_text = result_text.split("```")[1].split("```")[0]
+
+            try:
+                result = json.loads(result_text)
+                for char in result.get("characters", []):
+                    similar_char = find_similar_character(char["name"])
+                    if similar_char:
+                        # Use the existing character name
+                        char["name"] = similar_char
+                    else:
+                        # Add new character
+                        characters[char["name"]] = char["gender"]
+
+                # Update content with potentially merged character names
+                for item in result.get("content", []):
+                    if item["type"] == "character":
+                        similar_char = find_similar_character(item["name"])
+                        if similar_char:
+                            item["name"] = similar_char
+                    ordered_content.append(item)
+
+            except json.JSONDecodeError:
+                logging.error(f"Failed to parse JSON from LLM response: {result_text}")
+                continue
+
+        # Step 3: Translate the content if necessary
+        if language != "en":
+            translated_content = []
+            for item in ordered_content:
+                translation_prompt = f"""## Original text:\n{item['text']}\n\n## System\nTranslate the original text to {language}.\nReturn only the translated text without any additional commentary."""
+                translation_response = await self.llm.completion(
+                    prompt=translation_prompt
+                )
+                translated_text = translation_response["choices"][0]["text"].strip()
+                translated_item = item.copy()
+                translated_item["text"] = translated_text
+                translated_content.append(translated_item)
+            ordered_content = translated_content
+
+        # Step 4: Assign voices to characters based on gender
+        character_voices = {}
+        male_voices = [f"male-{i}" for i in range(1, 101)]
+        female_voices = [f"female-{i}" for i in range(1, 101)]
+        unknown_voices = male_voices + female_voices
+        random.shuffle(male_voices)
+        random.shuffle(female_voices)
+        random.shuffle(unknown_voices)
+
+        for character, gender in characters.items():
+            if gender == "male" and male_voices:
+                character_voices[character] = male_voices.pop()
+            elif gender == "female" and female_voices:
+                character_voices[character] = female_voices.pop()
+            elif unknown_voices:
+                character_voices[character] = unknown_voices.pop()
+            else:
+                logging.warning(
+                    f"Ran out of voices. Reusing voices for character: {character}"
+                )
+                character_voices[character] = random.choice(male_voices + female_voices)
+
+        # Step 5: Generate audio for each item in ordered_content
+        audio_segments = []
+        text_output = []
+
+        for item in ordered_content:
+            if item["type"] == "narrator":
+                try:
+                    audio = await self.ctts.generate(
+                        text=item["text"], voice=voice, language=language
+                    )
+                    audio_segments.append(base64.b64decode(audio))
+                    text_output.append(f"Narrator: {item['text']}")
+                except Exception as e:
+                    logging.error(
+                        f"Failed to generate audio for narrator text: {item['text'][:50]}... Error: {str(e)}"
+                    )
+            elif item["type"] == "character":
+                character_voice = character_voices.get(item["name"], voice)
+                try:
+                    audio = await self.ctts.generate(
+                        text=item["text"], voice=character_voice, language=language
+                    )
+                    audio_segments.append(base64.b64decode(audio))
+                    text_output.append(f"{item['name']}: {item['text']}")
+                except Exception as e:
+                    logging.error(
+                        f"Failed to generate audio for character {item['name']}: {item['text'][:50]}... Error: {str(e)}"
+                    )
+
+        # Step 6: Combine all audio segments
+        combined_audio = AudioSegment.empty()
+        for audio_data in audio_segments:
+            try:
+                audio = AudioSegment.from_wav(io.BytesIO(audio_data))
+                combined_audio += audio
+                combined_audio += AudioSegment.silent(
+                    duration=500
+                )  # 0.5 second pause between segments
+            except Exception as e:
+                logging.error(f"Failed to process audio segment. Error: {str(e)}")
+        # Step 7: Export the final audiobook
+        outputs = os.path.join(os.getcwd(), "outputs")
+        audio_output_path = os.path.join(outputs, f"{output_file_name}.mp3")
+        combined_audio.export(audio_output_path, format="mp3")
+
+        # Step 8: Save the text output
+        text_output_path = os.path.join(outputs, f"{output_file_name}.txt")
+        with open(text_output_path, "w", encoding="utf-8") as f:
+            f.write("\n\n".join(text_output))
+        return {
+            "audio_file": f"{self.local_uri}/outputs/{output_file_name}.mp3",
+            "text_file": f"{self.local_uri}/outputs/{output_file_name}.txt",
+            "character_voices": character_voices,
+        }
diff --git a/app.py b/app.py
index 487ef3f..1517fd2 100644
--- a/app.py
+++ b/app.py
@@ -13,7 +13,7 @@
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
 from typing import List, Dict, Union, Optional
-from Pipes import Pipes
+from Pipes import Pipes, file_to_text
 import base64
 import os
 import logging
@@ -376,6 +376,41 @@ async def upload_voice(
     return {"detail": f"Voice {voice_name} has been uploaded."}
 
 
+class BookToSpeech(BaseModel):
+    voice: Optional[str] = "default"
+    language: Optional[str] = "en"
+
+
+class BookToSpeechResponse(BaseModel):
+    audio_file: str
+    text_file: str
+
+
+@app.post(
+    "/v1/audio/book",
+    tags=["Audio"],
+    dependencies=[Depends(verify_api_key)],
+)
+async def book_to_speech(
+    book: BookToSpeech,
+    file: UploadFile = File(...),
+    user=Depends(verify_api_key),
+):
+    if getenv("TTS_ENABLED").lower() == "false":
+        raise HTTPException(status_code=404, detail="Text to speech is disabled.")
+    file_type = file.filename.split(".")[-1]
+    file_path = os.path.join(os.getcwd(), "outputs", f"{uuid.uuid4().hex}.{file_type}")
+    with open(file_path, "wb") as uploaded_file:
+        uploaded_file.write(await file.read())
+    file_content = await file_to_text(file_path=file_path)
+    audiobook = await pipe.create_audiobook(
+        voice=book.voice, language=book.language, content=file_content
+    )
+    return BookToSpeechResponse(
+        audio_file=audiobook["audio_file"], text_file=audiobook["text_file"]
+    )
+
+
 
 # Image Generation endpoint
 # https://platform.openai.com/docs/api-reference/images
diff --git a/cuda-requirements.txt b/cuda-requirements.txt
index d7aae9b..62769e2 100644
--- a/cuda-requirements.txt
+++ b/cuda-requirements.txt
@@ -28,4 +28,6 @@ onnx
 timm>=0.9.16
 sentencepiece
 attrdict
-einops
\ No newline at end of file
+einops
+docx2txt
+pandas
\ No newline at end of file
diff --git a/ezlocalai/Helpers.py b/ezlocalai/Helpers.py
index 9527ab0..8fb6783 100644
--- a/ezlocalai/Helpers.py
+++ b/ezlocalai/Helpers.py
@@ -27,3 +27,62 @@ def chunk_content(text: str) -> List[str]:
     sentences = list(doc.sents)
     content_chunks = [str(sentence).strip() for sentence in sentences]
     return content_chunks
+
+
+# Split text into chunks of paragraphs up to 2000 tokens
+def chunk_content_by_tokens(text: str, max_tokens: int = 2000) -> List[str]:
+    # Load spaCy model
+    try:
+        nlp = spacy.load("en_core_web_sm")
+    except OSError:
+        spacy.cli.download("en_core_web_sm")
+        nlp = spacy.load("en_core_web_sm")
+
+    encoding = tiktoken.get_encoding("cl100k_base")
+    paragraphs = text.split("\n\n")
+    chunks = []
+    current_chunk = []
+    current_chunk_tokens = 0
+
+    def add_to_chunk(content: str):
+        nonlocal current_chunk, current_chunk_tokens, chunks
+        content_tokens = encoding.encode(content)
+        if current_chunk_tokens + len(content_tokens) > max_tokens:
+            chunks.append("\n\n".join(current_chunk))
+            current_chunk = []
+            current_chunk_tokens = 0
+        current_chunk.append(content)
+        current_chunk_tokens += len(content_tokens)
+
+    for paragraph in paragraphs:
+        paragraph_tokens = encoding.encode(paragraph)
+        if len(paragraph_tokens) <= max_tokens:
+            add_to_chunk(paragraph)
+        else:
+            # Split long paragraph into sentences using spaCy
+            doc = nlp(paragraph)
+            sentences = [sent.text for sent in doc.sents]
+            current_sentence_group = []
+            current_group_tokens = 0
+
+            for sentence in sentences:
+                sentence_tokens = encoding.encode(sentence)
+                if current_group_tokens + len(sentence_tokens) <= max_tokens:
+                    current_sentence_group.append(sentence)
+                    current_group_tokens += len(sentence_tokens)
+                else:
+                    # Add the current group of sentences as a chunk
+                    if current_sentence_group:
+                        add_to_chunk(" ".join(current_sentence_group))
+                    current_sentence_group = [sentence]
+                    current_group_tokens = len(sentence_tokens)
+
+            # Add any remaining sentences
+            if current_sentence_group:
+                add_to_chunk(" ".join(current_sentence_group))
+
+    # Add the last chunk if it's not empty
+    if current_chunk:
+        chunks.append("\n\n".join(current_chunk))
+
+    return chunks
diff --git a/requirements.txt b/requirements.txt
index d14a697..ac14aeb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -25,4 +25,6 @@ optimum
 onnx
 diffusers[torch]
 torchaudio==2.3.1
+docx2txt
+pandas
 llama-cpp-python==0.3.7
\ No newline at end of file
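
Reviewer note: a minimal usage sketch of the new file_to_text and chunk_content_by_tokens helpers outside the HTTP endpoint. It assumes the snippet runs from the repository root with the project dependencies installed, and sample.pdf is a hypothetical stand-in for any supported document type:

    import asyncio

    from ezlocalai.Helpers import chunk_content_by_tokens
    from Pipes import file_to_text


    async def main():
        # Extract plain text from a supported document (pdf, doc/docx, csv, xlsx/xls, zip, or plain text).
        text = await file_to_text(file_path="sample.pdf")
        # Split the text into paragraph-aligned chunks of at most 2000 tokens,
        # the same chunking create_audiobook applies before prompting the LLM
        # for character and narration extraction.
        chunks = chunk_content_by_tokens(text, max_tokens=2000)
        print(f"Extracted {len(text)} characters into {len(chunks)} chunks")


    asyncio.run(main())

The 2000-token default mirrors the chunk size create_audiobook relies on, so the number of chunks printed here is also the number of extraction prompts the audiobook pipeline would send to the LLM for the same document.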